In [1]:
import pandas as pd
import transformers
import numpy as np
import os
import json
import ast
from samutil import SpacyTransformer
import spacy
model = spacy.load('en_core_web_sm')
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
df_main = pd.read_json('./data/main/tmdb_data_main.json')
df_main = df_main[['tmdb_id','movie','overview','genres','genres_new','popularity','release_year','vote_average','budget','revenue','poster_path']]

# Show Some Film Descriptions
---

In [92]:
df_main

Unnamed: 0,tmdb_id,movie,overview,genres,genres_new,popularity,release_year,vote_average,budget,revenue,poster_path
0,694,The Shining,Jack Torrance accepts a caretaker job at the O...,"[Horror, Thriller]",horror,50.841,1980,8.2,19000000.0,44781695.0,https://image.tmdb.org/t/p/w500/nRj5511mZdTl4s...
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]",crime,43.342,1972,8.7,6000000.0,245066411.0,https://image.tmdb.org/t/p/w500/3bhkrj58Vtu7en...
4,185,A Clockwork Orange,"In a near-future Britain, young Alexander DeLa...","[Science Fiction, Drama]",scifi,28.708,1971,8.2,2200000.0,26589000.0,https://image.tmdb.org/t/p/w500/4sHeTAp65WrSSu...
5,4488,Friday the 13th,Camp counselors are stalked and murdered by an...,[Horror],horror,24.947,1980,6.4,550000.0,59754601.0,https://image.tmdb.org/t/p/w500/4nbUlVEg31I8lc...
6,8393,The Gods Must Be Crazy,A Coca-Cola bottle dropped from an airplane ra...,"[Action, Comedy]",action,20.110,1980,7.3,5000000.0,60000000.0,https://image.tmdb.org/t/p/w500/IgBfj5LfT7nwpo...
...,...,...,...,...,...,...,...,...,...,...,...
19640,833391,Together Forever Tea,,"[Romance, TV Movie]",,5.793,2021,0.0,0.0,0.0,https://image.tmdb.org/t/p/w500/AwavquzgUhA1ho...
19641,834509,Hide or Seek,Andrew introduces his boyfriend to his childho...,,,5.862,2021,0.0,0.0,0.0,https://image.tmdb.org/t/p/w500/bdkhNqmPC73XnV...
19644,672475,V2. Escape from Hell,"Mikhail Devyatayev, a captured Soviet pilot wh...","[War, Action, Drama]",,5.671,2021,1.0,0.0,0.0,https://image.tmdb.org/t/p/w500/3lCIHrL4fXHiIb...
19658,841858,The Hobbit: The Ultimate Journey,A trimmed edit of Peter Jackson's The Hobbit t...,"[Fantasy, Adventure, Action]",,5.379,2021,0.0,0.0,0.0,https://image.tmdb.org/t/p/w500/3mojSG585Ipi6O...


# Visualising Embedding Space
--

In [None]:
"""
Choose Bert or USE
2d vs 3d
Size parameter


"""

## Read in data

In [115]:
def get_embeddings(paths=[],drop_full_embeddings=True):
    
    dfs = [pd.read_csv(path,index_col=0) for path in paths]
    
    df = pd.concat(dfs)
    
    if drop_full_embeddings:
    
        return df[df.columns[:6]]
    else:
        return df
    

In [116]:
bert_paths = ['./data/overview_embeddings/overview_embeddings_bert6000.csv',
            './data/overview_embeddings/overview_embeddings_bert12126.csv']

use_paths = ['./data/overview_embeddings/overview_embeddings_6000.csv',
            './data/overview_embeddings/overview_embeddings_12126.csv']

In [117]:
df_embed = get_embeddings(use_paths
                         )

## Merge with master frame

In [128]:
df_semantic_plot = pd.merge(df_embed, df_main, how='left').dropna(subset=['genres_new'])
genres_choose = ['comedy', 'action', 'drama', 'scifi', 'horror', 'crime','action-scifi']
df_semantic_plot = df_semantic_plot[df_semantic_plot.genres_new.isin(genres_choose)]
df_semantic_plot = df_semantic_plot.dropna()

# Plotting

In [3]:
def plotly_streamlit_layout(fig, barmode=None, barnorm=None, height=None,width=None):
    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)',
                      barmode=barmode,
                      barnorm=barnorm,
                      height = height,
                      width = width)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')

    fig.update_layout(margin=dict(l=50, r=50, b=50, t=50, pad=2))
    fig.update_layout(bargap=0.03)

    return fig


def plotly_streamlit_texts(fig, x_title=None, y_title=None):
    fig.update_layout(yaxis=dict(title=y_title, titlefont_size=10, tickfont_size=10),
                      xaxis=dict(title=x_title, titlefont_size=10, tickfont_size=10))

    return fig

In [4]:
import plotly.express as px

In [131]:
df_semantic_plot.overview = df_semantic_plot.overview.str.wrap(30)
df_semantic_plot.overview =  df_semantic_plot.overview.apply(lambda x: x.replace('\n', '<br>'))
df_semantic_plot['plot_data'] = df_semantic_plot.movie + '<br><br>' +df_semantic_plot.overview


In [132]:
df_semantic_plot

Unnamed: 0,tmdb_id,ts_2_x,ts_2_y,ts_3_x,ts_3_y,ts_3_z,movie,overview,genres,genres_new,popularity,release_year,vote_average,budget,revenue,plot_data
1,6,-11.881898,22.651642,-1.560683,10.904191,-7.344006,Judgment Night,"While racing to a boxing<br>match, Frank, Mike...","[Action, Thriller, Crime]",crime,6.374,1993,6.4,21000000.0,12136938.0,Judgment Night<br><br>While racing to a boxing...
2,11,57.944330,48.917633,29.197300,29.898634,-20.231052,Star Wars,Princess Leia is captured and<br>held hostage ...,"[Adventure, Action, Science Fiction]",scifi,61.528,1977,8.2,11000000.0,775398007.0,Star Wars<br><br>Princess Leia is captured and...
5,14,-28.155580,-24.323355,-4.433146,-18.486017,8.558794,American Beauty,"Lester Burnham, a depressed<br>suburban father...",[Drama],drama,21.695,1999,8.0,15000000.0,356296601.0,"American Beauty<br><br>Lester Burnham, a depre..."
6,16,-26.408459,-21.063942,-14.838842,-28.599047,-12.513384,Dancer in the Dark,"Selma, a Czech immigrant on<br>the verge of bl...","[Drama, Crime]",crime,12.196,2000,7.9,12800000.0,40031879.0,"Dancer in the Dark<br><br>Selma, a Czech immig..."
7,17,2.502732,-21.892414,-18.509277,9.144068,12.442909,The Dark,Adèle and her daughter Sarah<br>are traveling ...,"[Horror, Thriller, Mystery]",horror,7.241,2005,5.9,0.0,0.0,The Dark<br><br>Adèle and her daughter Sarah<b...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12104,829477,34.997726,-27.186120,-8.084689,-4.516423,-25.054363,The Chicken Curry,"Excited about their grandson's<br>visit, an el...","[Comedy, Drama]",comedy,6.129,2021,0.0,3400000.0,0.0,The Chicken Curry<br><br>Excited about their g...
12108,833938,-28.883230,-15.594045,-1.229322,-12.919159,46.800087,The Tale of the Fatherless,A group of young people try to<br>navigate lif...,[Drama],drama,6.602,2021,0.0,0.0,0.0,The Tale of the Fatherless<br><br>A group of y...
12123,840543,63.373466,-23.417430,-14.675287,38.215620,9.170152,Amityville Vampire,A group of female campers at<br>Red Moon Lake ...,[Horror],horror,6.210,2021,0.0,0.0,0.0,Amityville Vampire<br><br>A group of female ca...
12124,841702,-46.805000,36.868220,26.801052,-15.997930,30.934656,The Universe Blinks,"A reflection on routine,<br>isolation, and bor...",[Drama],drama,5.683,2021,0.0,40.0,0.0,The Universe Blinks<br><br>A reflection on rou...


In [133]:
fig = px.scatter(df_semantic_plot[df_semantic_plot.budget >= df_semantic_plot.budget.mean()],x='ts_2_x',y='ts_2_y',
                                                       color='genres_new',opacity=0.8,hover_name='plot_data')
plotly_streamlit_layout(fig,height=900,width=900)

# Noun Chunk Analysis
---

In [145]:
spacy.__version__

'2.0.18'

In [26]:
df_copy = df_main.copy()
df_copy = df_copy.dropna()

In [27]:
action = df_copy[df_copy.genres_new == 'action']
comedy = df_copy[df_copy.genres_new == 'comedy']
scifi = df_copy[df_copy.genres_new == 'scifi']
horror = df_copy[df_copy.genres_new == 'horror']
drama = df_copy[df_copy.genres_new == 'drama']
crime = df_copy[df_copy.genres_new == 'crime']

In [28]:
action_trans = SpacyTransformer(action,'overview',model)
comedy_trans = SpacyTransformer(comedy,'overview',model)
scifi_trans = SpacyTransformer(scifi,'overview',model)
horror_trans = SpacyTransformer(horror,'overview',model)
drama_trans = SpacyTransformer(drama,'overview',model)
crime_trans = SpacyTransformer(crime,'overview',model)

In [29]:
%%time
action_trans.fit_transform()
comedy_trans.fit_transform()
scifi_trans.fit_transform()
horror_trans.fit_transform()
drama_trans.fit_transform()
crime_trans.fit_transform()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['spaCy_doc'] = [i for i in self.model.pipe(self.df[self.source_col], n_threads=30, batch_size=1000)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['spaCy_sentences'] = self.df['spaCy_doc'].apply(lambda x: [sent for sent in x.sents])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  la

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lambda x: [(token.root.text, token.text) for i in x for token in i.noun_chunks])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[i] = self.df[i].str.join(', ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['spaCy_doc'] = [i for i in self.model.pipe(self.df[self.source_col], n_thread

CPU times: user 44.7 s, sys: 9.93 s, total: 54.6 s
Wall time: 54.7 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['spaCy_doc'] = [i for i in self.model.pipe(self.df[self.source_col], n_threads=30, batch_size=1000)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['spaCy_sentences'] = self.df['spaCy_doc'].apply(lambda x: [sent for sent in x.sents])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  la

In [32]:
cols = ['tmdb_id','movie','overview','release_year','vote_average','budget','revenue','spaCy_nouns','spaCy_noun_chunk']

In [51]:
with open(os.getcwd()+'/data/nouns/horror.txt', 'w') as file:
     file.write(json.dumps(horror_trans.df[cols].to_dict())) 


In [67]:
path = os.getcwd()+'/data/nouns/action.txt'

with open(path) as f:
    x = json.loads(f.read())
    




In [68]:
x = pd.DataFrame(x)

In [56]:
def getNounchunks(df,noun_chunk_col,noun):
        
    """
    Args:
        DataFrame with spaCy features, the column containing spaCy noun chunks and the noun to analyse
    Returns:
        DataFrame containing the noun chunks for that noun
    """
    
    df_nc = [i for list_ in df[noun_chunk_col].to_list() for i in list_]
    df_nc = pd.DataFrame(df_nc)
    df_nc.columns = ['noun', 'noun_chunk']

    df_nc['chunk_length'] = df_nc['noun_chunk'].map(lambda x: len(x))
    df_nc = df_nc[df_nc['noun'] == noun]
    df_nc = df_nc.sort_values(by='chunk_length',ascending=False)
    df_nc = df_nc[['noun' ,'noun_chunk']]

    return df_nc 

In [77]:
def mostcommonTokens(data,additional_stopwords = [],token=2):
    
    """
    Args:
        Pandas Series, additional stopwords, tokens needed
    Returns
        DataFrame containing the counts of the most common tokens
    """
    
    #include any extra stop words
    add_stop_words = ENGLISH_STOP_WORDS.union(additional_stopwords)

    # instantiate vectorizing object
    vect = CountVectorizer(stop_words=add_stop_words, ngram_range= (token,token))

    # transform text using instance
    X = vect.fit_transform(data)

    # get the word counts
    word_counts = list(zip(vect.get_feature_names(),np.asarray(X.sum(axis=0)).ravel()))

    # create a formatted DataFrame with the counts and normalised counts
    columns = ['word', 'count']
    word_counts = pd.DataFrame(word_counts, columns=columns)
    word_counts = word_counts.sort_values(by='count', ascending=False)
    word_counts['count'] = word_counts['count'].map(lambda x: int(x))
    word_counts = word_counts[word_counts['count'] > 1]
    word_counts['count_norm'] = word_counts['count'].map(lambda x: x /word_counts['count'].sum())

    return word_counts

In [78]:
def add_noun(df,noun):

    word_l = list(df.word)

    for i in word_l:

        if i.split()[-1] != noun:

            word_l[word_l.index(i)] = word_l[word_l.index(i)]+' '+noun

    df.word = word_l

    return df

In [79]:
from tqdm.notebook import tqdm

In [80]:
def getAllNounChunks(df,noun_col,noun_chunk_col,chunk_token,stop_nouns=[],top_nouns=20):
    
    """
    Args:
        DataFrame with spaCy features, the noun column, the noun chunk column, the number of grams needed and 
        any nouns we want to exclude
    Returns
        DataFrame containing the counts of the most common tokens
    """

    mcn = list(mostcommonTokens(df[noun_col],token=1).head(top_nouns)['word'])

    mcn = [i for i in mcn if i not in stop_nouns]#exclude any stop nouns

    master_noun_chunks = [] #empty list for the noun chunks to be appended to

    #loop through each noun
    for noun in tqdm(mcn):

        noun_chunks = getNounchunks(df,noun_chunk_col,noun)['noun_chunk'] #get all noun chunks for noun

        try:
            #get most common noun chunks from all noun chunks
            mc_noun_chunks = mostcommonTokens(noun_chunks,token=chunk_token)

            #adds the noun to the end of a noun chunk if its missing
            mc_noun_chunks = add_noun(mc_noun_chunks,noun)

            #append top n most common noun chunks to list
            master_noun_chunks.append(mc_noun_chunks)
        except:
            print(noun)

            pass
        
    return pd.concat([master_noun_chunks[i] for i in range(len(master_noun_chunks))])

In [81]:
mostcommonTokens(x.spaCy_nouns.values)

Unnamed: 0,word,count,count_norm
2241,drug dealer,9,0.011111
5772,police officer,9,0.011111
1107,cat mouse,9,0.011111
688,biker gang,6,0.007407
8308,wife daughter,6,0.007407
...,...,...,...
6435,run employers,2,0.002469
5466,partner friend,2,0.002469
2759,field agent,2,0.002469
5502,passengers board,2,0.002469


In [83]:
c = getAllNounChunks(x,'spaCy_nouns','spaCy_noun_chunk',chunk_token=2,top_nouns=60)

  0%|          | 0/60 [00:00<?, ?it/s]

In [84]:
def visualiseNounChunks(noun_chunk_df):

    noun_chunk_df["all_nouns"] = "" # empty string in order to have a single root node

    #add column containing root noun for each noun chunk
    noun_chunk_df['noun'] = noun_chunk_df['word'].apply(lambda x: x.split(" ")[-1])

    fig = px.treemap(noun_chunk_df, path=['all_nouns','noun', 'word'], 
         values='count',hover_data=['count'],color='count',color_continuous_scale='Purples')

    fig.update_layout(
            autosize=False,
            width=800,
            height=800)
    

#     fig.update_traces(marker_colorscale = 'Blues')
    fig.update_traces(hovertemplate=None)
    fig.show()
    
    return fig

In [86]:
import plotly.express as px

In [87]:
fig = visualiseNounChunks(c)


# Find a Similar Overview
--

In [3]:
from samutil import UniversalSentenceEncoder

In [4]:
use = UniversalSentenceEncoder()

INFO:absl:Using /var/folders/f8/sngrg7w50kd7fsh_hp055hzm0000gr/T/tfhub_modules to cache modules.


In [5]:
user_text = "The Daywalker known as 'Blade' - a half-vampire, half-mortal man - becomes the protector of humanity against an underground army of vampires"

In [6]:
def get_vector_from_text(user_text,model_instance):
    vectorised_text = model_instance.fit_transform([user_text],reduce=False)
    vectorised_text_array = np.array(vectorised_text.T.iloc[1:][0])
    return vectorised_text_array

In [7]:
text_vec = get_vector_from_text(user_text,use)

  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
def get_full_embeddings():
    dfs = []
    parts = ['3000','6000','9000','12000']
    for i in parts:
        df = pd.read_csv(f'./data/overview_embeddings/embed_full_{i}.csv')
        dfs.append(df)
    return pd.concat(dfs).set_index('tmdb_id').drop_duplicates()

In [9]:
embed_raw = get_full_embeddings()

In [10]:
def mask_main_frame(df_main,threshold=10):
    return df_main[df_main.popularity >=threshold].set_index('tmdb_id')
    

In [11]:
def get_masked_embeddings(df_main,embed_raw,threshold=10):

    df_use = mask_main_frame(df_main,threshold=threshold)
    embed_raw_mask = embed_raw.reindex(index=df_use.index).dropna()
    return embed_raw_mask

In [12]:
embed_raw_mask = get_masked_embeddings(df_main,embed_raw)

In [13]:
def matrix_operation(embed_raw,text_vec):
    results = [(embed_raw.index[i],np.dot(text_vec,embed_raw.iloc[i])) for i in range(embed_raw.shape[0])]
    return pd.DataFrame(results)

In [14]:
df_matrix = matrix_operation(embed_raw_mask,text_vec)

In [15]:
def get_meta_data(df_main,df_matrix):
    df_use = df_main[['tmdb_id','movie','budget','poster_path','overview']]
    # df_use = df_use[df_use.budget >= df_use.budget.median()]
    df_final = pd.merge(df_use,df_matrix,left_on='tmdb_id',right_on=0,how='left').dropna().sort_values(by=1,ascending=False)
    return df_final

In [16]:
get_meta_data(df_main,df_matrix).head(10)

Unnamed: 0,tmdb_id,movie,budget,poster_path,overview,0,1
13867,36647,Blade,45000000.0,https://image.tmdb.org/t/p/w500/e6ErRnIgKmoBtc...,"The Daywalker known as ""Blade"" - a half-vampir...",36647.0,1.0
17036,36648,Blade: Trinity,65000000.0,https://image.tmdb.org/t/p/w500/iQ2fyewqYDNmcU...,"For years, Blade has fought against the vampir...",36648.0,0.615386
12806,19901,Daybreakers,20000000.0,https://image.tmdb.org/t/p/w500/ebnmZ3v7IErHpE...,"In the year 2019, a plague has transformed alm...",19901.0,0.595435
7046,38321,Priest,60000000.0,https://image.tmdb.org/t/p/w500/stFUfo2kJXepLT...,"In an alternate world, humanity and vampires h...",38321.0,0.586693
14090,36586,Blade II,54000000.0,https://image.tmdb.org/t/p/w500/vulF0iqX5z7AW1...,A rare mutation has occurred within the vampir...,36586.0,0.582958
14286,277,Underworld,22000000.0,https://image.tmdb.org/t/p/w500/zsnQ41UZ3jo1wE...,Vampires and werewolves have waged a nocturnal...,277.0,0.578791
14220,834,Underworld: Evolution,50000000.0,https://image.tmdb.org/t/p/w500/yT1EDKhCzAumcZ...,As the war between the vampires and the Lycans...,834.0,0.552248
19182,12437,Underworld: Rise of the Lycans,35000000.0,https://image.tmdb.org/t/p/w500/6pLPWF7AXhljLJ...,"A prequel to the first two Underworld films, t...",12437.0,0.546548
14098,9945,Vampires,20000000.0,https://image.tmdb.org/t/p/w500/iBkoNEyjXNlruG...,The church enlists a team of vampire-hunters t...,9945.0,0.546251
14576,49017,Dracula Untold,70000000.0,https://image.tmdb.org/t/p/w500/dN3D3AvOsZ60jB...,"Vlad Tepes is a great hero, but when he learns...",49017.0,0.53772


In [142]:
np.sum(x*np.array(embed.loc[0]))

0.060995594154718516

In [139]:
np.sum(x*np.array(embed.loc[0]), axis=1)

AxisError: axis 1 is out of bounds for array of dimension 1