In [1]:
# import standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
movie=pd.read_csv('rotten_tomatoes_movies.csv')

In [3]:
movie.isna().sum()

rotten_tomatoes_link                   0
movie_title                            0
movie_info                           321
critics_consensus                   8578
content_rating                         0
genres                                19
directors                            194
authors                             1542
actors                               352
original_release_date               1166
streaming_release_date               384
runtime                              314
production_company                   499
tomatometer_status                    44
tomatometer_rating                    44
tomatometer_count                     44
audience_status                      448
audience_rating                      296
audience_count                       297
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

In [6]:
# read in data (the pre-processed version)
critic_reviews_processed = pd.DataFrame()
for i in range(5):
  # when saving as csv, the lists get turned into strings so we convert back
  temp =  pd.read_csv("critic_reviews_processed_{}.csv".format(i), converters={'tagged': eval,'lemmatized': eval})
  critic_reviews_processed = pd.concat([critic_reviews_processed, temp])

# look at some data to verify the load was successful
critic_reviews_processed.head()

Unnamed: 0,index,rotten_tomatoes_link,review_type,tagged,lemmatized
0,0,m/0814255,Fresh,"[NOUN, NOUN, VERB, ADJ, NOUN, ADJ, ADJ, NOUN, ...","[fantasy, adventure, fuse, greek, mythology, c..."
1,1,m/0814255,Fresh,"[PROPN, NOUN, NOUN, NOUN, NOUN, NOUN, NOUN, NO...","[uma, thurman, medusa, gorgon, coiffure, writh..."
2,2,m/0814255,Fresh,"[ADJ, NOUN, NOUN, VERB, ADJ, NOUN, NOUN, NOUN,...","[top, notch, cast, dazzle, special, effect, ti..."
3,3,m/0814255,Fresh,"[SCONJ, NOUN, VERB, ADP, NOUN, NOUN, ADV, VERB...","[whether, audience, get, behind, lightning, th..."
4,4,m/0814255,Rotten,"[ADV, VERB, NOUN, NOUN, ADJ, NOUN, NOUN, NOUN,...","[really, lack, lightning, thief, genuine, sens..."


In [7]:
# count number of reviews
len(critic_reviews_processed)

949181

In [8]:
# count number of movies
critic_reviews_processed.rotten_tomatoes_link.nunique()

17695

In [9]:
# group by movie/review_type and concatenate each review
# LDA performs better on longer documents (need proof/reference of claim!)

# df = critic_reviews_processed.groupby(['rotten_tomatoes_link']).agg({'lemmatized': 'sum'}).reset_index()
# df.head()

df = critic_reviews_processed.groupby(['rotten_tomatoes_link','review_type']).agg({'lemmatized':'sum','tagged':'count'}).reset_index()
df['prop'] = df.groupby(['rotten_tomatoes_link'])['tagged'].apply(lambda x:100 * x / float(x.sum()))
df_fresh = df[df['review_type']=='Fresh'][['rotten_tomatoes_link','prop']]
df = df.groupby(['rotten_tomatoes_link']).agg({'lemmatized':'sum','tagged':'sum'}).reset_index()
df = df.merge(df_fresh, how='left',on='rotten_tomatoes_link')
df = df.rename(columns={'tagged':'review_count','prop':'freshness'})
df = df.fillna(value={'freshness':50})
df.head()

Unnamed: 0,rotten_tomatoes_link,lemmatized,review_count,freshness
0,m/+_one_2019,"[get, strength, fundamental, lead, crackle, ch...",63,88.888889
1,m/+h,"[ultimately, plush, exceed, limited, expectati...",6,33.333333
2,m/-_man,"[owen, wilson, look, reassure, depend, much, t...",11,63.636364
3,m/-cule_valley_of_the_lost_ants,"[dialogue, free, bug, saga, carry, along, bril...",10,90.0
4,m/0814255,"[fantasy, adventure, fuse, greek, mythology, c...",148,49.324324


In [10]:
# count number of rows
len(df)

17695

In [11]:
# count number of words per document
df['words'] = df['lemmatized'].apply(lambda x: len(x))
# distribution of length of documents
#fig = plt.figure(figsize=(12, 6), dpi=100)
#ax = fig.add_subplot(1,1,1)
#sns.histplot(data=df,x='words',ax=ax)
#ax.set_xlabel('Number of words per document')
#plt.tight_layout()
#plt.show()

In [12]:
# average number of words
df['words'].mean(), df['words'].median()

(656.0611472167279, 280.0)

In [13]:
# how many movies have less than 100 words?
# NEED TO JUSTIFY THIS NUMBER!
len(df[df['words']<100])

3975

In [14]:
# remove them from the training dataset
df = df[df['words']>=40]
df = df.reset_index(drop=True)
# average number of words now
df['words'].mean(), df['words'].median()

(686.8535356528955, 306.0)

In [15]:
df

Unnamed: 0,rotten_tomatoes_link,lemmatized,review_count,freshness,words
0,m/+_one_2019,"[get, strength, fundamental, lead, crackle, ch...",63,88.888889,852
1,m/+h,"[ultimately, plush, exceed, limited, expectati...",6,33.333333,71
2,m/-_man,"[owen, wilson, look, reassure, depend, much, t...",11,63.636364,111
3,m/-cule_valley_of_the_lost_ants,"[dialogue, free, bug, saga, carry, along, bril...",10,90.000000,120
4,m/0814255,"[fantasy, adventure, fuse, greek, mythology, c...",148,49.324324,1868
...,...,...,...,...,...
16866,m/zoot_suit,"[interesting, full, review, spanish, curious, ...",6,50.000000,65
16867,m/zootopia,"[variety, cute, occasionally, slightly, scary,...",291,97.594502,3816
16868,m/zorba_the_greek,"[zorba, greek, motion, picture, right, every, ...",7,71.428571,79
16869,m/zulu,"[amazing, film, devastatingly, accurate, depic...",17,94.117647,199


## Movie description and rating only RS

In [16]:
for column in movie.columns:
    if movie[column].dtype=='O':
        movie[column].fillna('',inplace=True)
    else:
        movie[column].fillna(0, inplace=True)
#Only kept top 5 important actors
movie['actors']=movie['actors'].apply(lambda x:','.join(str(x).split(',')[0:5]))
#Map the content rating by numbers- 0: for everyone, 5: adult only
rating={'NR':0, 'G':1, 'PG':2,'PG-13':3, 'R':4, 'NC17':5}
movie['content_rating']=movie['content_rating'].map(rating)
movie['year']=movie['original_release_date'].apply(lambda x: str(x).split('-')[0])

In [17]:
#Some movie misses a year so I replace the missing year by the streaming year

stream_year=movie['streaming_release_date'].apply(lambda x: str(x).split('-')[0])
movie['year']=movie['year'].replace('',stream_year[movie['year']==''])
#The rest I replace by 1956, which is the mean of year
movie['year']=movie['year'].replace('',1956)
movie['year']=movie['year'].astype(int)

In [18]:
#Calculate the weighted rating based on IMDB formula
def weighted_rating(x, m, C):
    v = x['audience_count']
    R = x['audience_rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [19]:
def get_recommendations(title, cosine_sim):
    # Get the index of the movie that matches the title
    
    #Find the most possible movie title if the input title name is slightly wrong
    movie_list = df_new[df_new['movie_title'].str.contains(title)]
    if len(movie_list):
        #In case of similar movie title like 'Iron Man' and 'Iron Man 2'
        if any(movie_list['movie_title']==title):
            movie_title=title
        else:
           # Pick the one with the highest audience rating
            movie_title=movie_list.sort_values(by=['audience_rating'], ascending=False)['movie_title'].iloc[0]
        
        print('Selected movie:',movie_title)
    
        #Some movies are duplicated such as Frozen has two version.
        #Pick the one with higher audience rating
        idx = indices[movie_title]
        if np.isscalar(idx)==False:
            idx=df_new.iloc[idx].sort_values(by=['audience_rating'], ascending=False).index[0]
        
        #Define the target movie content rating
        movie_content_rating=df_new.iloc[idx]['content_rating']
        
        movie_year=df_new.iloc[idx]['year']
        # Get the pairwsie similarity scores of all movies with that movie
        sim_scores = list(enumerate(cosine_sim[idx]))
        
        # Sort the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores of the 30 most similar movies
        sim_scores = sim_scores[1:30]
        
        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]
        
        #Calculate the weighted rating inside the list
        #Some movies are highly rated due to few audience counts
        #100 score with only 10 ppl rated does not mean a good movie
        selected_movies=df_new.iloc[movie_indices]
        audience_counts = selected_movies[selected_movies['audience_count'].notnull()]['audience_count'].astype('int')
        m = audience_counts.quantile(0.5)
        
        C=selected_movies['audience_rating'].mean()
        wr=selected_movies.apply(lambda x: weighted_rating(x,m,C), axis=1)
        
        df=pd.DataFrame(df_new[['movie_title','content_rating','year','audience_rating','audience_count']].iloc[movie_indices])

        df['Score']=[x[1] for x in sim_scores]
        df['wr']=wr
        #Product for similarity and rating so priortise to recommend high similarity and high rating movies
        df['mix_score']=df['audience_rating']*df['Score']
        
        #df['mix_score']=df['audience_rating']*df['Score']
        #Set the limit for number of audience count
        #At least higher than 50% quantile of the audience count in the whole document
        count_bound=movie['audience_count'].quantile(0.5)
        #Remove movie that is over the target movie content rating and with too few audience
        #ranked by the mix score
        df=df[(df['content_rating']<=movie_content_rating) & (df['audience_count']>count_bound) &(df['year']>movie_year-10)]
        df=df.sort_values('mix_score', ascending=False)
        
        # Return the top 10 most similar movies
        return df[['movie_title','year','Score', 'wr','audience_rating', 'audience_count']].head(10)

    else:
        print('This movie does not exist. Please check your input')

In [20]:
movie['all_feature']=movie['movie_info']+movie['genres']+movie['actors']+movie['directors']

In [21]:
chop_movie=movie.drop(['tomatometer_status','streaming_release_date','tomatometer_status', 'tomatometer_rating',
       'tomatometer_count','tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count','audience_status','runtime' ], axis=1)

In [22]:
df_new=pd.merge(df, chop_movie, how='inner', on=['rotten_tomatoes_link'])
indices = pd.Series(df_new.index, index=df_new['movie_title']).drop_duplicates()

In [23]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix_all = tf.fit_transform(df_new['all_feature'])
cosine_sim_all = linear_kernel(tfidf_matrix_all, tfidf_matrix_all)

In [24]:
get_recommendations('Iron Man', cosine_sim_all)

Selected movie: Iron Man


Unnamed: 0,movie_title,year,Score,wr,audience_rating,audience_count
8051,Iron Man 2,2010,0.259877,70.190409,71.0,480879.0
8052,Iron Man 3,2013,0.191389,76.310244,78.0,485128.0
2873,Avengers: Age of Ultron,2015,0.141748,79.406367,83.0,288708.0
2874,Avengers: Endgame,2019,0.106612,77.327586,90.0,70334.0
13002,Spider-Man: Homecoming,2017,0.101618,78.195561,87.0,108167.0
4069,Captain America: Civil War,2016,0.087815,82.164485,89.0,180162.0
2875,Avengers: Infinity War,2018,0.076309,76.642157,91.0,58720.0
12544,Sherlock Holmes: A Game of Shadows,2011,0.077091,73.364507,77.0,168495.0
5321,Dolittle,2020,0.073346,66.25254,76.0,11526.0
12846,The Soloist,2009,0.074162,57.824312,56.0,263355.0


## LDA

In [39]:
import time
from gensim.models import LdaMulticore
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models import TfidfModel

# train model
def train_lda(data, col):
    """
    This function trains the lda model
    We setup parameters like number of topics, the chunksize to use in Hoffman method
    We also do 2 passes of the data since this is a small dataset, so we want the distributions to stabilize
    """
    num_topics = 100
    chunksize = 300
    dictionary = corpora.Dictionary(data[col])
    corpus = [dictionary.doc2bow(doc) for doc in data[col]]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                       alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0, passes=2)
    t2 = time.time()
    print("Time to train LDA model on ", len(data), "articles: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda

In [40]:
# takes about ~2 mins
dictionary,corpus,lda = train_lda(df,'lemmatized')

Time to train LDA model on  16871 articles:  2.352600045998891 min


In [None]:
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# stop_words.extend(['movie','film'])

# def remove_stopwords(texts,stop_words):
#     full = []
#     for doc in texts:
#         row = []
#         for token in doc:
#             if token not in stop_words:
#                 row.append(token)
#         full.append(row)
#     return full

# df['lemmatized'] = remove_stopwords(df['lemmatized'].tolist(),stop_words)

## Tuning lda

In [None]:
from gensim.models.coherencemodel import CoherenceModel
num_topics = 100
chunksize = 300
dictionary = corpora.Dictionary(df['lemmatized'])
corpus = [dictionary.doc2bow(doc) for doc in df['lemmatized']]
lda = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                       alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0, passes=2)

In [None]:
coherence_model_lda = CoherenceModel(model=lda, texts=df['lemmatized'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
from gensim.models.coherencemodel import CoherenceModel
import tqdm
import gensim
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           chunksize=300,
                                           minimum_probability=0.0,
                                           passes=2,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['lemmatized'], dictionary=dictionary, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

# Alpha parameter
alpha=np.geomspace(1e-10, 1, 10)
# Beta parameter
beta=np.geomspace(1e-10, 1, 10)

model_results = {
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run (around 6 hours )
pbar = tqdm.tqdm(total=100)
    
    # iterate through validation corpuses
        # iterate through number of topics
            # iterate through alpha values
for a in alpha:
                # iterare through beta values
    for b in beta:
                    # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=dictionary, 
                                                  k=100, a=a, b=b)
                    # Save the model results
        model_results['Alpha'].append(a)
        model_results['Beta'].append(b)
        model_results['Coherence'].append(cv)
                    
        pbar.update(1)
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
pbar.close()

In [None]:
import tqdm
import gensim

# Alpha parameter
alpha=np.geomspace(1e-10, 1, 10)
# Beta parameter
beta=np.geomspace(1e-10, 1, 10)

model_results = {
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [None]:
# Can take a long time to run (around 6 hours )
pbar = tqdm.tqdm(total=100)
    
    # iterate through validation corpuses
        # iterate through number of topics
            # iterate through alpha values
for a in alpha:
                # iterare through beta values
    for b in beta:
                    # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=dictionary, 
                                                  k=100, a=a, b=b)
                    # Save the model results
        model_results['Alpha'].append(a)
        model_results['Beta'].append(b)
        model_results['Coherence'].append(cv)
                    
        pbar.update(1)
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
pbar.close()

In [None]:
tuning_result=pd.read_csv('lda_tuning_results.csv')
tuning_result.sort_values(by=['Coherence'], ascending=False)

In [None]:
lda = LdaMulticore(corpus=corpus, num_topics=100, id2word=dictionary,
                       alpha=4.641589e-04, eta=4.641589e-04, chunksize=300, minimum_probability=0.0, passes=2)

In [None]:
compute_coherence_values(corpus, dictionary, 100,4.641589e-04,4.641589e-04)

In [None]:
topic_number_test={'Topics':[], 'Coherence':[]}
topic_range=range(25,151, 25)
dictionary = corpora.Dictionary(df['lemmatized'])
corpus = [dictionary.doc2bow(doc) for doc in df['lemmatized']]
# Can take a long time to run
pbar = tqdm.tqdm(total=6)
    
    # iterate through validation corpuses
        # iterate through number of topics
for k in topic_range:
            # iterate through alpha values
    cv = compute_coherence_values(corpus=corpus, dictionary=dictionary, 
                                                  k=k, a=0.01, b=0.01)
                    # Save the model results
                #model_results['Validation_Set'].append(corpus_title[i])
    topic_number_test['Topics'].append(k)
    topic_number_test['Coherence'].append(cv)
                    
    pbar.update(1)
pd.DataFrame(topic_number_test)
pbar.close()

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda, corpus, dictionary)
LDAvis_prepared

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['movie','film','make', 'one'])

def remove_stopwords(texts,stop_words):
    full = []
    for doc in texts:
        row = []
        for token in doc:
            if token not in stop_words:
                row.append(token)
        full.append(row)
    return full

df['lemmatized'] = remove_stopwords(df['lemmatized'].tolist(),stop_words)

In [None]:
lda = LdaMulticore(corpus=corpus, num_topics=25, id2word=dictionary,
                       alpha=0.01, eta=0.01, chunksize=300, minimum_probability=0.0, passes=2)

In [None]:
compute_coherence_values(corpus, dictionary, 25,0.01,0.01)

In [None]:
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda, corpus, dictionary)
LDAvis_prepared

### Jensen-Shannon Distance

In [41]:
# reference: https://www.kaggle.com/ktattan/lda-and-document-similarity/data
from scipy.spatial import distance
def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M where M is the number of documents in the corpus
    """
    sim = [distance.jensenshannon(data,query) for data in matrix]
    return sim

def get_most_similar_documents(query,matrix,k=10):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    sim = jensen_shannon(query,matrix) # list of jensen shannon distances

    return np.argsort(sim)[:k] # the top k positional index of the smallest Jensen Shannon distances

In [42]:
# we need to use nested list comprehension here
doc_topic_dist = np.array([[tup[1] for tup in lst] for lst in lda[corpus]])

In [43]:
test_bow = dictionary.doc2bow(df.loc[idx]['lemmatized'])
test_doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=test_bow)])

NameError: name 'idx' is not defined

In [None]:
# get bow for iron man 2
test_bow = dictionary.doc2bow(df.loc[idx]['lemmatized'])
test_doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=test_bow)])
# get similarity
most_sim_ids = get_most_similar_documents(test_doc_distribution,doc_topic_dist)
# best recommendations
closest_ordering = {}
for i,x in enumerate(most_sim_ids):
    closest_ordering[x] = i
closest_ordering = pd.DataFrame.from_dict(closest_ordering,orient='index')
most_similar_df = df[df.index.isin(most_sim_ids)].reset_index()
most_similar_df = df.merge(closest_ordering, how='right', left_index=True, right_index=True).drop(columns=0)
most_similar_df

__Note:__ Much better results. Trying some other movies...

In [44]:
# turning it into a function:
def recommend(movie_name,n=10):
    # is it in the dataset?
    try:
        movie_index = df[df['rotten_tomatoes_link']=='m/{}'.format(movie_name)].index[0]
        test_bow = dictionary.doc2bow(df.loc[movie_index]['lemmatized'])
        test_doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=test_bow)])
        # get top n most similar movies
        most_sim_ids = get_most_similar_documents(test_doc_distribution,doc_topic_dist,k=n)
        # best recommendations
        closest_ordering = {}
        for i,x in enumerate(most_sim_ids):
            closest_ordering[x] = i
        closest_ordering = pd.DataFrame.from_dict(closest_ordering,orient='index')
        most_similar_df = df[df.index.isin(most_sim_ids)].reset_index()
        most_similar_df = df.merge(closest_ordering, how='right', left_index=True, right_index=True).drop(columns=0)
        return(most_similar_df)
    except:
        movie_index = df[df['rotten_tomatoes_link'].str.contains(movie_name)]['rotten_tomatoes_link'].tolist()
        print('Movie name not recognised. Did you mean one of: ', movie_index, '?')

In [45]:
recommend('zootopia',5)

Unnamed: 0,rotten_tomatoes_link,lemmatized,review_count,freshness,words
16867,m/zootopia,"[variety, cute, occasionally, slightly, scary,...",291,97.594502,3816
6446,m/fritz_the_cat,"[consider, un, cool, still, take, pleasure, vi...",15,46.666667,161
7929,m/indian_in_the_cupboard,"[wonderful, adaptation, classic, book, inventi...",7,85.714286,77
1048,m/1074108-101_dalmatians,"[one, great, disney, classic, league, snow, wh...",14,42.857143,164
15517,m/tinker_bell_and_the_lost_treasure,"[imaginative, idea, exquisite, execution, key,...",5,100.0,81


In [46]:
recommend('titanic',5)

Unnamed: 0,rotten_tomatoes_link,lemmatized,review_count,freshness,words
15525,m/titanic,"[event, cold, april, night, detailed, reconstr...",191,89.005236,2174
14837,m/the_overture,"[film, probably, lose, effect, bring, western,...",5,60.0,43
6964,m/greystoke_the_legend_of_tarzan_lord_of_the_apes,"[hugh, hudson, fashion, visually, resplendent,...",11,81.818182,114
1763,m/154,"[quite, conventional, pack, powerful, punch, l...",7,57.142857,100
6707,m/gladiator,"[joyous, return, cinematic, epic, spectacle, h...",197,76.649746,2026


# Next steps

- Add document size into the ranking
    - a more prolific movie should get higher ranking
    
    
- Add ratio of positive to negative reviews into the ranking
    - a "better" movie should get higher ranking
    
    
- Try a different similarity metric
    - perhaps `cosine` similarity

# Testing how to combine two scores- test case:Iron Man

In [53]:
#Testing on Iron Man
idx = indices['Iron Man']

In [54]:
#get the similarity score from cosine simlarity matrix
sim_scores = list(enumerate(cosine_sim_all[idx]))

In [55]:
#Movie that not in the movie desciption 
movie_drop=[]
for i in df['rotten_tomatoes_link']:
    if i not in df_new['rotten_tomatoes_link'].unique():
        movie_drop.append(i)
print(len(movie_drop))
# Create a new dataset with some movie dropped and merge with the movie description data
df2=df[~df['rotten_tomatoes_link'].isin(movie_drop)]
df_new=pd.merge(df2, chop_movie, how='inner', on=['rotten_tomatoes_link'])
indices = pd.Series(df_new.index, index=df_new['movie_title']).drop_duplicates()

6


In [56]:
#Rerun lda on the new review dataset
dictionary2,corpus2,lda2 = train_lda(df2,'lemmatized')

Time to train LDA model on  16865 articles:  2.301675339539846 min


In [57]:
doc_topic_dist2 = np.array([[tup[1] for tup in lst] for lst in lda2[corpus2]])

In [58]:
test_bow2 = dictionary2.doc2bow(df_new.loc[idx]['lemmatized'])
test_doc_distribution2 = np.array([tup[1] for tup in lda2.get_document_topics(bow=test_bow2)])
# get top n most similar movies
most_sim_ids2 = get_most_similar_documents(test_doc_distribution2,doc_topic_dist2,k=10)
# best recommendations
closest_ordering = {}
for i,x in enumerate(most_sim_ids2):
    closest_ordering[x] = i
closest_ordering = pd.DataFrame.from_dict(closest_ordering,orient='index')
most_similar_df = df_new[df_new.index.isin(most_sim_ids2)].reset_index()
most_similar_df = df_new.merge(closest_ordering, how='right', left_index=True, right_index=True).drop(columns=0)

In [59]:
#Getting the distance score
lda_score=jensen_shannon(test_doc_distribution2,doc_topic_dist2)

In [60]:
#Checking if the shape lda_score matrix is same as cosine_similarity matrix
len(lda_score)==len(cosine_sim_all[idx])

True

In [61]:
# Lower distance =better, while higher score= better--- Non-comparable
lda_score_df=pd.DataFrame(sorted(list(enumerate(lda_score)), key=lambda x:x[1])[1:], columns=['LDA_index', 'Distance'])
cos_score_df=pd.DataFrame(sorted(list(enumerate(cosine_sim_all[idx])), key=lambda x:x[1], reverse=True)[1:], columns=['COS_index','Score'])
#Iron Man Score
mix_df=pd.concat([lda_score_df, cos_score_df], axis=1)
mix_df

Unnamed: 0,LDA_index,Distance,COS_index,Score
0,8051,0.219978,8051,0.259877
1,8052,0.225843,8052,0.191389
2,9656,0.243542,2873,0.141748
3,2873,0.276263,2874,0.106612
4,16720,0.309203,12680,0.106365
...,...,...,...,...
16859,121,0.832217,16854,0.000000
16860,10585,0.832234,16857,0.000000
16861,6978,0.832237,16858,0.000000
16862,1862,0.832250,16860,0.000000


In [62]:
# 1-the lda score to make it comparable with cosine similarity
lda_score_inv=[1-x for x in lda_score]
lda_score_inv_df=pd.DataFrame(sorted(list(enumerate(lda_score_inv)), key=lambda x:x[1], reverse=True)[1:], 
                              columns=['LDA_index', 'Distance'])
mix_df2=pd.concat([lda_score_inv_df, cos_score_df], axis=1)
mix_df2

Unnamed: 0,LDA_index,Distance,COS_index,Score
0,8051,0.780022,8051,0.259877
1,8052,0.774157,8052,0.191389
2,9656,0.756458,2873,0.141748
3,2873,0.723737,2874,0.106612
4,16720,0.690797,12680,0.106365
...,...,...,...,...
16859,121,0.167783,16854,0.000000
16860,10585,0.167766,16857,0.000000
16861,6978,0.167763,16858,0.000000
16862,1862,0.167750,16860,0.000000


In [140]:
#For comparing
get_recommendations('Wonder Woman', cosine_sim_all)

Selected movie: Wonder Woman


Unnamed: 0,movie_title,year,Score,wr,audience_rating,audience_count
8417,Justice League,2017,0.098627,68.70927,71.0,127743.0
15947,Unstoppable,2010,0.089906,69.057419,72.0,104686.0
13088,Star Trek,2009,0.056687,89.383976,91.0,747806.0
1202,10 Cloverfield Lane,2016,0.059745,71.88622,79.0,60918.0
13095,Star Trek Into Darkness,2013,0.052545,85.66382,89.0,312836.0
16826,Z For Zachariah,2015,0.102695,59.619928,45.0,7525.0
13089,Star Trek Beyond,2016,0.0556,73.329395,80.0,74549.0
11146,People Like Us,2012,0.069095,62.060112,62.0,31935.0
15410,This Means War,2012,0.066061,58.016938,56.0,89752.0
8136,Jack Ryan: Shadow Recruit,2014,0.055554,56.697266,53.0,64773.0


### Only sum up the cosine similarity score and lda distance

In [64]:
sum_rank=sorted(list(enumerate((lda_score_inv+cosine_sim_all[idx]))), key=lambda x:x[1], reverse=True)
sum_movie_indices = [i[0] for i in sum_rank]
df_new.iloc[sum_movie_indices[1:10]][['movie_title','actors']]

Unnamed: 0,movie_title,actors
8051,Iron Man 2,"Robert Downey Jr., Gwyneth Paltrow, Don Cheadl..."
8052,Iron Man 3,"Robert Downey Jr., Gwyneth Paltrow, Don Cheadl..."
2873,Avengers: Age of Ultron,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
9656,Marvel's The Avengers,
2874,Avengers: Endgame,"Robert Downey Jr., Mark Ruffalo, Scarlett Joha..."
4069,Captain America: Civil War,"Chris Evans, Robert Downey Jr., Scarlett Johan..."
2875,Avengers: Infinity War,"Robert Downey Jr., Chris Hemsworth, Mark Ruffa..."
16720,X-Men: Days of Future Past,"Hugh Jackman, James McAvoy, Michael Fassbender..."
13002,Spider-Man: Homecoming,"Tom Holland (II), Michael Keaton, Robert Downe..."


In [65]:
# Only using the review
recommend('iron_man',10)

Unnamed: 0,rotten_tomatoes_link,lemmatized,review_count,freshness,words
8054,m/iron_man,"[iron, man, actually, get, away, whole, film, ...",279,93.548387,4003
8055,m/iron_man_2,"[sequel, go, one, acceptable, nothing, nothing...",297,72.053872,3926
8056,m/iron_man_3,"[black, instinctive, feel, balance, action, se...",324,79.320988,4285
2877,m/avengers_age_of_ultron,"[stake, line, action, adventure, sky, high, so...",367,75.749319,5156
9660,m/marvels_the_avengers,"[emotional, involvement, good, sci, fi, action...",355,91.549296,4579
16722,m/x2_xmen_united,"[may, sound, fine, dandy, make, busy, film, ev...",244,85.245902,2651
16726,m/x_men_days_of_future_past,"[well, complete, entertaining, man, movie, eve...",327,90.214067,4516
4074,m/captain_america_the_first_avenger,"[yet, another, superhero, pic, one, nicely, ro...",268,79.850746,3697
16734,m/xmen,"[perfect, unique, big, budget, comic, book, mo...",167,80.838323,1621
15429,m/thor,"[kenneth, branagh, thor, may, achieve, level, ...",286,76.923077,3683


## Adding weights on two scores

In [66]:
weight=0.4
weight_score=zip([weight*x for x in lda_score_inv],[(1-weight)*y for y in cosine_sim_all[idx]])
weighted_score=[x+y for x,y in weight_score]

In [67]:
weight_rank=sorted(list(enumerate(weighted_score)), key=lambda x:x[1], reverse=True)[1:]
weight_movie_indices = [i[0] for i in weight_rank]
df_new.iloc[weight_movie_indices[0:10]][['movie_title','actors']]

Unnamed: 0,movie_title,actors
8051,Iron Man 2,"Robert Downey Jr., Gwyneth Paltrow, Don Cheadl..."
8052,Iron Man 3,"Robert Downey Jr., Gwyneth Paltrow, Don Cheadl..."
2873,Avengers: Age of Ultron,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
9656,Marvel's The Avengers,
2874,Avengers: Endgame,"Robert Downey Jr., Mark Ruffalo, Scarlett Joha..."
4069,Captain America: Civil War,"Chris Evans, Robert Downey Jr., Scarlett Johan..."
13002,Spider-Man: Homecoming,"Tom Holland (II), Michael Keaton, Robert Downe..."
2875,Avengers: Infinity War,"Robert Downey Jr., Chris Hemsworth, Mark Ruffa..."
16720,X-Men: Days of Future Past,"Hugh Jackman, James McAvoy, Michael Fassbender..."
13007,Spider-Man 3,"Tobey Maguire, Kirsten Dunst, James Franco, Th..."


In [68]:
def combine_score_rating(target_idx, movie_indices, sim_scores):
    
    movie_content_rating=df_new.iloc[target_idx]['content_rating']  
    movie_year=df_new.iloc[target_idx]['year']
    
    selected_movies=df_new.iloc[movie_indices]
    audience_counts = selected_movies[selected_movies['audience_count'].notnull()]['audience_count'].astype('int')
    
    m = audience_counts.quantile(0.6)

    C=selected_movies['audience_rating'].mean()
    wr=selected_movies.apply(lambda x: weighted_rating(x,m,C), axis=1)

    df=pd.DataFrame(df_new[['movie_title','content_rating','year','audience_rating','audience_count']].iloc[movie_indices])
    df['Score']=[x[1] for x in sim_scores]
    df['wr']=wr
    #Product for similarity and rating so priortise to recommend high similarity and high rating movies
    
    df['mix_score']=df['wr']*df['Score']
    
    
    #Set the limit for number of audience count
    #At least higher than 50% quantile of the audience count in the whole document
    count_bound=df_new['audience_count'].quantile(0.5)
    #Remove movie that is over the target movie content rating and with too few audience
    #ranked by the mix score
    #Only select movie that is too old compared with the target movie, threshold is a decade-10 years.
    df=df[(df['content_rating']<=movie_content_rating) & (df['audience_count']>count_bound) & (df['year']>movie_year-10)]
    df=df.sort_values('mix_score', ascending=False)

    # Return the top 10 most similar movies
    return df[['movie_title','year','Score','audience_rating', 'audience_count']]

In [136]:
def recommend2(title, cosine_sim,weight=0.5, n=6):
    
    movie_list = df_new[df_new['movie_title'].str.contains(title)]
    if len(movie_list):
        #In case of similar movie title like 'Iron Man' and 'Iron Man 2'
        if any(movie_list['movie_title']==title):
            movie_title=title
        else:
           # Pick the one with the highest audience rating
            movie_title=movie_list.sort_values(by=['audience_rating'], ascending=False)['movie_title'].iloc[0]
        
        print('Selected movie:',movie_title)
    
        #Some movies are duplicated such as Frozen has two version.
        #Pick the one with higher audience rating
        idx = indices[movie_title]
        if np.isscalar(idx)==False:
            idx=df_new.iloc[idx].sort_values(by=['audience_rating'], ascending=False).index[0]

          
        
        test_bow2 = dictionary2.doc2bow(df_new.loc[idx]['lemmatized'])

        test_doc_distribution2 = np.array([tup[1] for tup in lda2.get_document_topics(bow=test_bow2)])
        lda_score=jensen_shannon(test_doc_distribution2,doc_topic_dist2)
        lda_score_inv=[1-x for x in lda_score]

        weight_score=zip([weight*x for x in lda_score_inv],[(1-weight)*y for y in cosine_sim[idx]])
        weighted_score=[x+y for x,y in weight_score]
        weight_rank=sorted(list(enumerate(weighted_score)), key=lambda x:x[1], reverse=True)[1:]

        weight_movie_indices = [i[0] for i in weight_rank]
        
        #Created another function for simplification
        #Adding constraints to the  movie recommendation list
        recommend=combine_score_rating(idx,weight_movie_indices, weight_rank)
        
        #recommend=df_new.iloc[weight_movie_indices]
        #return df_new.iloc[weight_movie_indices[0:10]][['movie_title','actors','audience_rating']]
        return recommend.head(n)
    else:
        print('No records in our database. Please check your input')

In [103]:
get_recommendations('Captain Marvel', cosine_sim_all)

Selected movie: Captain Marvel


Unnamed: 0,movie_title,year,Score,wr,audience_rating,audience_count
9656,Marvel's The Avengers,2012,0.070653,90.744832,91.0,1135962.0
13001,Spider-Man: Far From Home,2019,0.067559,90.805877,95.0,69242.0
8101,It's Kind of a Funny Story,2010,0.043967,64.427345,66.0,34122.0
3360,Big Game,2015,0.040471,46.224138,34.0,8975.0


In [139]:
recommend2('Inception', cosine_sim_all)

Selected movie: Inception


Unnamed: 0,movie_title,year,Score,audience_rating,audience_count
7997,Interstellar,2014,0.340554,86.0,175957.0
12940,Source Code,2011,0.320631,82.0,125552.0
9923,Minority Report,2002,0.311819,80.0,481543.0
9077,Limitless,2011,0.333149,74.0,108340.0
12408,Serenity,2005,0.252005,91.0,313208.0
11460,Primer,2004,0.296925,79.0,46477.0


In [124]:
recommend2('Catch Me If You Can' ,0.5,10, cosine_sim_all)

Selected movie: Catch Me If You Can


Unnamed: 0,movie_title,year,Score,audience_rating,audience_count
4942,Dave Chappelle: Sticks & Stones,2019,0.217577,99.0,40887.0
10702,Ocean's Eleven,2001,0.25078,80.0,32601771.0
6317,Forrest Gump,1994,0.20638,95.0,1244237.0
12293,Schizopolis,1996,0.285157,81.0,4931.0
12310,Scott Pilgrim vs. the World,2010,0.23388,84.0,141600.0
9172,Logan Lucky,2017,0.264394,76.0,27698.0
3813,Bridge of Spies,2015,0.224291,87.0,65466.0
16861,Zootopia,2016,0.207249,92.0,101511.0
2876,The Aviator,2004,0.236548,79.0,207578.0
16187,Walk the Line,2005,0.203983,90.0,545629.0


In [115]:
df_new.iloc[indices['The Pursuit of Happyness']]

rotten_tomatoes_link                                m/pursuit_of_happyness
lemmatized               [pursuit, happyness, speak, eloquently, anxiet...
review_count                                                           174
freshness                                                          67.2414
words                                                                 2208
movie_title                                       The Pursuit of Happyness
movie_info               Life is a struggle for single father Chris Gar...
critics_consensus        Will Smith's heartfelt performance elevates Th...
content_rating                                                           3
genres                                                               Drama
directors                                                 Gabriele Muccino
authors                                    Steven Conrad, Gabriele Muccino
actors                   Will Smith, Jaden Smith, Thandie Newton, Brian...
original_release_date    

In [142]:
recommend2('Captain Marvel' ,cosine_sim_all)

Selected movie: Captain Marvel


Unnamed: 0,movie_title,year,Score,audience_rating,audience_count
13001,Spider-Man: Far From Home,2019,0.354005,95.0,69242.0
9656,Marvel's The Avengers,2012,0.351873,91.0,1135962.0
16720,X-Men: Days of Future Past,2014,0.339873,91.0,277110.0
2874,Avengers: Endgame,2019,0.346505,90.0,70334.0
6987,Guardians of the Galaxy,2014,0.317435,92.0,255582.0
2686,Ant-Man,2015,0.329066,86.0,166901.0


In [126]:
recommend2('Wonder Woman' ,0.5,6, cosine_sim_all)

Selected movie: Wonder Woman


Unnamed: 0,movie_title,year,Score,audience_rating,audience_count
8417,Justice League,2017,0.341681,71.0,127743.0
13376,Man of Steel,2013,0.313933,75.0,448951.0
16660,Wonder Woman,2009,0.317992,78.0,8565.0
16720,X-Men: Days of Future Past,2014,0.213063,91.0,277110.0
6987,Guardians of the Galaxy,2014,0.203739,92.0,255582.0
9656,Marvel's The Avengers,2012,0.196959,91.0,1135962.0
