In [1]:
# import standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import time
from gensim.models import LdaMulticore, LdaModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models import TfidfModel
import pickle
import wikipedia



In [2]:
# Load the combined movie table. The 'lemmatized' column holds one
# comma-separated string per row; turn each into a list of tokens.
df = pd.read_csv('data/combined_df.csv')
df['lemmatized'] = df['lemmatized'].map(lambda joined: joined.split(','))

FileNotFoundError: [Errno 2] File data/combined_df.csv does not exist: 'data/combined_df.csv'

In [5]:
# Load the precomputed artifacts used by the recommender:
#   doc_topic_dist - passed to jensen_shannon() as the corpus-wide matrix
#   cosine_sim_all - passed to recommend2() as the cosine similarity lookup
# The `with` blocks close each file handle as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open("data/dict.pickle", "rb") as dict_dist:
    doc_topic_dist = pickle.load(dict_dist)
with open("data/cosine.pickle", "rb") as cosine_pickle:
    cosine_sim_all = pickle.load(cosine_pickle)

In [6]:
# Load the trained LDA model together with the pickled corpus and dictionary.
# The `with` blocks close each file handle as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
lda = LdaModel.load('LDA/lda_combined_review')
with open("LDA/corpus.pickle", "rb") as corpus_pickle:
    corpus = pickle.load(corpus_pickle)
with open("LDA/dictionary.pickle", "rb") as dictionary_pickle:
    dictionary = pickle.load(dictionary_pickle)

In [7]:
def weighted_rating(x, m, C):
    """
    Weighted rating in the style of the IMDB formula.

    Blends the row's own rating with the prior ``C``; the row's rating gets
    weight v/(v+m) and the prior gets weight m/(m+v), where v is the row's
    audience count.

    x : mapping or row exposing 'audience_count' and 'audience_rating'
    m : audience-count constant controlling how strongly the prior pulls
    C : prior rating mixed in for rows with few ratings
    """
    votes = x['audience_count']
    rating = x['audience_rating']
    own_part = votes / (votes + m) * rating
    prior_part = m / (m + votes) * C
    return own_part + prior_part

In [8]:
# reference: https://www.kaggle.com/ktattan/lda-and-document-similarity/data
from scipy.spatial import distance
def jensen_shannon(query, matrix):
    """
    Jensen-Shannon distance between ``query`` and every row of ``matrix``.

    ``query`` is one distribution (e.g. the LDA topic distribution of a
    document) and ``matrix`` is an iterable of distributions, one per document.
    Returns a list with one distance per row of ``matrix``, in row order;
    smaller values mean the row is closer to ``query``.
    """
    distances = []
    for row in matrix:
        distances.append(distance.jensenshannon(row, query))
    return distances

def get_most_similar_documents(query,matrix,k=10):
    """
    Return the positional indices of the ``k`` rows of ``matrix`` that have
    the smallest Jensen-Shannon distance to ``query``, closest first.
    """
    distances = jensen_shannon(query, matrix)  # one distance per row of matrix
    closest_first = np.argsort(distances)      # ascending, so nearest rows lead
    return closest_first[:k]

In [9]:
def combine_score_rating(target_idx, movie_indices, sim_scores):
    """
    Filter and rank candidate movies by similarity combined with weighted rating.

    target_idx    : position (as used by ``df.iloc``) of the movie being matched.
    movie_indices : positions of the candidate movies, in the same order as
                    ``sim_scores``.
    sim_scores    : sequence of pairs; the second element of each pair is taken
                    as the similarity score of the corresponding candidate.

    Returns a DataFrame with columns 'movie_title', 'year', 'Score',
    'audience_rating' and 'audience_count' holding every candidate that passes
    the filters, ordered by similarity * weighted rating (highest first).
    The result is not truncated; the caller decides how many rows to show.
    """
    target_row = df.iloc[target_idx]
    movie_content_rating = target_row['content_rating']
    movie_year = target_row['year']

    candidates = df.iloc[movie_indices]

    # m: median audience count over candidates that have one.
    known_counts = candidates.loc[candidates['audience_count'].notnull(), 'audience_count'].astype('int')
    m = known_counts.quantile(0.5)
    # C: mean audience rating over the candidates.
    C = candidates['audience_rating'].mean()
    wr = candidates.apply(lambda row: weighted_rating(row, m, C), axis=1)

    kept_columns = ['movie_title', 'content_rating', 'year', 'audience_rating', 'audience_count']
    selected_df = pd.DataFrame(df[kept_columns].iloc[movie_indices])
    selected_df['Score'] = [pair[1] for pair in sim_scores]
    selected_df['wr'] = wr
    # Multiply similarity by weighted rating so that movies that are both
    # similar and well rated come out on top.
    selected_df['mix_score'] = selected_df['wr'] * selected_df['Score']

    # Popularity floor: the median audience count over the whole table.
    count_bound = df['audience_count'].quantile(0.5)

    # Keep a candidate only if its content rating does not exceed the target's,
    # its audience count is above the floor, and it is not more than a decade
    # older than the target movie.
    rating_ok = selected_df['content_rating'] <= movie_content_rating
    popular_enough = selected_df['audience_count'] > count_bound
    recent_enough = selected_df['year'] > movie_year - 10
    selected_df = selected_df[rating_ok & popular_enough & recent_enough]

    ranked = selected_df.sort_values('mix_score', ascending=False)
    return ranked[['movie_title', 'year', 'Score', 'audience_rating', 'audience_count']]

In [10]:
def recommend2(title, cosine_sim,weight=0.5, n=6):
    """
    Print up to ``n`` movie recommendations for the movie matching ``title``.

    title      : full or partial movie title, matched as a literal substring.
    cosine_sim : lookup such that ``cosine_sim[idx]`` yields the cosine
                 similarities between movie ``idx`` and every movie in ``df``.
    weight     : share given to the LDA (Jensen-Shannon) score; the cosine
                 score receives ``1 - weight``.
    n          : maximum number of recommendations to print.

    Prints the selected movie followed by the recommendations, or a
    "no records" message when nothing matches. Returns None.

    Relies on the module-level ``df``, ``dictionary``, ``lda`` and
    ``doc_topic_dist``. Assumes ``df`` has the default 0..N-1 index, because
    the same ``idx`` is used both as a label (``df.loc``) and as a position
    (``df.iloc``, ``cosine_sim[idx]``).
    """
    indices = pd.Series(df.index, index=df['movie_title']).drop_duplicates()

    # Literal substring match: regex=False stops characters such as "(", "?" or
    # "+" in a title from being parsed as a pattern, and na=False keeps rows
    # with a missing title out of the boolean mask.
    movie_list = df[df['movie_title'].str.contains(title, regex=False, na=False)]
    if len(movie_list) == 0:
        print('No records in our database. Please check your input')
        return

    # In case of similar movie titles like 'Iron Man' and 'Iron Man 2':
    # prefer the exact title, otherwise the match with the highest audience rating.
    if (movie_list['movie_title'] == title).any():
        movie_title = title
    else:
        movie_title = movie_list.sort_values(by=['audience_rating'], ascending=False)['movie_title'].iloc[0]

    print('Selected movie:',movie_title)

    # Some movies are duplicated (e.g. Frozen has two versions);
    # pick the one with the higher audience rating.
    idx = indices[movie_title]
    if not np.isscalar(idx):
        idx = df.iloc[idx].sort_values(by=['audience_rating'], ascending=False).index[0]

    test_bow = dictionary.doc2bow(df.loc[idx]['lemmatized'])

    # Build a dense topic vector with one slot per topic. By default
    # get_document_topics omits low-probability topics, which would shorten
    # the vector and misalign it with the rows of doc_topic_dist.
    test_doc_distribution = np.zeros(lda.num_topics)
    for topic_id, prob in lda.get_document_topics(bow=test_bow, minimum_probability=0.0):
        test_doc_distribution[topic_id] = prob

    # Turn the Jensen-Shannon distance into a similarity so it can be blended
    # with the cosine similarity.
    lda_score = jensen_shannon(test_doc_distribution, doc_topic_dist)
    lda_score_inv = [1 - x for x in lda_score]

    weighted_score = [
        weight * lda_sim + (1 - weight) * cos_sim
        for lda_sim, cos_sim in zip(lda_score_inv, cosine_sim[idx])
    ]
    ranked = sorted(enumerate(weighted_score), key=lambda pair: pair[1], reverse=True)
    # Drop the target movie by index instead of assuming it is ranked first.
    weight_rank = [pair for pair in ranked if pair[0] != idx]
    weight_movie_indices = [pair[0] for pair in weight_rank]

    # Apply the content-rating / popularity / release-year constraints and rank.
    recommend = combine_score_rating(idx, weight_movie_indices, weight_rank)

    print('=====================')
    # The filters may leave fewer than n rows; never index past the end.
    for i in range(min(n, len(recommend))):
        recommend_movie = recommend.iloc[i]['movie_title']
        movie_year = recommend.iloc[i]['year']
        print(f"{i+1}. {recommend_movie}, {movie_year}")

In [11]:
# Example run: recommendations for 'Inception' using the precomputed cosine
# similarities, with the default weight (0.5) and the default n (6).
recommend2('Inception', cosine_sim_all)

Selected movie: Inception
1. Interstellar, 2014
2. Source Code, 2011
3. Catch Me If You Can, 2002
4. Avatar, 2009
5. Gravity, 2013
6. Minority Report, 2002


In [32]:
import requests
import json

def get_wiki_main_image(title, timeout=10):
    """
    Query the English Wikipedia API for the page image and page terms of ``title``.

    title   : page title to look up.
    timeout : seconds to wait for the server before giving up, so a stalled
              connection cannot hang the notebook.

    Returns the decoded JSON response as a dict. The full response is returned
    rather than a single image URL; callers should check whether an image
    entry is present before reading it.

    Raises requests.HTTPError on a 4xx/5xx response, and other
    requests.RequestException subclasses on connection problems or timeouts.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action' :'query',
        'format' : 'json',
        'formatversion' : 2,
        'prop' : 'pageimages|pageterms',
        'piprop' : 'original',
        'titles' : title
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
# Example lookup; the cell displays the decoded API response.
get_wiki_main_image('Inception')

{'batchcomplete': True,
 'query': {'pages': [{'pageid': 23270459,
    'ns': 0,
    'title': 'Inception',
    'terms': {'label': ['Inception'],
     'description': ['2010 science fiction film directed by Christopher Nolan']}}]}}

In [30]:
# Alternative approach: list the image URLs that the `wikipedia` package
# reports for the page resolved from the query "Inception movie".
wikipedia.page("Inception movie").images

['https://upload.wikimedia.org/wikipedia/commons/3/3e/Emma_Thomas_%26_Christopher_Nolan_at_WonderCon_2010_1.JPG',
 'https://upload.wikimedia.org/wikipedia/commons/3/34/Impossible_staircase.svg',
 'https://upload.wikimedia.org/wikipedia/commons/4/43/InceptionCastPremiereJuly10.jpg',
 'https://upload.wikimedia.org/wikipedia/commons/f/fa/Wikiquote-logo.svg',
 'https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg',
 'https://upload.wikimedia.org/wikipedia/en/2/2e/Inception_%282010%29_theatrical_poster.jpg',
 'https://upload.wikimedia.org/wikipedia/en/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg',
 'https://upload.wikimedia.org/wikipedia/en/1/1b/Semi-protection-shackle.svg',
 'https://upload.wikimedia.org/wikipedia/en/9/96/Symbol_category_class.svg',
 'https://upload.wikimedia.org/wikipedia/en/9/94/Symbol_support_vote.svg',
 'https://upload.wikimedia.org/wikipedia/en/e/e7/Video-x-generic.svg']

In [None]:
# NOTE(review): unfinished cell - `wiki` is not defined anywhere in the visible
# notebook, so running this raises NameError. Complete or remove it.
wiki