In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
movies = pd.read_csv("./data/movies_tmdb.csv")

movies['description'] = movies['overview'].fillna('')

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english', min_df=1)
tfidf_matrix = tf.fit_transform(movies['description'])

print(tfidf_matrix.shape)

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

movies = movies.reset_index()
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

def get_recommendations(title):
    idx = indices[title]
    if type(idx) != np.int64:
        if len(idx)>1:
            print("ALERT: Multiple values")
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

print("Recomendations Based on overview for movie Doctor Who: Last Christmas")
print(get_recommendations('Doctor Who: Last Christmas').head(10))

(4485, 113624)
Recomendations Based on overview for movie Doctor Who: Last Christmas
2489                Silent Night, Deadly Night
911                     Santa Claus: The Movie
120                     Miracle on 34th Street
121                     Miracle on 34th Street
3474                          Arthur Christmas
2855           How the Grinch Stole Christmas!
1624                    Ernest Saves Christmas
1                                      Jumanji
2393                                    Taxi 3
4203    Doctor Who: The Husbands of River Song
Name: title, dtype: object


In [3]:
popularity_df = movies[['popularity', 'vote_average', 'vote_count']]

movies['description_genre'] = movies['overview'] + 2*movies['genres']
movies['description_genre'] = movies['description_genre'].fillna('')

tf_new = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix_new = tf_new.fit_transform(movies['description_genre'])

cosine_sim_new = linear_kernel(tfidf_matrix_new, tfidf_matrix_new)

movies = movies.reset_index()
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])
indices.head(2)

def get_recommendations_new(title):
    idx = indices[title]
    if type(idx) != np.int64:
        if len(idx)>1:
            print("ALERT: Multiple values")
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim_new[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]
print("Recomendations Based on overview and genre for movie Doctor Who: Last Christmas")
print(get_recommendations_new('Doctor Who: Last Christmas').head(10))

Recomendations Based on overview and genre for movie Doctor Who: Last Christmas
4188    Wizards of Waverly Place: The Movie
3614              It's Such a Beautiful Day
4460                      A Wrinkle in Time
1554                                  Krull
3122    The Butterfly Effect 3: Revelations
3943                      X-Men: Apocalypse
4326           Rogue One: A Star Wars Story
1146               Pokémon: The First Movie
769                Honey, I Shrunk the Kids
3987               Marvel One-Shot: Item 47
Name: title, dtype: object


In [4]:
vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

m = vote_counts.quantile(0.95)

def weighted_rating(x):
    v = int(x['vote_count'])
    R = int(x['vote_average'])
    return (v/(v+m) * R) + (m/(m+v) * C)

def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_new[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies_x = movies.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = movies_x[movies_x['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies_x[movies_x['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies_x[(movies_x['vote_count'] >= m) & (movies_x['vote_count'].notnull()) &
                       (movies_x['vote_average'].notnull())]
    qualified.loc[:, 'wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

print("Recomendations Based on weighted rating, overview and genre for movie Doctor Who: Last Christmas")
print(improved_recommendations('Doctor Who: Last Christmas')['title'].head(10))

Recomendations Based on weighted rating, overview and genre for movie Doctor Who: Last Christmas
4326          Rogue One: A Star Wars Story
757                                   Dune
3287                            Mr. Nobody
1274    Close Encounters of the Third Kind
1736                                 K-PAX
769               Honey, I Shrunk the Kids
3943                     X-Men: Apocalypse
3660                          Man of Steel
4429              Star Wars: The Last Jedi
4460                     A Wrinkle in Time
Name: title, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [5]:
import gradio as gr
def prediction(text):
    return ", ".join(improved_recommendations(text)['title'].head(10))
examples = ["Avatar"]

# Define the Gradio interface
iface = gr.Interface(fn=prediction, inputs="text", outputs="text",examples=examples)
# Launch the Gradio interface
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/pandas/core/indexes/base.py", line 3361, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 103, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 135, in pandas._libs.index.IndexEngine._get_loc_duplicates
  File "pandas/_libs/index.pyx", line 143, in pandas._libs.index.IndexEngine._maybe_get_bool_indexer
  File "pandas/_libs/index.pyx", line 161, in pandas._libs.index.IndexEngine._unpack_bool_indexer
KeyError: 'Star Wars'

The above exception was the direct cause of the followi