# Import statements

In [2]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pylab import rcParams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#%reload_ext pycodestyle_magic
#%pycodestyle_on
#%pycodestyle_off

In [3]:
MAIN_FOLDER = 'c:/Users/champ/Python_proj/'
MWML_FOLDER = 'made_with_ml_repo/temp/moviebuddy/meta_data/'
FILE = "filtered.csv"


def prep_db(file, file_type):
    '''
    This module reads and prepares the db : cleans,
    deletes the columns with 'Unnamed' values,
    fills NaN values
    input :
    file # the movieset file
    file type # csv file type
    output :
    df # read movie set dataframe
    '''
    if file_type == 'csv':
        df = pd.read_csv(file, sep=';', dtype=str)  # Read CSV File
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df = df.fillna('')  # clean the data - get rid of NaN
    df = df.astype(str)  # change the default data type to string
    return(df)


movies = prep_db(MAIN_FOLDER + MWML_FOLDER + FILE, 'csv')

# Load movies

In [4]:
# movies = pd.read_csv("/content/drive/My Drive/MovieRecommender/filtered.csv", header=0, sep=';')
# movies = pd.read_csv("/content/drive/My Drive/MovieRecommender/filtered.csv", header=0, sep=';')
# movies = movies.replace({np.nan: None})

In [None]:
''' With previously used data given by MadewithML

# Load movies
movies = pd.read_csv("/content/drive/My Drive/MovieRecommender/filtered.csv", header=0, parse_dates=['date'])
movies = movies.replace({np.nan: None}) # replace NaN with None 

'''

# User defined functions

In [5]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

# EDA

In [6]:
movies.head()

Unnamed: 0,id,title,tagline,description,genres,keywords,date,collection,runtime,revenue,...,director,cast,production_companies,production_countries,popularity,average_vote,num_votes,language,imdb_id,poster_url
0,460135,LEGO DC Super Hero Girls: Brain Drain,,"When Supergirl, Wonder Woman, Batgirl, Bumbleb...",animation,"superhero, lego",2017-08-30,DC Super Hero Girls Collection,0.0,0.0,...,Todd Grimes,"Grey Griffin, Tara Strong, Anais Fairweather, ...",Warner Bros. Animation,United States of America,8.413734,10.0,2.0,en,tt7158814,/niLX2txdI5GlVowJlnb5Hr26QpK.jpg
1,464207,The Truth Is in the Stars,,"William Shatner sits down with scientists, inn...",documentary,"nature, science, canadian movie",2017-05-01,,86.0,0.0,...,Craig Thompson,"William Shatner, Neil deGrasse Tyson, Chris Ha...",,Canada,1.075249,7.5,2.0,en,tt7104950,/tqEtNlOmEAJKrJGGrkGMyLjfRrq.jpg
2,463800,Firebase,,"Set during the Vietnam war, Firebase follows A...","action, science fiction, war","vietnam war, short",2017-06-28,,27.0,0.0,...,Neill Blomkamp,"Steve Boyle, Nic Rhind, Robert Hobbs, Chris Wi...",Oats Studio,Canada,2.129137,7.3,28.0,en,tt7078926,/e6qVOjp3QKHrA7k2sAbJSlDXkRM.jpg
3,464111,Zygote,,"Stranded in an Arctic mine, two survivors are ...","horror, science fiction",,2017-07-12,,23.0,0.0,...,Neill Blomkamp,"Dakota Fanning, Jose Pablo Cantillo",Oats Studio,Canada,3.214001,7.2,28.0,en,tt7078780,/rjm7KaiLsEwRcxnlsaX9ryzzK1B.jpg
4,462108,Chris D'Elia: Man on Fire,,Unbridled comic Chris D'Elia reconsiders his a...,comedy,stand-up comedy,2017-06-27,,65.0,0.0,...,Bill D'Elia,Chris D'Elia,,United States of America,0.482896,5.8,6.0,en,tt7068896,/dZgs1Ym7fe3fPxbeT5hq9XKLaDW.jpg


In [None]:
'''
# Function to display all rows and columns
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)
'''

'\n# Function to display all rows and columns\ndef display_all(df):\n    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): \n        display(df)\n'

In [7]:
display_all(movies.tail().T)

Unnamed: 0,45411,45412,45413,45414,45415
id,105158,16624,88013,16612,47116
title,Edison Kinetoscopic Record of a Sneeze,Blacksmith Scene,Poor Pierrot,Carmencita,The Winner
tagline,,,,,
description,A man (Thomas Edison's assistant) takes a pinc...,Three men hammer on an anvil and pass a bottle...,"One night, Arlequin come to see his lover Colo...",The first woman to appear in front of an Ediso...,Tou former boxers meet in the ring again after...
genres,documentary,drama,"comedy, animation",documentary,"drama, romance, foreign"
keywords,,"blacksmith, beer, workmen",short,"dancer, silent film",
date,1894-01-09,1893-05-08,1892-10-28,1894-03-14,1979-03-09
collection,,,,,
runtime,1.0,1.0,4.0,1.0,78.0
revenue,0.0,0.0,0.0,0.0,0.0


Observations:
- Empty values in cell. Specifically,
    - Some movies do not have taglines, keywords, date collection, revenue, budget, production companies.
- Values in genre, keywords columns are stored as one string, needs to be split and stored as lists. (#Data cleaning1)

In [8]:
display_all(movies.describe(include='all').T)

Unnamed: 0,count,unique,top,freq
id,45416,45416,63065,1
title,45416,42264,Cinderella,11
tagline,45416,20280,,25019
description,45416,44292,,952
genres,45416,4065,drama,4996
keywords,45416,25987,,14328
date,45416,17334,2008-01-01,136
collection,45416,1694,,40930
runtime,45416,354,90.0,2555
revenue,45416,6864,0.0,38015


In [9]:
movies.dtypes

id                      object
title                   object
tagline                 object
description             object
genres                  object
keywords                object
date                    object
collection              object
runtime                 object
revenue                 object
budget                  object
director                object
cast                    object
production_companies    object
production_countries    object
popularity              object
average_vote            object
num_votes               object
language                object
imdb_id                 object
poster_url              object
dtype: object

#User profile

In [10]:
movies.shape

(45416, 21)

In [11]:
movies_subset = movies.dropna(subset=['keywords'])
movies_subset = movies_subset.reset_index(drop=True)

In [12]:
movies_subset.head(10)

Unnamed: 0,id,title,tagline,description,genres,keywords,date,collection,runtime,revenue,...,director,cast,production_companies,production_countries,popularity,average_vote,num_votes,language,imdb_id,poster_url
0,460135,LEGO DC Super Hero Girls: Brain Drain,,"When Supergirl, Wonder Woman, Batgirl, Bumbleb...",animation,"superhero, lego",2017-08-30,DC Super Hero Girls Collection,0.0,0.0,...,Todd Grimes,"Grey Griffin, Tara Strong, Anais Fairweather, ...",Warner Bros. Animation,United States of America,8.413734,10.0,2.0,en,tt7158814,/niLX2txdI5GlVowJlnb5Hr26QpK.jpg
1,464207,The Truth Is in the Stars,,"William Shatner sits down with scientists, inn...",documentary,"nature, science, canadian movie",2017-05-01,,86.0,0.0,...,Craig Thompson,"William Shatner, Neil deGrasse Tyson, Chris Ha...",,Canada,1.075249,7.5,2.0,en,tt7104950,/tqEtNlOmEAJKrJGGrkGMyLjfRrq.jpg
2,463800,Firebase,,"Set during the Vietnam war, Firebase follows A...","action, science fiction, war","vietnam war, short",2017-06-28,,27.0,0.0,...,Neill Blomkamp,"Steve Boyle, Nic Rhind, Robert Hobbs, Chris Wi...",Oats Studio,Canada,2.129137,7.3,28.0,en,tt7078926,/e6qVOjp3QKHrA7k2sAbJSlDXkRM.jpg
3,464111,Zygote,,"Stranded in an Arctic mine, two survivors are ...","horror, science fiction",,2017-07-12,,23.0,0.0,...,Neill Blomkamp,"Dakota Fanning, Jose Pablo Cantillo",Oats Studio,Canada,3.214001,7.2,28.0,en,tt7078780,/rjm7KaiLsEwRcxnlsaX9ryzzK1B.jpg
4,462108,Chris D'Elia: Man on Fire,,Unbridled comic Chris D'Elia reconsiders his a...,comedy,stand-up comedy,2017-06-27,,65.0,0.0,...,Bill D'Elia,Chris D'Elia,,United States of America,0.482896,5.8,6.0,en,tt7068896,/dZgs1Ym7fe3fPxbeT5hq9XKLaDW.jpg
5,461634,Rory Scovel Tries Stand-Up for the First Time,,Comedian Rory Scovel storms the stage in Atlan...,comedy,stand-up comedy,2017-06-20,,66.0,0.0,...,Scott Moran,Rory Scovel,Netflix,United States of America,0.593041,8.0,6.0,en,tt7044010,/z4ppR4BpnXUQhIDIvQKTq0pAi6Z.jpg
6,461955,Rakka,,"""Rakka"" is the story of broken humanity follow...","action, science fiction","digital film, film, short, experimental",2017-06-14,,22.0,0.0,...,Neill Blomkamp,"Sigourney Weaver, Eugene Khumbanyiwa, Robert H...",Oats Studio,Canada,2.569474,7.4,44.0,en,tt6990734,/4wVeP9BHsQul7CRgdrfiUt1Rds1.jpg
7,460822,"Oh, Hello: On Broadway",,Two delusional geriatrics reveal curious pasts...,comedy,broadway,2017-06-13,,102.0,0.0,...,Michael John Warren,"Nick Kroll, John Mulaney, Steve Martin, Matthe...",Radical Media,United States of America,1.208618,6.8,13.0,en,tt6987652,/4sgAcBkhKoazB4xFwiYfWIdLMNf.jpg
8,461257,Queerama,,50 years after decriminalisation of homosexual...,,,2017-06-09,,75.0,0.0,...,Daisy Asquith,,,United Kingdom,0.163015,0.0,0.0,en,tt6980792,/s5UkZt6NTsrS7ZF0Rh8nzupRlIU.jpg
9,457470,Site Unseen: An Emma Fielding Mystery,,"Brilliant, dedicated, and driven, archaeologis...",mystery,,2017-06-04,,90.0,0.0,...,Douglas Barr,"Courtney Thorne-Smith, James Tupper, Adam DiMa...",Muse Entertainment,United States of America,0.86016,6.0,2.0,en,tt6973942,/iqO9ITNYfTHafJ8e1Y4buZ1Vfzp.jpg


In [13]:
print(movies.shape)
print(movies_subset.shape)

(45416, 21)
(45416, 21)


In [14]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_subset.keywords)

In [15]:
tfidf_np_matrix = tfidf_matrix.toarray()

In [16]:
tfidf_np_matrix.shape

(45416, 12595)

In [17]:
tfidf_np_matrix[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [33]:
indices = pd.Series(movies_subset.title)
indices[:5]

0    LEGO DC Super Hero Girls: Brain Drain
1                The Truth Is in the Stars
2                                 Firebase
3                                   Zygote
4                Chris D'Elia: Man on Fire
Name: title, dtype: object

In [31]:
def recommendations(titles, tfidf_np_matrix=tfidf_np_matrix):
  
    idx = []
    recommended_movies=[]

    for title in titles:
        idx.append(indices[indices==title].index[0])
    print(idx)
    user_pref_vector = tfidf_np_matrix[idx].mean(axis=0)
    print(user_pref_vector )
    cosine_sim = cosine_similarity(tfidf_matrix,np.atleast_2d(user_pref_vector))
    df_cosine_sim = pd.DataFrame(cosine_sim,columns=['sim_score'])
    df_cosine_sim = df_cosine_sim.sort_values(by='sim_score',ascending=False)
    print(df_cosine_sim.head(20))
    top_10_indexes = list(df_cosine_sim.iloc[0:(10+len(titles))].index)
    print(top_10_indexes)
    for n,i in enumerate(top_10_indexes):
        if n < len(titles):
            continue
        recommended_movies.append([movies_subset.title.iloc[i],
                                   movies_subset.genres.iloc[i],
                                   movies_subset.keywords.iloc[i],
                                   movies_subset.popularity.iloc[i],
                                   movies_subset.average_vote.iloc[i],
                                   movies_subset.num_votes.iloc[i],
                                   df_cosine_sim.sim_score.iloc[n]])

    return pd.DataFrame(recommended_movies,columns=['movie_title','genres','keywords','popularity','average_vote','num_votes','cosine_score'])

In [34]:
def recommendations(titles, tfidf_np_matrix=tfidf_np_matrix):
    idx = []
    recommended_movies = []

    for title in titles:
        idx.append(indices[indices == title].index[0])
    print(idx)
    user_pref_vector = tfidf_np_matrix[idx].mean(axis=0)
    print(user_pref_vector)
    cosine_sim = cosine_similarity(tfidf_matrix,
                                   np.atleast_2d(user_pref_vector))
    df_cosine_sim = pd.DataFrame(cosine_sim, columns=['sim_score'])
    df_cosine_sim = df_cosine_sim.sort_values(by='sim_score', ascending=False)
    print(df_cosine_sim.head(20))
    top_10_indexes = list(df_cosine_sim.iloc[0:(10+len(titles))].index)
    print(top_10_indexes)
    for n, i in enumerate(top_10_indexes):
        if n < len(titles):
            continue
        recommended_movies.append([movies_subset.title.iloc[i],
                                   movies_subset.genres.iloc[i],
                                   movies_subset.keywords.iloc[i],
                                   movies_subset.popularity.iloc[i],
                                   movies_subset.average_vote.iloc[i],
                                   movies_subset.num_votes.iloc[i],
                                   df_cosine_sim.sim_score.iloc[n]])

    return pd.DataFrame(recommended_movies, columns=['movie_title',
                                                     'genres',
                                                     'keywords',
                                                     'popularity',
                                                     'average_vote',
                                                     'num_votes',
                                                     'cosine_score'])

In [36]:
recommendations(['Toy Story', 'The Dark Knight','The Dark Knight Rises','Ace Ventura: When Nature Calls'], tfidf_np_matrix=tfidf_np_matrix)

[28363, 7021, 12632, 28875]
[0. 0. 0. ... 0. 0. 0.]
       sim_score
28875   0.577350
12632   0.577350
28363   0.577350
3610    0.355135
11748   0.354092
26861   0.326043
36387   0.309664
27473   0.303791
18901   0.302014
32227   0.287942
21757   0.279752
29951   0.271053
20842   0.264049
31837   0.263899
18065   0.255984
32367   0.241639
10970   0.241020
10265   0.239702
40233   0.236827
31048   0.236061
[28875, 12632, 28363, 3610, 11748, 26861, 36387, 27473, 18901, 32227, 21757, 29951, 20842, 31837]


Unnamed: 0,movie_title,genres,keywords,popularity,average_vote,num_votes,cosine_score
0,Barbie and the Three Musketeers,"animation, family",based on toy,6.670164999999999,6.2,81.0,0.354092
1,Small Soldiers,"comedy, adventure, fantasy, science fiction, a...","defense industry, toy shop, technical toy, sol...",10.03936,6.2,522.0,0.326043
2,Animals Are Beautiful People,"comedy, documentary, family","africa, animal",1.582132,7.5,21.0,0.309664
3,Home Alone 3,"comedy, family","parent child relationship, burglar, child hero...",8.04209,5.1,632.0,0.303791
4,Toy Story 3,"animation, family, comedy","hostage, college, toy, barbie, animation, esca...",16.96647,7.6,4710.0,0.302014
5,Dolls,"fantasy, horror","toy, gore, storm, doll, toy maker",4.545885,6.1,80.0,0.287942
6,Young Black Stallion,"adventure, drama, family","world war ii, human animal relationship, horse...",1.269123,5.4,18.0,0.279752
7,Toys,"fantasy, comedy, science fiction","brother brother relationship, loss of brother,...",5.923774,5.0,173.0,0.271053
8,Father and Son,drama,human relationship,1.559697,5.0,10.0,0.264049
9,Child's Play,"horror, thriller","gun, birthday, voodoo, toy, stalker, murder, b...",19.827546,6.3,605.0,0.263899


In [None]:
movies_subset[movies_subset.title.str.contains('Ace Ventura',case=False)==True]

Unnamed: 0,id,title,tagline,description,genres,keywords,date,collection,runtime,revenue,budget,director,cast,production_companies,production_countries,popularity,average_vote,num_votes,language,imdb_id,poster_url
17,9273,Ace Ventura: When Nature Calls,New animals. New adventures. Same hair.,"Summoned from an ashram in Tibet, Ace finds hi...","crime, comedy, adventure","africa, indigenous, human animal relationship,...",1995-11-10 00:00:00,Ace Ventura Collection,90,212386000.0,30000000,Steve Oedekerk,"Jim Carrey, Ian McNeice, Simon Callow, Maynard...","O Entertainment, Warner Bros., Morgan Creek Pr...",United States of America,8.20545,6.1,1128,en,tt0112281,/wRlGnJhEzcxBjvWtvbjhDSU1cIY.jpg
297,3049,Ace Ventura: Pet Detective,"He's the best there is! (Actually, he's the on...",He's Ace Ventura: Pet Detective. Jim Carrey is...,"comedy, mystery","dolphin, mascot, private detective, pets",1994-02-04 00:00:00,Ace Ventura Collection,86,107217000.0,15000000,Tom Shadyac,"Jim Carrey, Sean Young, Courteney Cox, Tone Lo...","Warner Bros., Morgan Creek Productions",United States of America,11.248,6.4,1684,en,tt0109040,/nZirljb8XYbKTWsRQTplDGhx39Q.jpg
