In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV
from scipy.linalg import svd
from scipy.sparse import csr_matrix
from scipy import sparse
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

import  pickle
from pyparsing import col
import streamlit as st
import pandas as pd
import requests

In [2]:
df = pd.read_csv('../clean_df', index_col=False)
df2 = df[['userId', 'title', 'rating']]

# Collaborative Recommender

In [3]:
pivot = df2.pivot_table(columns='userId', index='title', values='rating')
pivot.fillna(0, inplace=True)

In [4]:
pivot.head()

userId,1,4,6,7,18,19,20,21,28,33,...,596,597,599,600,603,605,606,607,608,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,3.0,3.0,3.0,3.0,5.0,0.0,0.0,0.0,0.0
"10,000 BC (2008)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,4.0,3.0,0.0,0.0,0.0,0.0


In [5]:
sparse_pivot = sparse.csr_matrix(pivot)

In [6]:
dists = pairwise_distances(sparse_pivot, metric='cosine')
dists

array([[0.        , 0.8569129 , 0.84098855, ..., 0.73587299, 0.88660025,
        0.51482458],
       [0.8569129 , 0.        , 0.6430202 , ..., 0.87218576, 0.70160806,
        0.76970672],
       [0.84098855, 0.6430202 , 0.        , ..., 0.81896892, 0.77351025,
        0.78853631],
       ...,
       [0.73587299, 0.87218576, 0.81896892, ..., 0.        , 0.77873406,
        0.79837534],
       [0.88660025, 0.70160806, 0.77351025, ..., 0.77873406, 0.        ,
        0.87963326],
       [0.51482458, 0.76970672, 0.78853631, ..., 0.79837534, 0.87963326,
        0.        ]])

In [7]:
# Here, similarity is 1 - distance.
similarities = cosine_similarity(sparse_pivot)

In [8]:
# Verify they are the same
np.all(np.isclose((1.0 - dists), similarities))

True

In [9]:
#distances dataframe
recommender_df = pd.DataFrame(dists,columns=pivot.index, index=pivot.index)
recommender_df.to_csv('recs_users.csv')
df3 = pd.read_csv('./recs_users.csv')

df3.head(3)

Unnamed: 0,title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
0,"'burbs, The (1989)",0.0,0.856913,0.840989,0.951844,0.812308,0.658785,0.939942,0.955424,0.954467,...,0.946714,0.925346,0.817729,0.767496,0.815033,0.887793,0.940138,0.735873,0.8866,0.514825
1,(500) Days of Summer (2009),0.856913,0.0,0.64302,0.688759,0.75755,0.769994,0.685416,0.767192,0.676509,...,0.52828,0.70442,0.872249,0.408032,0.457255,0.609591,0.650016,0.872186,0.701608,0.769707
2,10 Things I Hate About You (1999),0.840989,0.64302,0.0,0.680921,0.609813,0.670429,0.854218,0.823703,0.880515,...,0.664404,0.804913,0.79883,0.802651,0.722396,0.603135,0.823931,0.818969,0.77351,0.788536


In [10]:
search = '27'
titles = pivot[pivot.index.str.contains(search)].index

for title in titles:
    print(title)
    print('Average rating', pivot.loc[title, :].mean())
    print('Number of ratings', pivot.T[title].count())
    print('')
    print(f'10 closest movies')
    print(recommender_df[title].sort_values()[0:11])
    print('')
    print('*******************************************************************************************')
    print('')

127 Hours (2010)
Average rating 0.3208092485549133
Number of ratings 173

10 closest movies
title
127 Hours (2010)                                           0.000000
Wolf of Wall Street, The (2013)                            0.426533
Shutter Island (2010)                                      0.440893
Up in the Air (2009)                                       0.461934
Cloud Atlas (2012)                                         0.489359
The Hateful Eight (2015)                                   0.503737
Gone Girl (2014)                                           0.505604
Birdman: Or (The Unexpected Virtue of Ignorance) (2014)    0.506862
Warm Bodies (2013)                                         0.510461
The Martian (2015)                                         0.520888
Boyhood (2014)                                             0.522100
Name: 127 Hours (2010), dtype: float64

*******************************************************************************************

27 Dresses (2008)
Ave

### Merging this recommender dataframe with a dataframe indicating genre with trues and falses

In [11]:
genre_df = pd.read_csv('../genre_boo.csv')

In [12]:
genre_df.drop(columns=['Unnamed: 0'], inplace=True)

In [13]:
#make separate dataframes with genres:
#children/animation
#comedy
#drama/romance
#crime/thriller/horror
#documentary
#adventure fantasy
#other

In [14]:
recommender_df

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.000000,0.856913,0.840989,0.951844,0.812308,0.658785,0.939942,0.955424,0.954467,0.899907,...,0.946714,0.925346,0.817729,0.767496,0.815033,0.887793,0.940138,0.735873,0.886600,0.514825
(500) Days of Summer (2009),0.856913,0.000000,0.643020,0.688759,0.757550,0.769994,0.685416,0.767192,0.676509,0.644439,...,0.528280,0.704420,0.872249,0.408032,0.457255,0.609591,0.650016,0.872186,0.701608,0.769707
10 Things I Hate About You (1999),0.840989,0.643020,0.000000,0.680921,0.609813,0.670429,0.854218,0.823703,0.880515,0.531316,...,0.664404,0.804913,0.798830,0.802651,0.722396,0.603135,0.823931,0.818969,0.773510,0.788536
"10,000 BC (2008)",0.951844,0.688759,0.680921,0.000000,0.676044,0.799824,0.850913,1.000000,0.854910,0.759894,...,0.680211,0.857447,0.867946,0.739100,0.652380,0.679541,0.808534,0.861821,0.723914,0.859436
101 Dalmatians (1996),0.812308,0.757550,0.609813,0.676044,0.000000,0.574433,0.721360,0.840461,0.894046,0.715929,...,0.797808,0.832233,0.830249,0.824878,0.740296,0.671806,0.810796,0.870744,0.787361,0.819535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoolander (2001),0.887793,0.609591,0.603135,0.679541,0.671806,0.608249,0.761440,0.958609,0.841451,0.692855,...,0.644809,0.670006,0.960177,0.686378,0.586440,0.000000,0.769552,0.839296,0.581734,0.815523
Zootopia (2016),0.940138,0.650016,0.823931,0.808534,0.810796,0.852161,0.847935,0.854889,0.613946,0.839625,...,0.575367,0.714253,0.938919,0.630024,0.559787,0.769552,0.000000,0.876326,0.661284,0.904690
eXistenZ (1999),0.735873,0.872186,0.818969,0.861821,0.870744,0.841459,0.926472,0.961628,0.811861,0.948302,...,0.872437,0.931452,0.796025,0.789147,0.858654,0.839296,0.876326,0.000000,0.778734,0.798375
xXx (2002),0.886600,0.701608,0.773510,0.723914,0.787361,0.647848,0.873972,0.919824,0.779901,0.797463,...,0.678240,0.730049,0.884291,0.715182,0.703059,0.581734,0.661284,0.778734,0.000000,0.879633


In [16]:
genre_df.set_index('title', inplace=True)

In [24]:
genre_df.columns

Index(['Musical', 'Crime', 'Action', 'Comedy', 'Thriller', 'Film-Noir',
       'Documentary', 'War', 'Animation', 'Romance', 'Horror', 'IMAX',
       'Mystery', 'Western', 'Sci-Fi', 'Children', 'Drama', 'Adventure',
       'Fantasy'],
      dtype='object')

In [22]:
rec_genre = recommender_df.join(genre_df)

In [23]:
rec_genre

Unnamed: 0_level_0,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),...,Romance,Horror,IMAX,Mystery,Western,Sci-Fi,Children,Drama,Adventure,Fantasy
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.000000,0.856913,0.840989,0.951844,0.812308,0.658785,0.939942,0.955424,0.954467,0.899907,...,False,False,False,False,False,False,False,False,False,False
"'burbs, The (1989)",0.000000,0.856913,0.840989,0.951844,0.812308,0.658785,0.939942,0.955424,0.954467,0.899907,...,False,False,False,False,False,False,False,False,False,False
"'burbs, The (1989)",0.000000,0.856913,0.840989,0.951844,0.812308,0.658785,0.939942,0.955424,0.954467,0.899907,...,False,False,False,False,False,False,False,False,False,False
"'burbs, The (1989)",0.000000,0.856913,0.840989,0.951844,0.812308,0.658785,0.939942,0.955424,0.954467,0.899907,...,False,False,False,False,False,False,False,False,False,False
"'burbs, The (1989)",0.000000,0.856913,0.840989,0.951844,0.812308,0.658785,0.939942,0.955424,0.954467,0.899907,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
¡Three Amigos! (1986),0.514825,0.769707,0.788536,0.859436,0.819535,0.777457,0.841294,0.955643,0.954690,0.930278,...,False,False,False,False,True,False,False,False,False,False
¡Three Amigos! (1986),0.514825,0.769707,0.788536,0.859436,0.819535,0.777457,0.841294,0.955643,0.954690,0.930278,...,False,False,False,False,True,False,False,False,False,False
¡Three Amigos! (1986),0.514825,0.769707,0.788536,0.859436,0.819535,0.777457,0.841294,0.955643,0.954690,0.930278,...,False,False,False,False,True,False,False,False,False,False
¡Three Amigos! (1986),0.514825,0.769707,0.788536,0.859436,0.819535,0.777457,0.841294,0.955643,0.954690,0.930278,...,False,False,False,False,True,False,False,False,False,False


In [None]:
#create lists for each genre
#filter for rows and columns with that genre

In [None]:
#animation = rec_genre[rec_genre['Animation']==True].iloc[:,0:-19]

### Create lists for each genre

In [67]:
animation = rec_genre[rec_genre['Animation']==True].index.to_list()

In [69]:
animation_list = []
[animation_list.append(movie) for movie in animation if movie not in animation_list]
animation_list

children = rec_genre[rec_genre['Children']==True].index.to_list()
children_list = []
[children_list.append(movie) for movie in children if movie not in children_list]
children_list

['101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 '9 (2009)',
 'Adventures of Tintin, The (2011)',
 'Akira (1988)',
 'Aladdin (1992)',
 'Aladdin and the King of Thieves (1996)',
 'Alice in Wonderland (1951)',
 'All Dogs Go to Heaven (1989)',
 'American Tail, An (1986)',
 'Anastasia (1997)',
 'Animatrix, The (2003)',
 'Antz (1998)',
 'Aristocats, The (1970)',
 'Atlantis: The Lost Empire (2001)',
 'Bambi (1942)',
 'Batman: Mask of the Phantasm (1993)',
 'Beauty and the Beast (1991)',
 'Beavis and Butt-Head Do America (1996)',
 'Big Hero 6 (2014)',
 'Bolt (2008)',
 'Brave (2012)',
 "Bug's Life, A (1998)",
 'Cars (2006)',
 "Charlotte's Web (1973)",
 'Chicken Run (2000)',
 'Cinderella (1950)',
 'Cloudy with a Chance of Meatballs (2009)',
 'Coraline (2009)',
 'Corpse Bride (2005)',
 'Cowboy Bebop: The Movie (Cowboy Bebop: Tengoku no Tobira) (2001)',
 'Despicable Me (2010)',
 'Despicable Me 2 (2013)',
 'Dumbo (1941)',
 "Emperor's New Groove, The (2000)",
 'Enchanted (2007)',
 'Famil

In [75]:
comedy = rec_genre[rec_genre['Comedy']==True].index.to_list()
comedy_list = []
[comedy_list.append(movie) for movie in comedy if movie not in comedy_list]
comedy_list

["'burbs, The (1989)",
 '(500) Days of Summer (2009)',
 '10 Things I Hate About You (1999)',
 '101 Dalmatians (1996)',
 '13 Going on 30 (2004)',
 '21 Jump Street (2012)',
 '22 Jump Street (2014)',
 '27 Dresses (2008)',
 '40 Days and 40 Nights (2002)',
 '40-Year-Old Virgin, The (2005)',
 '50 First Dates (2004)',
 '50/50 (2011)',
 'A Million Ways to Die in the West (2014)',
 'A-Team, The (2010)',
 'About Schmidt (2002)',
 'About a Boy (2002)',
 'Accepted (2006)',
 'Ace Ventura: Pet Detective (1994)',
 'Ace Ventura: When Nature Calls (1995)',
 'Adaptation (2002)',
 'Addams Family Values (1993)',
 'Addams Family, The (1991)',
 'Adventureland (2009)',
 'Adventures in Babysitting (1987)',
 'Adventures of Baron Munchausen, The (1988)',
 'Adventures of Buckaroo Banzai Across the 8th Dimension, The (1984)',
 'Adventures of Priscilla, Queen of the Desert, The (1994)',
 'African Queen, The (1951)',
 'Airheads (1994)',
 'Airplane II: The Sequel (1982)',
 'Airplane! (1980)',
 'Aladdin (1992)',
 'Al

In [76]:
drama = rec_genre[rec_genre['Drama']==True].index.to_list()
drama_list = []
[drama_list.append(movie) for movie in drama if movie not in drama_list]
drama_list

romance = rec_genre[rec_genre['Romance']==True].index.to_list()
romance_list = []
[romance_list.append(movie) for movie in romance if movie not in romance_list]
romance_list

['(500) Days of Summer (2009)',
 '12 Angry Men (1957)',
 '12 Years a Slave (2013)',
 '127 Hours (2010)',
 '1408 (2007)',
 '1984 (Nineteen Eighty-Four) (1984)',
 '20,000 Leagues Under the Sea (1954)',
 '2001: A Space Odyssey (1968)',
 '2012 (2009)',
 '21 (2008)',
 '21 Grams (2003)',
 '25th Hour (2002)',
 '28 Days (2000)',
 '3:10 to Yuma (2007)',
 '400 Blows, The (Les quatre cents coups) (1959)',
 '50/50 (2011)',
 '8 1/2 (8½) (1963)',
 '8 Mile (2002)',
 '8MM (1999)',
 'A.I. Artificial Intelligence (2001)',
 'About Schmidt (2002)',
 'About Time (2013)',
 'About a Boy (2002)',
 'Adaptation (2002)',
 'Adventureland (2009)',
 'Adventures of Priscilla, Queen of the Desert, The (1994)',
 'Age of Innocence, The (1993)',
 'All About Eve (1950)',
 'All About My Mother (Todo sobre mi madre) (1999)',
 'All Dogs Go to Heaven (1989)',
 "All the President's Men (1976)",
 'Almost Famous (2000)',
 'Amadeus (1984)',
 'American Beauty (1999)',
 'American Gangster (2007)',
 'American Graffiti (1973)',
 'Am

In [78]:
#crime/thriller/horror
crime = rec_genre[rec_genre['Crime']==True].index.to_list()
crime_list = []
[crime_list.append(movie) for movie in crime if movie not in crime_list]
crime_list

thriller = rec_genre[rec_genre['Thriller']==True].index.to_list()
thriller_list = []
[thriller_list.append(movie) for movie in thriller if movie not in thriller_list]
thriller_list

horror = rec_genre[rec_genre['Horror']==True].index.to_list()
horror_list = []
[horror_list.append(movie) for movie in horror if movie not in horror_list]
horror_list

['1408 (2007)',
 '28 Days Later (2002)',
 '28 Weeks Later (2007)',
 'AVP: Alien vs. Predator (2004)',
 'Alien (1979)',
 'Alien: Resurrection (1997)',
 'Aliens (1986)',
 'Alien³ (a.k.a. Alien 3) (1992)',
 'American Psycho (2000)',
 'American Werewolf in London, An (1981)',
 'Amityville Horror, The (1979)',
 'Arachnophobia (1990)',
 'Army of Darkness (1993)',
 'Battle Royale (Batoru rowaiaru) (2000)',
 'Birds, The (1963)',
 'Blade (1998)',
 'Blade II (2002)',
 'Blade: Trinity (2004)',
 'Blair Witch Project, The (1999)',
 'Brothers Grimm, The (2005)',
 'Bubba Ho-tep (2002)',
 'Buffy the Vampire Slayer (1992)',
 'Cabin in the Woods, The (2012)',
 'Carrie (1976)',
 'Cell, The (2000)',
 "Child's Play (1988)",
 'Constantine (2005)',
 'Copycat (1995)',
 'Craft, The (1996)',
 'Cube (1997)',
 'Dawn of the Dead (1978)',
 'Dawn of the Dead (2004)',
 'Death Proof (2007)',
 'Deep Blue Sea (1999)',
 'Descent, The (2005)',
 "Dracula (Bram Stoker's Dracula) (1992)",
 'End of Days (1999)',
 'Event Horiz

In [79]:
#documentary
documentary = rec_genre[rec_genre['Documentary']==True].index.to_list()
documentary_list = []
[documentary_list.append(movie) for movie in documentary if movie not in documentary_list]
documentary_list

['Bowling for Columbine (2002)',
 'Crumb (1994)',
 'Fahrenheit 9/11 (2004)',
 'Hoop Dreams (1994)',
 'Jackass: The Movie (2002)',
 "March of the Penguins (Marche de l'empereur, La) (2005)",
 'Roger & Me (1989)',
 'Sicko (2007)',
 'Spellbound (2002)',
 'Super Size Me (2004)']

In [80]:
#adventure fantasy
adventure = rec_genre[rec_genre['Adventure']==True].index.to_list()
adventure_list = []
[adventure_list.append(movie) for movie in adventure if movie not in adventure_list]
adventure_list

fantasy = rec_genre[rec_genre['Fantasy']==True].index.to_list()
fantasy_list = []
[fantasy_list.append(movie) for movie in fantasy if movie not in fantasy_list]
fantasy_list

['13 Going on 30 (2004)',
 '13th Warrior, The (1999)',
 '300 (2007)',
 '8 1/2 (8½) (1963)',
 'About Time (2013)',
 'Addams Family Values (1993)',
 'Addams Family, The (1991)',
 'Adventures of Baron Munchausen, The (1988)',
 'Aladdin and the King of Thieves (1996)',
 'Alice in Wonderland (1951)',
 'Alice in Wonderland (2010)',
 'All Dogs Go to Heaven (1989)',
 'Antz (1998)',
 'Army of Darkness (1993)',
 'Atlantis: The Lost Empire (2001)',
 'Batman & Robin (1997)',
 'Batman v Superman: Dawn of Justice (2016)',
 'Beauty and the Beast (1991)',
 'Beetlejuice (1988)',
 'Being John Malkovich (1999)',
 'Bewitched (2005)',
 'Big (1988)',
 'Big Fish (2003)',
 'Big Trouble in Little China (1986)',
 "Bill & Ted's Bogus Journey (1991)",
 'Blade: Trinity (2004)',
 'Brazil (1985)',
 'Brothers Grimm, The (2005)',
 'Bruce Almighty (2003)',
 'Carrie (1976)',
 'Charlie and the Chocolate Factory (2005)',
 'Chronicles of Narnia: Prince Caspian, The (2008)',
 'Chronicles of Narnia: The Lion, the Witch and t

In [82]:
#any uses 'recommender_df'

In [43]:
rec_children = rec_genre[rec_genre['Children']==True]
rec_animation = rec_genre['Animation']==True]

In [45]:
rec_children

Unnamed: 0_level_0,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),...,Romance,Horror,IMAX,Mystery,Western,Sci-Fi,Children,Drama,Adventure,Fantasy
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),0.812308,0.757550,0.609813,0.676044,0.000000,0.574433,0.721360,0.840461,0.894046,0.715929,...,False,False,False,False,False,False,True,False,True,False
101 Dalmatians (1996),0.812308,0.757550,0.609813,0.676044,0.000000,0.574433,0.721360,0.840461,0.894046,0.715929,...,False,False,False,False,False,False,True,False,True,False
101 Dalmatians (1996),0.812308,0.757550,0.609813,0.676044,0.000000,0.574433,0.721360,0.840461,0.894046,0.715929,...,False,False,False,False,False,False,True,False,True,False
101 Dalmatians (1996),0.812308,0.757550,0.609813,0.676044,0.000000,0.574433,0.721360,0.840461,0.894046,0.715929,...,False,False,False,False,False,False,True,False,True,False
101 Dalmatians (1996),0.812308,0.757550,0.609813,0.676044,0.000000,0.574433,0.721360,0.840461,0.894046,0.715929,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zootopia (2016),0.940138,0.650016,0.823931,0.808534,0.810796,0.852161,0.847935,0.854889,0.613946,0.839625,...,False,False,False,False,False,False,True,False,True,False
Zootopia (2016),0.940138,0.650016,0.823931,0.808534,0.810796,0.852161,0.847935,0.854889,0.613946,0.839625,...,False,False,False,False,False,False,True,False,True,False
Zootopia (2016),0.940138,0.650016,0.823931,0.808534,0.810796,0.852161,0.847935,0.854889,0.613946,0.839625,...,False,False,False,False,False,False,True,False,True,False
Zootopia (2016),0.940138,0.650016,0.823931,0.808534,0.810796,0.852161,0.847935,0.854889,0.613946,0.839625,...,False,False,False,False,False,False,True,False,True,False


# Content Recommender on movie description

In [11]:
credits = pd.read_csv('../Raw Data/credits.csv')
metadata = pd.read_csv('../Raw Data/movies_metadata.csv')
keywords = pd.read_csv('../Raw Data/keywords.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
#resource in using tf-idf with cosine_similarity vs linear_kernel

#instantiate tfidf and remove stop words
tfidf = TfidfVectorizer(stop_words='english')

#Replace nans with empty strings
metadata['overview'] = metadata['overview'].fillna('')

#fit/transform
matrix = tfidf.fit_transform(metadata['overview'])
matrix.shape

(45466, 75827)

In [13]:
#cosine similarity matrix
similarity_matrix = linear_kernel(matrix, matrix)

In [14]:
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

In [15]:
def get_recommendations(title, similarity=similarity_matrix):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # get the pairwise similary scores of all movies with that movie
    similarity_scores = list(enumerate(similarity_matrix[idx]))

    # sort the movies based on the similary scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # get the scores of the 10 most similar movies
    similarity_scores = similarity_scores[1:21]

    # get the movie indices
    movie_indices = [i[0] for i in similarity_scores]

    # return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

### Dataframe with Toy Story recommendations

In [16]:
toy_story_rec = pd.DataFrame(get_recommendations('Toy Story')).rename(columns={'title':'Title'})

In [17]:
toy_story_rec.to_csv('Toy_Story.csv')

In [18]:
toy_movies = [movie for movie in get_recommendations('Toy Story')]
toy_df = metadata[metadata['title'].isin(toy_movies)]

In [19]:
toy_df = toy_df[['title', 'overview']]
toy_df.to_csv('toy_df.csv')

# Content Recommender on genre, director, and keywords

In [20]:
#removing rows with wrong id structure
print(metadata[metadata['id'].str.contains('-')].index)
for row in metadata[metadata['id'].str.contains('-')].index:
    metadata.drop([row], inplace=True)

Int64Index([19730, 29503, 35587], dtype='int64')


In [21]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

In [22]:
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [23]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [24]:
#define columns which hold features
features = ['cast', 'crew', 'keywords', 'genres']

#apply literal_eval
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

### Director

Director names are in the crew column, which has multiple dictionaries. Each dictionary lists department, gender, job, and name.

In [25]:
#example of crew dictionary
metadata['crew'][0][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [26]:
def director(crew):
    for person in crew:
        if person['job'] == 'Director':
            return person['name']

In [27]:
metadata['director'] = metadata['crew'].apply(director)

### Genre

Each movie ranges from 0 to 8 genres. They are not in alphabetical order, so safe to assume they're ordered by importance. Seeing as the majority of movies list 2 genres, I'll take the first 2 genres from each list.

In [28]:
metadata['genres_count'] = 0

for row in list(range(0, metadata.shape[0])):
        metadata['genres_count'][row] = len(metadata['genres'][row])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['genres_count'][row] = len(metadata['genres'][row])


KeyboardInterrupt: 

In [None]:
metadata['genres_count'].describe()

In [None]:
#define function grabbing first two names in column row

def top_two(column):
    names = [row['name'] for row in column]
    if len(names) >= 3:
        names = names[:2]
    return names

In [None]:
#apply function
metadata['genres_top'] = metadata['genres'].apply(top_two)

### Keywords

Similar to genre, the amount of keywords range. Here, we see that the minimum amount is 0 and the largest is 149, with 2 in the 50th percentile and 5 in the 75th. Instead of taking the top 2, like we did with genre, I'm going to take the top 3 keywords, considering the higher prevelance of 2+ words. 

In [None]:
metadata['keywords_count'] = 0

for row in list(range(0, metadata.shape[0])):
        metadata['keywords_count'][row] = len(metadata['keywords'][row])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['keywords_count'][row] = len(metadata['keywords'][row])


In [None]:
metadata['keywords_count'].describe()

count    46628.000000
mean         3.419426
std          4.671347
min          0.000000
25%          0.000000
50%          2.000000
75%          5.000000
max        149.000000
Name: keywords_count, dtype: float64

In [None]:
#define function grabbing first three names in column row

def top_three(column):
    names = [row['name'] for row in column]
    if len(names) >= 4:
        names = names[:3]
    return names

In [None]:
#apply function
metadata['keywords_top'] = metadata['keywords'].apply(top_three)

### Cleaning new features
Now let's clean our new features: ['keywords_top', 'genres_top', 'director']

'keywords_top', 'genres_top' are lists
'director' is a string

In [None]:
for column in ['keywords_top', 'genres_top', 'director']:
    print(metadata[column][0])

['jealousy', 'toy', 'boy']
['Animation', 'Comedy']
John Lasseter


In [None]:
# Function to convert all strings to lower case and strip names of spaces
def clean_list(column):
    if isinstance(column,list):
        return [str.lower(row.replace(" ", "")) for row in column]
    else:
        return ''
        
def clean_string(column):
    if isinstance(column,str):
        return str.lower(column.replace(" ", ""))
    else:
        return ''

In [None]:
metadata['director'] = metadata['director'].apply(clean_string)

for column in ['keywords_top', 'genres_top']:
    metadata[column] = metadata[column].apply(clean_list)

### Join new features, create matrix, calculate similarity, use recommendation function

In [None]:
metadata['director']

0           johnlasseter
1            joejohnston
2           howarddeutch
3         forestwhitaker
4           charlesshyer
              ...       
46623    hamidnematollah
46624            lavdiaz
46625       markl.lester
46626    yakovprotazanov
46627       daisyasquith
Name: director, Length: 46628, dtype: object

In [None]:
def create_soup(x):
    return ' '.join(x['keywords_top']) + ' ' + x['director'] + ' ' + ' '.join(x['genres_top'])

In [None]:
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [None]:
metadata[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy johnlasseter animation comedy
1,boardgame disappearance basedonchildren'sbook ...


In [None]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [None]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

3777                 A Couch in New York
40653                              Wacko
38251                             Agyaat
1304                    April Fool's Day
16844                 A Hole in the Soul
43127                     Hunting Season
16510                   Morsian yllättää
19970          H.P. Lovecraft's The Tomb
10230    Me and You and Everyone We Know
16042                    The Last Letter
4578                           Skin Deep
10403    Asterix & Obelix Take on Caesar
36133                            Blinker
28287              Beethoven's Big Break
25129      The Wild World of Lydia Lunch
5333                          Scooby-Doo
14025               (500) Days of Summer
9339            Kabhi Khushi Kabhie Gham
16905                           Harakiri
107         Headless Body in Topless Bar
Name: title, dtype: object

# Set up 3rd model for pickeling

In [None]:
import  pickle
pickle.dump(new_df,open('model.pkl','wb'))