In [1]:
import pandas as pd
import surprise
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Load the movie metadata table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is the remedy pandas suggests for the
# mixed-type DtypeWarning this cell otherwise prints.
meta = pd.read_csv('../../../data/movies_metadata.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Use the metadata `id` column as the row index.
# NOTE(review): a later cell re-indexes `meta` by `imdb_id`, which discards
# this index again.
meta = meta.set_index(keys='id')

In [4]:
# Read both tables with dtype=object so no column is converted to a numeric
# type (zero-padded IDs such as imdbId keep their leading zeros), then key
# each table by movieId.
movies = pd.read_csv('../../../data/movies.csv', dtype=object)
movies = movies.set_index('movieId')

links = pd.read_csv('../../../data/links.csv', dtype=object)
links = links.set_index('movieId')

In [5]:
# Attach the link columns to every movie row, matching on the shared movieId
# index (left join: every row of `movies` is kept).
movies2 = movies.join(links, how='left')

In [6]:
# Display the joined movies/links frame.
movies2

Unnamed: 0_level_0,title,genres,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0114709,862
2,Jumanji (1995),Adventure|Children|Fantasy,0113497,8844
3,Grumpier Old Men (1995),Comedy|Romance,0113228,15602
4,Waiting to Exhale (1995),Comedy|Drama|Romance,0114885,31357
5,Father of the Bride Part II (1995),Comedy,0113041,11862
...,...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030
193585,Flint (2017),Drama,6397426,479308
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455


In [7]:
# Keep only the last seven characters of each imdb_id so the values have the
# same seven-character form as the imdbId strings in `movies2`.
# NOTE(review): presumably this strips a leading 'tt' prefix — confirm against
# the raw file.
meta['imdb_id'] = meta['imdb_id'].str.slice(start=-7)

In [8]:
# Key the metadata by the trimmed imdb_id so it can be joined to `movies2`.
meta = meta.set_index(keys='imdb_id')

In [9]:
# Re-key the movies by imdbId (the join key used next) and discard the
# tmdbId column.
movies2 = movies2.set_index('imdbId')
movies2 = movies2.drop(columns='tmdbId')

In [10]:
# Display the frame after re-indexing by imdbId and dropping tmdbId.
movies2

Unnamed: 0_level_0,title,genres
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
0114709,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
0113497,Jumanji (1995),Adventure|Children|Fantasy
0113228,Grumpier Old Men (1995),Comedy|Romance
0114885,Waiting to Exhale (1995),Comedy|Drama|Romance
0113041,Father of the Bride Part II (1995),Comedy
...,...,...
5476944,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
5914996,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
6397426,Flint (2017),Drama
8391976,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [11]:
# Add the `overview` text from the metadata to each movie, matching the
# imdbId index of `movies2` against the imdb_id index of `meta`.
# NOTE(review): if an imdb_id occurs more than once in `meta`, this join
# yields one row per match — confirm the key is unique.
movies3 = movies2.join(meta['overview'], how='left')

In [12]:
# Remove every row that has a missing value in any column (e.g. movies with
# no matching overview).
movies3 = movies3.dropna(axis=0, how='any')

In [13]:
# Print row count, column dtypes and non-null counts to confirm that no
# missing values remain.
movies3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9510 entries, 0000417 to 7158814
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     9510 non-null   object
 1   genres    9510 non-null   object
 2   overview  9510 non-null   object
dtypes: object(3)
memory usage: 297.2+ KB


In [14]:
# Spot-check ten randomly chosen rows (unseeded, so the rows differ per run).
movies3.sample(n=10)

Unnamed: 0,title,genres,overview
115640,Beautiful Thing (1996),Drama|Romance,A pair of teenage boys growing up in a working...
783233,Atonement (2007),Drama|Romance|War,"As a 13-year-old, fledgling writer Briony Tall..."
47577,This Island Earth (1955),Sci-Fi,"Aliens have landed and are hiding on Earth, bu..."
2084989,Upstream Color (2013),Romance|Sci-Fi|Thriller,"A man and woman are drawn together, entangled ..."
2140203,Wolf Children (Okami kodomo no ame to yuki) (2...,Animation|Fantasy,"Hana, a nineteen-year-old college student, fal..."
81480,Seems Like Old Times (1980),Comedy|Romance,Writer Nick Gardenia is kidnapped from his Cal...
2582802,Whiplash (2014),Drama,"Under the direction of a ruthless instructor, ..."
296310,The Blue Planet (2001),Documentary,"The Blue Planet, the definitive exploration of..."
3553442,Whiskey Tango Foxtrot (2016),Comedy|War,"In 2002, cable news producer Kim Barker (Tina ..."
214388,100 Girls (2000),Comedy|Romance,"This sexy, teen-comedy is about a freshman, Ma..."


In [15]:
# Turn the pipe-separated genre list into space-separated words.
# regex=False treats '|' as a literal character rather than regex alternation.
movies3['genres'] = movies3['genres'].str.replace('|', ' ', regex=False)

In [26]:
# Combine genres and overview into one text field, separated by one space.
# NOTE(review): the TF-IDF cell fits on 'overview', not on this column —
# confirm which input was intended.
movies3['description'] = movies3['genres'].add(' ').add(movies3['overview'])

In [31]:
# Spot-check ten randomly chosen rows after the text columns were built
# (unseeded, so the rows differ per run).
movies3.sample(n=10)

Unnamed: 0,index,title,genres,overview,words
5259,239948,Saving Silverman (Evil Woman) (2001),Comedy Romance,A pair of buddies conspire to save their best ...,Comedy Romance A pair of buddies conspire to s...
8373,1637688,In Time (2011),Crime Sci-Fi Thriller,In the not-too-distant future the aging gene h...,Crime Sci-Fi Thriller In the not-too-distant f...
943,61722,"Graduate, The (1967)",Comedy Drama Romance,A recent college graduate finds himself in a l...,Comedy Drama Romance A recent college graduate...
7954,1291584,Warrior (2011),Drama,The inspirational story of an estranged family...,Drama The inspirational story of an estranged ...
7651,1094249,Hotel Chevalier (Part 1 of 'The Darjeeling Lim...,Drama,Grief? Depression? Ambiguity in a Paris hotel ...,Drama Grief? Depression? Ambiguity in a Paris ...
760,56732,"Exterminating Angel, The (Ángel exterminador, ...",Comedy Drama Fantasy Mystery,The guests at an upper-class dinner party find...,Comedy Drama Fantasy Mystery The guests at an ...
396,43014,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama Film-Noir Romance,A hack screenwriter writes a screenplay for a ...,Drama Film-Noir Romance A hack screenwriter wr...
5236,236027,Flickering Lights (Blinkende lygter) (2000),Action Comedy Crime,Four small gangsters from Copenhagen trick a g...,Action Comedy Crime Four small gangsters from ...
2398,94898,Coming to America (1988),Comedy Romance,"Prince Akeem, heir to the throne of Zamunda, l...","Comedy Romance Prince Akeem, heir to the thron..."
7191,780653,"Wolfman, The (2010)",Horror Thriller,"Lawrence Talbot, an American man on a visit to...","Horror Thriller Lawrence Talbot, an American m..."


In [17]:
# Build TF-IDF features from the overview text: word unigrams and bigrams,
# with English stop words removed.
# min_df=1 (keep every term that occurs in at least one document) replaces the
# original integer 0: it selects the same vocabulary, but integer 0 is
# rejected by parameter validation in recent scikit-learn releases.
# NOTE(review): an earlier cell builds a 'description' column (genres +
# overview) that is not used here — confirm 'overview' is the intended input.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies3['overview'])

In [18]:
# Pairwise dot products between all TF-IDF rows. TfidfVectorizer L2-normalises
# rows by default, so these dot products are cosine similarities.
# With Y omitted, linear_kernel compares the matrix against itself.
# NOTE(review): the result is a dense (n_movies x n_movies) array — watch
# memory if the catalogue grows.
cosines = linear_kernel(tfidf_matrix)

In [19]:
# Switch to a positional 0..n-1 index (the old index becomes a column) so row
# positions line up with the rows of the similarity matrix.
movies3 = movies3.reset_index(drop=False)

# Position -> title, and the reverse lookup title -> position.
titles = movies3.loc[:, 'title']
indices = pd.Series(data=movies3.index, index=titles)

In [20]:
def recommend(title, n=9):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the module-level ``cosines`` matrix; ``indices``
    maps a title to its row position and ``titles`` maps positions back to
    titles.

    Parameters
    ----------
    title : str
        Title to look up, exactly as stored in ``indices``.
    n : int, default 9
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``n`` titles ordered from most to least similar. The queried
        row itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    idx = indices[title]
    # A title shared by several rows makes the label lookup return a Series
    # instead of a scalar; use the first matching row in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = cosines[idx]
    # Remove the queried row by position instead of assuming it sorts first:
    # with tied scores (e.g. an all-zero row) it could otherwise be returned
    # as its own recommendation.
    candidates = [pos for pos in range(len(scores)) if pos != idx]
    # list.sort is stable, so rows with equal scores keep ascending row order.
    candidates.sort(key=lambda pos: scores[pos], reverse=True)
    return titles.iloc[candidates[:n]]

In [21]:
# Example query: nearest neighbours of one title by overview similarity.
recommend(title='Thor: The Dark World (2013)')

7223                                          Thor (2011)
9217                                Thor: Ragnarok (2017)
9490    While You Were Fighting: A Thor Mockumentary (...
3451                    NeverEnding Story III, The (1994)
1711                             Dark Crystal, The (1982)
8505                           Killing Them Softly (2012)
8154                                    Prometheus (2012)
5592    Babylon 5: The Legend of the Rangers: To Live ...
6995          Wal-Mart: The High Cost of Low Price (2005)
Name: title, dtype: object

In [92]:
# Scratch check of enumerate(): pairs each element with its position, the
# same pairing recommend() applies to a row of similarity scores.
[(pos, name) for pos, name in enumerate(['johm','jake','donna'])]

[(0, 'johm'), (1, 'jake'), (2, 'donna')]

In [99]:
# Scratch check: print each position next to the name repeated twice.
names = ['johm','jake','donna']
for num in range(len(names)):
    name = names[num]
    print(num, name + name)

0 johmjohm
1 jakejake
2 donnadonna


In [105]:
# Second example query, to eyeball recommendation quality.
recommend(title='Hotel Transylvania 2 (2015)')

2541                 Dead Poets Society (1989)
7288                     Dracula Untold (2014)
36                     Jazz Singer, The (1927)
8107    Best Exotic Marigold Hotel, The (2011)
2065                   Vampire Hunter D (1985)
4420          Million Dollar Hotel, The (2001)
4542            Fracchia contro Dracula (1985)
665               Grass Is Greener, The (1960)
4595                      Hideous Kinky (1998)
Name: title, dtype: object

In [115]:
# Persist the prepared frame for use outside the notebook.
# index=True (the default, spelled out here) also writes the row index as an
# unnamed first column.
# NOTE(review): reading the file back without index_col turns that column
# into 'Unnamed: 0'; consider index=False if no consumer relies on it.
movies3.to_csv('../../../src/movie_descriptions.csv', index=True)

In [116]:
# Read the exported file back to verify what was written.
pd.read_csv(filepath_or_buffer='../../../src/movie_descriptions.csv')

Unnamed: 0.1,Unnamed: 0,index,title,genres,overview
0,0,417,"Trip to the Moon, A (Voyage dans la lune, Le) ...",Action Adventure Fantasy Sci-Fi,A Trip to The Moon is a science fiction film f...
1,1,439,The Great Train Robbery (1903),Crime Western,The clerk at the train station is assaulted an...
2,2,516,The Electric Hotel (1908),Animation Comedy Sci-Fi,According to the rapid strides that electricit...
3,3,4972,"Birth of a Nation, The (1915)",Drama War,The Birth of A Nation is a silent film from 19...
4,4,6333,"20,000 Leagues Under the Sea (1916)",Action Adventure Sci-Fi,Captain Nemo has built a fantastic submarine f...
...,...,...,...,...,...
9505,9505,6840134,The Putin Interviews (2017),(no genres listed),"Academy Award-winning filmmaker, Oliver Stone ..."
9506,9506,6878486,"Norm Macdonald: Hitler's Dog, Gossip & Tricker...",Comedy,"In this new stand-up special, Norm Macdonald d..."
9507,9507,6987652,"Oh, Hello: On Broadway (2017)",Comedy,Two delusional geriatrics reveal curious pasts...
9508,9508,7044010,Rory Scovel Tries Stand-Up for the First Time ...,Comedy,Comedian Rory Scovel storms the stage in Atlan...
