# Building Movie Recommender System with Similarity Function

In [1]:
# Import library
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data Unloading and Checking

In [2]:
# Read datasets
rating_df = pd.read_csv('https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/movie_rating_df.csv')
name_df = pd.read_csv('https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/actor_name.csv')
director_df = pd.read_csv('https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/directors_writers.csv')

In [3]:
# Show MOVIE RATING dataset
rating_df.sample(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
587210,tt4118906,tvMovie,Special Drama: Maison Ikkoku,Supesharu dorama: Mezon Ikkoku,0,2007.0,,107.0,"Comedy,Drama,Romance",4.9,7
734542,tt8852046,movie,The Meeting,The Meeting,0,2018.0,,96.0,,7.2,9
6918,tt0024205,movie,Kazdemu wolno kochac,Kazdemu wolno kochac,0,1933.0,,80.0,"Comedy,Romance",6.6,53
334771,tt10064354,movie,Inland,Inland,0,2019.0,,95.0,,7.9,18
502040,tt2301241,short,The Winners' Circle,The Winners' Circle,0,2010.0,,14.0,"Documentary,Short",7.9,16


In [4]:
# Show columns type of  MOVIE RATING dataset
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751614 entries, 0 to 751613
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          751614 non-null  object 
 1   titleType       751614 non-null  object 
 2   primaryTitle    751614 non-null  object 
 3   originalTitle   751614 non-null  object 
 4   isAdult         751614 non-null  int64  
 5   startYear       751614 non-null  float64
 6   endYear         16072 non-null   float64
 7   runtimeMinutes  751614 non-null  float64
 8   genres          486766 non-null  object 
 9   averageRating   751614 non-null  float64
 10  numVotes        751614 non-null  int64  
dtypes: float64(4), int64(2), object(5)
memory usage: 63.1+ MB


In [5]:
# Show ACTOR NAME dataset
name_df.sample(5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
300,nm4287983,Brianna Kondrat,\N,\N,"miscellaneous,art_department,production_manager","tt1835129,tt1299368,tt1693679,tt1918727"
396,nm7232369,Teresa Bonner,\N,\N,producer,tt4569160
515,nm2884073,Ngan-Ying Poon,\N,\N,"actress,costume_department,costume_designer","tt0311792,tt0104553,tt0096078,tt0237501"
244,nm9480005,Samuel Gagnon-Thibodeau,\N,\N,,tt7735196
297,nm1901214,Naoki Katakai,\N,\N,"art_department,animation_department,art_director","tt0268256,tt0397042,tt0884870,tt0232531"


In [6]:
# Show columns type of  ACTOR NAME dataset
name_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   nconst             1000 non-null   object
 1   primaryName        1000 non-null   object
 2   birthYear          1000 non-null   object
 3   deathYear          1000 non-null   object
 4   primaryProfession  891 non-null    object
 5   knownForTitles     1000 non-null   object
dtypes: object(6)
memory usage: 47.0+ KB


In [7]:
# Show DIRECTOR-WRITER dataset
director_df.sample(5)

Unnamed: 0,tconst,director_name,writer_name
37,tt0060910,Otakar Vávra,"Frantisek Hrubín,Otakar Vávra"
685,tt1783244,Jerzy Hoffman,"Jerzy Hoffman,Jaroslaw Sokól"
929,tt6450804,Tim Miller,"Charles H. Eglee,James Cameron,Josh Friedman,D..."
601,tt1339050,Jordan Clark,"Jordan Clark,Janice Santos Valdez"
779,tt2515134,Thomas K. Phillips,"Thomas K. Phillips,Tim O'Connell"


In [8]:
# Show columns type of  DIRECTOR-WRITER dataset
director_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 986 entries, 0 to 985
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tconst         986 non-null    object
 1   director_name  986 non-null    object
 2   writer_name    986 non-null    object
dtypes: object(3)
memory usage: 23.2+ KB


## Data Preparation

#### Choosing relevants features
We don't need all the columns of Actor Name dataset to be processed. So, we select columns that relevant to the cases. Those columns are:
- `nconst`
- `primaryName`
- `knownForTitles`

In [9]:
name_df = name_df[['nconst','primaryName','knownForTitles']]
name_df.head()

Unnamed: 0,nconst,primaryName,knownForTitles
0,nm1774132,Nathan McLaughlin,"tt0417686,tt1713976,tt1891860,tt0454839"
1,nm10683464,Bridge Andrew,tt7718088
2,nm1021485,Brandon Fransvaag,tt0168790
3,nm6940929,Erwin van der Lely,tt4232168
4,nm5764974,Svetlana Shypitsyna,tt3014168


#### Convert data into List
This process aims to simplify directors and writers accessing/subsetting since we have multiple directors and writers in one column

In [10]:
# Convert director_name to list
director_df['director_name'] = director_df['director_name'].apply(lambda row: row.split(','))
director_df['writer_name'] = director_df['writer_name'].apply(lambda row: row.split(','))

# Show top 5 data
director_df.head()

Unnamed: 0,tconst,director_name,writer_name
0,tt0011414,[David Kirkland],"[John Emerson, Anita Loos]"
1,tt0011890,[Roy William Neill],"[Arthur F. Goodrich, Burns Mantle, Mary Murillo]"
2,tt0014341,"[Buster Keaton, John G. Blystone]","[Jean C. Havez, Clyde Bruckman, Joseph A. Mitc..."
3,tt0018054,[Cecil B. DeMille],[Jeanie Macpherson]
4,tt0024151,[James Cruze],"[Max Miller, Wells Root, Jack Jevne]"


#### One to One Correspondence

Of course an actor can play in more than a film. Then, a table of 1-1 relation to each of the movies' title is necessary. We have to unnest the table.

In [11]:
# Variation checking
name_df['knownForTitles'].apply(lambda x: len(x.split(','))).unique()

# Covert knownForTitles menjadi list of list
name_df['knownForTitles'] = name_df['knownForTitles'].apply(lambda x: x.split(','))

# Show top 5 data
name_df.head()

Unnamed: 0,nconst,primaryName,knownForTitles
0,nm1774132,Nathan McLaughlin,"[tt0417686, tt1713976, tt1891860, tt0454839]"
1,nm10683464,Bridge Andrew,[tt7718088]
2,nm1021485,Brandon Fransvaag,[tt0168790]
3,nm6940929,Erwin van der Lely,[tt4232168]
4,nm5764974,Svetlana Shypitsyna,[tt3014168]


In [12]:
# prepare the bucket for dataframe
df_uni = []

for x in ['knownForTitles']:
    # repeat index so we have name of index as many as the knownForTitles list length in a row
    idx = name_df.index.repeat(name_df['knownForTitles'].str.len())

    # split every list values in a row and concat to the another rows as dataframe
    df1 = pd.DataFrame({
        x: np.concatenate(name_df[x].values)
    })

    # change the index to idx
    df1.index = idx

    # for every dataframe formed, add to the bucket
    df_uni.append(df1)

In [13]:
# concatting data
df_concat = pd.concat(df_uni, axis=1)

# join with the value of original dataframe
unnested_df = df_concat.join(name_df.drop(['knownForTitles'], 1), how='left')

# make the column order same as original dataframe
unnested_df = unnested_df[name_df.columns.tolist()]
unnested_df

Unnamed: 0,nconst,primaryName,knownForTitles
0,nm1774132,Nathan McLaughlin,tt0417686
0,nm1774132,Nathan McLaughlin,tt1713976
0,nm1774132,Nathan McLaughlin,tt1891860
0,nm1774132,Nathan McLaughlin,tt0454839
1,nm10683464,Bridge Andrew,tt7718088
...,...,...,...
998,nm5245804,Eliza Jenkins,tt1464058
999,nm0948460,Greg Yolen,tt0436869
999,nm0948460,Greg Yolen,tt0476663
999,nm0948460,Greg Yolen,tt0109723


#### Group primaryName as list group by knownForTitles

Next, we will group the player column because what we need to do movie recommendation is the movie level 

In [14]:
unnested_drop = unnested_df.drop(['nconst'], axis=1)

df_uni2 = []

for col in ['primaryName']:
    # PrimaryName feature aggregation
    dfi = unnested_drop.groupby(['knownForTitles'])[col].apply(list)
    df_uni2.append(dfi)
    
df_grouped = pd.concat(df_uni2, axis=1).reset_index()
df_grouped.columns = ['knownForTitles','cast_name']
df_grouped

Unnamed: 0,knownForTitles,cast_name
0,tt0008125,[Charles Harley]
1,tt0009706,[Charles Harley]
2,tt0010304,[Natalie Talmadge]
3,tt0011414,[Natalie Talmadge]
4,tt0011890,[Natalie Talmadge]
...,...,...
1893,tt9610496,[Stefano Baffetti]
1894,tt9714030,[Kevin Kain]
1895,tt9741820,[Caroline Plyler]
1896,tt9759814,[Ethan Francis]


#### Join table

In [149]:
# join df_grouped with rating_df
base_df = pd.merge(df_grouped, rating_df, left_on='knownForTitles', right_on='tconst', how='inner')

# join base_df with director_df
base_df = pd.merge(base_df, director_df, left_on='tconst', right_on='tconst', how='left')

base_df.head()

Unnamed: 0,knownForTitles,cast_name,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,director_name,writer_name
0,tt0011414,[Natalie Talmadge],tt0011414,movie,The Love Expert,The Love Expert,0,1920.0,,60.0,"Comedy,Romance",4.9,136,[David Kirkland],"[John Emerson, Anita Loos]"
1,tt0011890,[Natalie Talmadge],tt0011890,movie,Yes or No,Yes or No,0,1920.0,,72.0,,6.3,7,[Roy William Neill],"[Arthur F. Goodrich, Burns Mantle, Mary Murillo]"
2,tt0014341,[Natalie Talmadge],tt0014341,movie,Our Hospitality,Our Hospitality,0,1923.0,,65.0,"Comedy,Romance,Thriller",7.8,9621,"[Buster Keaton, John G. Blystone]","[Jean C. Havez, Clyde Bruckman, Joseph A. Mitc..."
3,tt0018054,[Reeka Roberts],tt0018054,movie,The King of Kings,The King of Kings,0,1927.0,,155.0,"Biography,Drama,History",7.3,1826,[Cecil B. DeMille],[Jeanie Macpherson]
4,tt0024151,[James Hackett],tt0024151,movie,I Cover the Waterfront,I Cover the Waterfront,0,1933.0,,80.0,"Drama,Romance",6.3,455,[James Cruze],"[Max Miller, Wells Root, Jack Jevne]"


#### Cleaning data

In this step, we will handle null values and drop unrequired features

In [16]:
# check null values
base_df.isnull().sum()

knownForTitles      0
cast_name           0
tconst              0
titleType           0
primaryTitle        0
originalTitle       0
isAdult             0
startYear           0
endYear           950
runtimeMinutes      0
genres            315
averageRating       0
numVotes            0
director_name      74
writer_name        74
dtype: int64

Knowing some features have null values, we have to handle them by filling `Unknown` value

In [17]:
base_fill = base_df

In [18]:
# Fill NULL value with 'Unknown'
base_fill['genres'] = base_fill['genres'].fillna('Unknown')
base_fill[['director_name','writer_name']] = base_fill[['director_name','writer_name']].fillna('unknown')

In [19]:
# Genres feature has nultiple values, so it's better to make it list of list
base_fill['genres'] = base_fill['genres'].apply(lambda x: x.split(','))

In [20]:
# Drop knownForTitles, tconst, isAdult, endYear, originalTitle
base_drop = base_fill.drop(['knownForTitles','tconst','isAdult','endYear','originalTitle'], axis=1)

base_drop = base_drop[['primaryTitle','titleType','startYear','runtimeMinutes','genres','averageRating','numVotes','cast_name','director_name','writer_name']]

base_drop.columns = ['title','type','start','duration','genres','rating','votes','cast_name','director_name','writer_name']
base_drop.head()

Unnamed: 0,title,type,start,duration,genres,rating,votes,cast_name,director_name,writer_name
0,The Love Expert,movie,1920.0,60.0,"[Comedy, Romance]",4.9,136,[Natalie Talmadge],[David Kirkland],"[John Emerson, Anita Loos]"
1,Yes or No,movie,1920.0,72.0,[Unknown],6.3,7,[Natalie Talmadge],[Roy William Neill],"[Arthur F. Goodrich, Burns Mantle, Mary Murillo]"
2,Our Hospitality,movie,1923.0,65.0,"[Comedy, Romance, Thriller]",7.8,9621,[Natalie Talmadge],"[Buster Keaton, John G. Blystone]","[Jean C. Havez, Clyde Bruckman, Joseph A. Mitc..."
3,The King of Kings,movie,1927.0,155.0,"[Biography, Drama, History]",7.3,1826,[Reeka Roberts],[Cecil B. DeMille],[Jeanie Macpherson]
4,I Cover the Waterfront,movie,1933.0,80.0,"[Drama, Romance]",6.3,455,[James Hackett],[James Cruze],"[Max Miller, Wells Root, Jack Jevne]"


In [21]:
base_drop.isnull().sum()

title            0
type             0
start            0
duration         0
genres           0
rating           0
votes            0
cast_name        0
director_name    0
writer_name      0
dtype: int64

## Creating Content-based Recommender System

#### Metadata Classification

In [125]:
# Classify data by title, cast_name, genres, director_name, dan writer_name
feature_df = base_drop[['title', 'cast_name', 'genres', 'director_name', 'writer_name']]

# Show data
feature_df.head()

Unnamed: 0,title,cast_name,genres,director_name,writer_name
0,The Love Expert,[Natalie Talmadge],"[Comedy, Romance]",[David Kirkland],"[John Emerson, Anita Loos]"
1,Yes or No,[Natalie Talmadge],[Unknown],[Roy William Neill],"[Arthur F. Goodrich, Burns Mantle, Mary Murillo]"
2,Our Hospitality,[Natalie Talmadge],"[Comedy, Romance, Thriller]","[Buster Keaton, John G. Blystone]","[Jean C. Havez, Clyde Bruckman, Joseph A. Mitc..."
3,The King of Kings,[Reeka Roberts],"[Biography, Drama, History]",[Cecil B. DeMille],[Jeanie Macpherson]
4,I Cover the Waterfront,[James Hackett],"[Drama, Romance]",[James Cruze],"[Max Miller, Wells Root, Jack Jevne]"


In [126]:
simple_df = base_drop[['title', 'cast_name', 'genres', 'director_name', 'writer_name']]

#### Strip spaces of each rows and elements

In [127]:
def sanitize(x):
    try:
        #kalau cell berisi list
        if isinstance(x, list):
            return [i.replace(' ','').lower() for i in x]
        #kalau cell berisi string
        else:
            return [x.replace(' ','').lower()]
    except:
        print(x)
             
feature_cols = ['cast_name','genres','writer_name','director_name']

# apply function sanitize 
for col in feature_cols:
    feature_df[col] = feature_df[col].apply(sanitize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


#### Merge all feature as one sentence for each title

In [128]:
def soup_feature(x):
    return ' '.join(x['cast_name']) + ' ' + ' '.join(x['genres']) + ' ' + ' '.join(x['director_name']) + ' ' + ' '.join(x['writer_name'])

# making to one column
feature_df['soup'] = feature_df.apply(soup_feature, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


#### Prepare CountVectorizer and fit to the soup above

In [129]:
# define CountVectorizer dan change the previous soup to vector
countvec = CountVectorizer(stop_words='english')
count_matrix = countvec.fit_transform(feature_df['soup'])

print(count)
print(count_matrix.shape)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)
(1060, 10026)


#### Making similarity model

In this step, we want to calculate cosine similarity score of each title pairs (based on every combination), in other words we will make 675x675 matrix, where cell in i and j column show the similarity score between i and j. The diagonal value will be 1, since

The formula of cosine similarity calculation between 2 text:

$cosine(x,y)=\frac{x.y^T}{||x||.||y||}$

The output will be the range of -1 to 1. The closer the score to 1 means the two entities are very similar. The closer the score to -1 means the two entities are very different. 

In [130]:
# Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

# Gunakan cosine_similarity antara count_matrix 
cosine_sim = cosine_similarity(count_matrix, count_matrix)

cosine_sim

array([[1.        , 0.15430335, 0.35355339, ..., 0.        , 0.        ,
        0.13608276],
       [0.15430335, 1.        , 0.10910895, ..., 0.        , 0.        ,
        0.        ],
       [0.35355339, 0.10910895, 1.        , ..., 0.        , 0.08703883,
        0.09622504],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.08703883, ..., 0.        , 1.        ,
        0.10050378],
       [0.13608276, 0.        , 0.09622504, ..., 0.        , 0.10050378,
        1.        ]])

#### Content based recommender system

In [150]:
indices = pd.Series(feature_df.index, index=feature_df['title']).drop_duplicates()

def content_recommender(title):
    #mendapatkan index dari judul film (title) yang disebutkan
    idx = indices[title]

    #menjadikan list dari array similarity cosine sim 
    #hint: cosine_sim[idx]
    sim_scores = list(enumerate(cosine_sim[idx]))

    #mengurutkan film dari similarity tertinggi ke terendah
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    #untuk mendapatkan list judul dari item kedua sampe ke 11
    sim_scores = sim_scores[1:11]

    #mendapatkan index dari judul-judul yang muncul di sim_scores
    movie_indices = [i[0] for i in sim_scores]

    #dengan menggunakan iloc, kita bisa panggil balik berdasarkan index dari movie_indices
    return base_drop.iloc[movie_indices]

Please replace ### with the movie name on the function `content_recommender('###')`. The movie reccomendation based on the movie you input will appear

In [151]:
# Apply the function
content_recommender('Iron Man')

Unnamed: 0,title,type,start,duration,genres,rating,votes,cast_name,director_name,writer_name
726,Black Panther,movie,2018.0,134.0,"[Action, Adventure, Sci-Fi]",7.3,575851,[Robert E. Evans],[Ryan Coogler],"[Stan Lee, Jack Kirby, Ryan Coogler, Joe Rober..."
873,X-Men: Apocalypse,movie,2016.0,144.0,"[Action, Adventure, Sci-Fi]",6.9,377709,[Frank Maudsley],[Bryan Singer],"[Stan Lee, Jack Kirby, Simon Kinberg, Bryan Si..."
511,Star Trek,movie,2009.0,127.0,"[Action, Adventure, Sci-Fi]",7.9,567224,"[Matthew Fuchs, Aida Caefer]",[J.J. Abrams],"[Gene Roddenberry, Roberto Orci, Alex Kurtzman]"
611,X-Men: First Class,movie,2011.0,131.0,"[Action, Adventure, Sci-Fi]",7.7,629609,[Aida Caefer],[Matthew Vaughn],"[Stan Lee, Jack Kirby, Ashley Miller, Zack Ste..."
494,Ant-Man,movie,2015.0,117.0,"[Action, Adventure, Comedy]",7.3,540644,[Francesco Cadoni],[Peyton Reed],"[Stan Lee, Larry Lieber, Jack Kirby, Edgar Wri..."
791,Spider-Man: Homecoming,movie,2017.0,133.0,"[Action, Adventure, Sci-Fi]",7.4,479292,[Frank Maudsley],[Jon Watts],"[Stan Lee, Jack Kirby, Joe Simon, Jonathan Gol..."
466,Alita: Battle Angel,movie,2019.0,122.0,"[Action, Adventure, Sci-Fi]",7.3,202735,[Jeff Bottoms],[Robert Rodriguez],"[James Cameron, Laeta Kalogridis, Yukito Kishiro]"
637,Inception,movie,2010.0,148.0,"[Action, Adventure, Sci-Fi]",8.8,1950039,[Dan Churchill],[Christopher Nolan],[Christopher Nolan]
649,Battleship,movie,2012.0,131.0,"[Action, Adventure, Sci-Fi]",5.8,230468,[Robert E. Evans],[Peter Berg],"[Jon Hoeber, Erich Hoeber]"
877,Thor: Ragnarok,movie,2017.0,130.0,"[Action, Adventure, Comedy]",7.9,544539,[Francesco Cadoni],[Taika Waititi],"[Stan Lee, Larry Lieber, Jack Kirby, Craig Kyl..."
