In [97]:
import pandas as pd

df = pd.read_parquet('../data/movie_data.parquet')

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99806 entries, 0 to 99805
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   userId             99806 non-null  int64  
 1   rating             99806 non-null  float64
 2   imdb_id            99806 non-null  object 
 3   adult              99806 non-null  object 
 4   genres             99806 non-null  object 
 5   original_language  99806 non-null  object 
 6   overview           99792 non-null  object 
 7   popularity         99806 non-null  float64
 8   poster_path        99800 non-null  object 
 9   release_date       99800 non-null  object 
 10  runtime            99806 non-null  float64
 11  title              99806 non-null  object 
 12  vote_average       99806 non-null  float64
 13  vote_count         99806 non-null  float64
dtypes: float64(5), int64(1), object(8)
memory usage: 10.7+ MB


In [99]:
df.genres

0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
2        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
3        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
4        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
                               ...                        
99801    [{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...
99802                        [{'id': 18, 'name': 'Drama'}]
99803    [{'id': 53, 'name': 'Thriller'}, {'id': 10749,...
99804    [{'id': 12, 'name': 'Adventure'}, {'id': 18, '...
99805    [{'id': 99, 'name': 'Documentary'}, {'id': 104...
Name: genres, Length: 99806, dtype: object

In [100]:
df1 = df[['title','overview','genres','poster_path','vote_average','vote_count']].drop_duplicates().reset_index(drop=True)


In [101]:
df1[df1.overview.isna() == True]

Unnamed: 0,title,overview,genres,poster_path,vote_average,vote_count
272,Roommates,,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",/hvHNlMvWS2GBt7RR971bJ3k4bJc.jpg,6.4,7.0
563,The Superwife,,"[{'id': 35, 'name': 'Comedy'}]",/AbhMKCh3fV5PY2B9uSPF1DWEvq2.jpg,5.3,7.0
684,The Day the Sun Turned Cold,,[],/loQea7CiBv6VevsU70vGpqj6AQM.jpg,7.0,2.0
1164,Guantanamera,,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",/4axpsF1n3ABUjuibI3PndaFHgCv.jpg,8.0,3.0
1821,One Tough Cop,,[],/ihg8EBrYWdFYTRdLCnqqtN8dRLC.jpg,3.0,3.0
7256,The Three Musketeers,,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",/hvttwy7RWLtWTnt9IC5LQjRYerC.jpg,3.0,2.0
7911,Descongelate!,,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",/mdzlBQYuaXHYhUaUHswvcWmb9xw.jpg,0.0,0.0
8690,Off Beat,,"[{'id': 18, 'name': 'Drama'}]",/4D85xDXqwSq6mTDdX1ZowJ2jMT8.jpg,6.5,16.0
8854,El vals de los inútiles,,"[{'id': 99, 'name': 'Documentary'}]",/c0vQXiTwiHrDamg1gfRlF3OB9Op.jpg,5.7,3.0
8935,Bana Masal Anlatma,,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",/u276zhNPhGQSjhXbPa6B0QJL5JP.jpg,5.7,9.0


In [102]:
df1['overview'] = df1['overview'].fillna('')
df1['poster_path'] = df1['poster_path'].fillna('')

In [103]:
df1['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [104]:
import ast

genres = ast.literal_eval(df1['genres'][0])
genres_name = [genre['name'] for genre in genres]
print(genres_name)

['Animation', 'Comedy', 'Family']


In [105]:
def extract_genre_names(genre_str):
    try:
        genres = ast.literal_eval(genre_str)
        return [g['name'] for g in genres]   
    except (ValueError, SyntaxError):
        return []  
df1['genres'] = df1['genres'].apply(extract_genre_names)
df1[['title', 'genres']].head()

Unnamed: 0,title,genres
0,Toy Story,"[Animation, Comedy, Family]"
1,Jumanji,"[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Romance, Comedy]"
3,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,Father of the Bride Part II,[Comedy]


In [106]:
df1['genres_str'] = df1['genres'].apply(lambda x: ' '.join(x))
df1['genres_str'] = df1['genres_str'].fillna('').apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df1['text'] = df1['genres_str'] + ' ' + df1['overview']
df1['text'][0]

"Animation Comedy Family Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df1['text'])

In [108]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

indices = pd.Series(df1.index, index=df1['title']).drop_duplicates()

In [109]:
cosine_sim

array([[1.        , 0.02396644, 0.00755   , ..., 0.        , 0.        ,
        0.01481331],
       [0.02396644, 1.        , 0.07057541, ..., 0.        , 0.01520441,
        0.01570377],
       [0.00755   , 0.07057541, 1.        , ..., 0.00813259, 0.0132603 ,
        0.        ],
       ...,
       [0.        , 0.        , 0.00813259, ..., 1.        , 0.0170893 ,
        0.        ],
       [0.        , 0.01520441, 0.0132603 , ..., 0.0170893 , 1.        ,
        0.        ],
       [0.01481331, 0.01570377, 0.        , ..., 0.        , 0.        ,
        1.        ]], shape=(9010, 9010))

In [110]:
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
Sharknado 4: The 4th Awakens                          9005
The Last Brickmaker in America                        9006
Rustom                                                9007
Mohenjo Daro                                          9008
The Beatles: Eight Days a Week - The Touring Years    9009
Length: 9010, dtype: int64

In [131]:
def get_recommendations(title, n=10):
    idx = indices[title]

    # sim_scores are square matrix of the similarity between movie x and y: consine_sim[x][y]
    # Turns consine_sim into [(row_num, similarity_score), (),()]
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Extract similarity_score and sort
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # skip the movie itself
    sim_scores = sim_scores[1:len(df1)+1]  
    
    # Get index
    movie_indices = [i[0] for i in sim_scores]

    # Remove if the movie have vote avg lower than 5 or count is lower than 100
    rec = df1[['title','genres','vote_average','vote_count','poster_path']].iloc[movie_indices]
    rec = rec[(rec['vote_average'] > 5) & (rec['vote_count'] > 100)].head(n)

    return rec

In [133]:
get_recommendations('1984', n=5)

Unnamed: 0,title,genres,vote_average,vote_count,poster_path
1672,Nineteen Eighty-Four,"[Drama, Romance, Science Fiction]",6.8,311.0,/asqIqgy3lywRhrVv6WCdcofNWH1.jpg
5606,House of Flying Daggers,"[Adventure, Drama, Action, Romance]",7.1,452.0,/39XJExIXKyvSYQYvy1b5EsutjB8.jpg
1310,Gattaca,"[Thriller, Science Fiction, Mystery, Romance]",7.5,1846.0,/gPYtuvhQvP7v5F1Tqt2iQWR3Y7i.jpg
4719,THX 1138,"[Drama, Mystery, Science Fiction]",6.4,265.0,/8cie5mojY6MlIrYMs9EtNSyterv.jpg
3299,In the Mood for Love,"[Drama, Romance]",7.8,379.0,/unOW3SxFxBdd7LMWjmRONdHWKPb.jpg
