In [None]:
# Fetch the dataset from Google Drive by file ID; the log below shows it is
# saved as data_merged.csv in the working directory.
# NOTE(review): newer gdown releases reportedly deprecate the `--id` flag (the
# file ID can be passed positionally) — confirm against the installed version.
!gdown --id 1zu97TwzyU2T8OVvQ2VMof-XqHDX_fxPf

Downloading...
From: https://drive.google.com/uc?id=1zu97TwzyU2T8OVvQ2VMof-XqHDX_fxPf
To: /content/data_merged.csv
100% 227M/227M [00:00<00:00, 240MB/s]


In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize

In [None]:
# Load the merged movie dataset (the CSV fetched by the gdown cell above).
df = pd.read_csv('data_merged.csv')

In [None]:
# Columns holding stringified lists of dicts that are parsed and cleaned later.
features = ['cast', 'keywords', 'crew', 'genres']
# Everything the notebook needs: the structured features plus plot text and title.
required_cols = features + ['overview'] + ['title']

# Cap on the number of rows kept; the similarity matrices built later are
# (rows x rows), so this bounds their size.
MAX_MOVIES = 10000

# Take an explicit copy: the column subset / head() result is otherwise a
# derived slice of the loaded frame, and the later `df[col] = ...` assignments
# would trigger pandas' SettingWithCopyWarning.
df = df[required_cols].head(MAX_MOVIES).copy()

In [None]:
# Sanity check: show which columns were kept and how many rows remain.
print("columns:", df.columns)
print("size:", len(df))

columns: Index(['cast', 'keywords', 'crew', 'genres', 'overview', 'title'], dtype='object')
size: 10000


In [None]:
# Replace missing overviews with empty strings so the vectorizer only sees text.
df['overview'] = df['overview'].fillna('')

# TF-IDF representation of the plot overviews, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])
# Pairwise dot products between every pair of overview vectors. Per the
# scikit-learn docs TfidfVectorizer L2-normalises rows by default, so these dot
# products should equal cosine similarities.
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)
# Disabled experiment (kept commented out): column-wise normalisation of the scores.
# cosine_sim1 = normalize(cosine_sim1, axis=0)

In [None]:
# HELPER FUNCTIONS
def get_director(x):
    """Return the ``'name'`` of the first entry of ``x`` whose ``'job'`` is ``'Director'``.

    Args:
        x: Iterable of entries that are indexed with the ``'job'`` and
            ``'name'`` keys.

    Returns:
        The matching entry's ``'name'`` value, or ``np.nan`` when no entry has
        the ``'Director'`` job.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    # next() yields the first match, falling back to NaN when there is none.
    return next(director_names, np.nan)

def get_list(x):
    """Collect the ``'name'`` values of ``x``, keeping at most the first three.

    Args:
        x: Expected to be a list of entries indexable by ``'name'``; any
            non-list value is treated as "no data".

    Returns:
        A list with up to three names in their original order, or an empty
        list when ``x`` is not a list.
    """
    if not isinstance(x, list):
        return []

    # Build every name first, then truncate; slicing is a no-op for short lists.
    return [entry['name'] for entry in x][:3]

def clean_data(x):
    """Strip spaces from and lower-case a string, or every item of a list.

    Args:
        x: A string, a list of strings, or anything else.

    Returns:
        A list of cleaned strings when ``x`` is a list, a cleaned string when
        ``x`` is a string, and ``''`` for any other value.
    """
    def _squash(text):
        # Remove spaces first, then lower-case the result.
        return str.lower(text.replace(" ", ""))

    if isinstance(x, list):
        return [_squash(item) for item in x]
    if isinstance(x, str):
        return _squash(x)
    return ''

In [None]:
# Turn each raw feature column into cleaned tokens, one column at a time.
for feature in features:
    # The cells hold Python-literal text; evaluate it into real objects.
    parsed = df[feature].apply(literal_eval)

    # 'crew' is reduced to the director's name; every other feature keeps
    # up to three names (see get_list).
    extractor = get_director if feature == 'crew' else get_list
    extracted = parsed.apply(extractor)

    # Lower-case and remove spaces so multi-word names become single tokens.
    df[feature] = extracted.apply(clean_data)

def create_metadata(x, fields=None):
    """Build the whitespace-separated metadata string for one row.

    Args:
        x: A row-like mapping (e.g. a DataFrame row) indexable by field name.
        fields: Names of the fields to include, in order. Defaults to the
            notebook-level ``features`` list followed by ``'title'``.

    Returns:
        A single string with the tokens of every field joined by spaces.
        List-valued fields contribute one token per item; string-valued fields
        (the director, the title) are added whole. Empty or non-text values
        (``''``, ``[]``, ``NaN``, ``None``) are skipped.
    """
    if fields is None:
        fields = features + ['title']

    parts = []
    for field in fields:
        value = x[field]
        if isinstance(value, str):
            # A plain string must be appended as-is: ' '.join on a string
            # would split it into single characters, which the vectorizer
            # then discards.
            if value:
                parts.append(value)
        elif isinstance(value, (list, tuple)):
            if value:
                parts.append(' '.join(str(item) for item in value))
        # Anything else (e.g. NaN for a missing title) carries no tokens.

    return ' '.join(parts)

In [None]:
# One whitespace-separated metadata string per row.
df['meta'] = df.apply(create_metadata, axis=1)

# Bag-of-words counts over the metadata strings, ignoring English stop words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['meta'])

# CountVectorizer rows are raw counts, not unit vectors, so a plain dot product
# is not a cosine similarity: it grows with the amount of metadata and is on a
# different scale from cosine_sim1. L2-normalise each row first so that the dot
# product below is a true cosine similarity. Rows with no tokens stay all-zero.
count_matrix_unit = normalize(count_matrix, norm='l2', axis=1)
cosine_sim2 = linear_kernel(count_matrix_unit, count_matrix_unit)

In [None]:
# Reverse lookup: movie title -> row position in df and in the similarity matrices.
indices = pd.Series(np.arange(len(df)), index=df['title'])
# Series.drop_duplicates() compares the *values* (row positions, always unique)
# and therefore never removes repeated titles. De-duplicate on the index
# instead, keeping the first occurrence, so indices[title] is a single position.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, plot=1, meta=1, top_k=10):
    """Recommend the movies most similar to ``title``.

    The score of every movie is ``plot * cosine_sim1 + meta * cosine_sim2``
    taken from the row of the queried movie.

    Args:
        title: Title to look up in the notebook-level ``indices`` Series.
        plot: Weight applied to the plot-overview similarity (``cosine_sim1``).
        meta: Weight applied to the metadata similarity (``cosine_sim2``).
        top_k: Maximum number of recommendations to return.

    Returns:
        A Series of up to ``top_k`` titles from ``df``, best match first, never
        including the queried movie itself. If the title is unknown, prints
        ``"Movie not found."`` and returns an empty list.
    """
    try:
        idx = indices[title]
    except (KeyError, TypeError):
        print("Movie not found.")
        return []

    # A title that occurs more than once yields several positions; use the first.
    if isinstance(idx, pd.Series):
        if idx.empty:
            print("Movie not found.")
            return []
        idx = idx.iloc[0]
    idx = int(idx)

    # Weighted combination of the two similarity rows, computed in one shot.
    scores = plot * np.asarray(cosine_sim1[idx]) + meta * np.asarray(cosine_sim2[idx])

    # Stable descending order: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query by position rather than assuming it ranks first.
    ranked = ranked[ranked != idx][:max(top_k, 0)]

    return df['title'].iloc[ranked]

In [None]:
# Demo: plot and metadata similarity weighted equally.
get_recommendations('Toy Story', plot=1, meta=1)

0                       Toy Story
3024                  Toy Story 2
3336            Creature Comforts
4797               Monsters, Inc.
608                The Aristocats
4272    Atlantis: The Lost Empire
1131           The Wrong Trousers
7805                     Garfield
734                 A Close Shave
9859                       Robots
1437              Jungle 2 Jungle
Name: title, dtype: object

In [None]:
# Demo: same equal weighting for a second title.
get_recommendations('Garfield', plot=1, meta=1)

7805              Garfield
3336     Creature Comforts
6713             Beethoven
8891         Peter-No-Tail
1450      Cats Don't Dance
734          A Close Shave
608         The Aristocats
3024           Toy Story 2
6269        Daddy Day Care
0                Toy Story
8944    Asterix vs. Caesar
Name: title, dtype: object