<a href="https://colab.research.google.com/github/josephpark0828/movie-recommender/blob/main/movie_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the movie metadata table. pandas infers the zip compression from the
# file extension; low_memory=False has the parser process the file in a single
# pass rather than in chunks, which avoids mixed-dtype guesses per column.
metadata = pd.read_csv('movies_metadata.csv.zip', low_memory=False)

# Preview the free-text plot summaries that the recommender is built on.
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean first: a missing overview becomes an empty string so that every row
# handed to the vectorizer is a real str document.
metadata['overview'] = metadata['overview'].fillna('')

# Then vectorise: learn the vocabulary (minus English stop words) and build
# the TF-IDF document-term matrix, one row per movie overview.
vectorizer_obj = TfidfVectorizer(stop_words='english')
vectorizer_matrix = vectorizer_obj.fit_transform(metadata['overview'])

In [None]:
# Dimensions of the TF-IDF matrix: (number of overviews, vocabulary size).
vectorizer_matrix.shape

(45466, 75827)

In [None]:
# Peek at ten vocabulary terms learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the displayed result a plain list of Python strings.
vectorizer_obj.get_feature_names_out()[5000:5010].tolist()

['avails',
 'avaks',
 'avalanche',
 'avalanches',
 'avallone',
 'avalon',
 'avant',
 'avanthika',
 'avanti',
 'avaracious']

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity between every pair of overviews. TfidfVectorizer
# L2-normalises each row by default, so the plain dot product computed by
# linear_kernel is the cosine similarity.
# NOTE: the result is a dense (n_movies, n_movies) array. With the 45,466 rows
# reported for the TF-IDF matrix that is roughly 16.5 GB of float64, so this
# cell needs a high-memory runtime (or a subsample of `metadata`).
cosine_sim = linear_kernel(vectorizer_matrix, vectorizer_matrix)

# Check the invariant explicitly: a bare `cosine_sim.shape` in the middle of a
# cell is evaluated but never displayed, so it verified nothing.
assert cosine_sim.shape == (vectorizer_matrix.shape[0],) * 2

# Similarity of the first movie against every movie (entry 0 is itself).
cosine_sim[0]

array([1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
       0.        ])

In [None]:
# Reverse lookup: movie title -> row label in `metadata` (used to pick the
# matching row of the similarity matrix).
indices = pd.Series(metadata.index, index=metadata['title'])

# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles and a lookup such as
# indices['Sabrina'] could return several rows. De-duplicate on the index
# instead, keeping the first movie listed under each title.
indices = indices[~indices.index.duplicated(keep='first')]
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
  """Return the titles of the movies most similar to `title`.

  Args:
    title: Movie title to look up in the module-level `indices` Series.
    cosine_sim: Square matrix of pairwise similarity scores, where row i holds
      the scores of movie i against every movie. Defaults to the matrix built
      earlier in the notebook.
    top_n: Number of recommendations to return (default 10).

  Returns:
    A pandas Series of titles taken from `metadata['title']`, ordered from
    most to least similar, that never contains the queried movie itself.

  Raises:
    KeyError: If `title` is not present in `indices`.
  """
  index = indices[title]

  # When several movies share a title the lookup yields a Series of rows
  # rather than a single row number; fall back to the first match.
  if isinstance(index, pd.Series):
    index = index.iloc[0]

  # Exclude the queried movie by position instead of assuming it sorts first:
  # a movie with an empty overview has self-similarity 0, and movies with
  # identical overviews tie at the top, so slicing off element 0 could drop a
  # genuine recommendation and leave the movie itself in the result.
  sim_scores = [
      (i, score) for i, score in enumerate(cosine_sim[index]) if i != index
  ]

  # sorted() is stable, so equally scored movies keep their original order.
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  movie_indices = [i for i, _ in sim_scores[:top_n]]

  return metadata['title'].iloc[movie_indices]

In [None]:
# Example query: the movies whose overviews are closest to Toy Story's.
get_recommendations('Toy Story')

15348                                     Toy Story 3
2997                                      Toy Story 2
10301                          The 40 Year Old Virgin
24523                                       Small Fry
23843                     Andy Hardy's Blonde Trouble
29202                                      Hot Splash
43427                Andy Kaufman Plays Carnegie Hall
38476    Superstar: The Life and Times of Andy Warhol
42721    Andy Peters: Exclamation Mark Question Point
8327                                        The Champ
Name: title, dtype: object