## Movie Recommender System

In [32]:
import numpy as np 
import pandas as pd

### Importing Datasets


In [33]:
Imdb_movies = pd.read_csv('imdb_top_1000.csv')

In [34]:
Imdb_movies.head(1)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469


In [35]:
Imdb_movies = Imdb_movies[['Poster_Link','Series_Title','Genre','Overview','Director','Star1','Star2','Star3','Star4']]

In [36]:
Imdb_movies.isnull().sum()

Poster_Link     0
Series_Title    0
Genre           0
Overview        0
Director        0
Star1           0
Star2           0
Star3           0
Star4           0
dtype: int64

In [37]:
Imdb_movies.duplicated().sum()

0

In [38]:
Imdb_movies['Overview'] = Imdb_movies["Overview"].apply(lambda x:x.split())

In [39]:
Imdb_movies['Overview'][0]

['Two',
 'imprisoned',
 'men',
 'bond',
 'over',
 'a',
 'number',
 'of',
 'years,',
 'finding',
 'solace',
 'and',
 'eventual',
 'redemption',
 'through',
 'acts',
 'of',
 'common',
 'decency.']

In [42]:
Imdb_movies.head(1)

Unnamed: 0,Poster_Link,Series_Title,Genre,Overview,Director,Star1,Star2,Star3,Star4
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,[Drama],"[Two, imprisoned, men, bond, over, a, number, ...",[Frank Darabont],[Tim Robbins],[Morgan Freeman],[Bob Gunton],[William Sadler]


In [41]:
Imdb_movies['Genre'] = Imdb_movies['Genre'].apply(lambda x: [x])
Imdb_movies['Star1'] = Imdb_movies['Star1'].apply(lambda x: [x])
Imdb_movies['Star2'] = Imdb_movies['Star2'].apply(lambda x: [x])
Imdb_movies['Star3'] = Imdb_movies['Star3'].apply(lambda x: [x])
Imdb_movies['Star4'] = Imdb_movies['Star4'].apply(lambda x: [x])
Imdb_movies['Director'] = Imdb_movies['Director'].apply(lambda x: [x])

In [43]:
Imdb_movies['tags'] = Imdb_movies['Overview'] + Imdb_movies['Genre'] + Imdb_movies['Star1'] + Imdb_movies['Star2'] + Imdb_movies['Star3'] + Imdb_movies['Star4']

In [44]:
Imdb_movies.head(1)

Unnamed: 0,Poster_Link,Series_Title,Genre,Overview,Director,Star1,Star2,Star3,Star4,tags
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,[Drama],"[Two, imprisoned, men, bond, over, a, number, ...",[Frank Darabont],[Tim Robbins],[Morgan Freeman],[Bob Gunton],[William Sadler],"[Two, imprisoned, men, bond, over, a, number, ..."


In [45]:
Final_Imdb_Movies = Imdb_movies[['Poster_Link','Series_Title','tags']]

In [46]:
Final_Imdb_Movies.head(1)

Unnamed: 0,Poster_Link,Series_Title,tags
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,"[Two, imprisoned, men, bond, over, a, number, ..."


In [47]:
Final_Imdb_Movies['tags'] = Final_Imdb_Movies['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_Imdb_Movies['tags'] = Final_Imdb_Movies['tags'].apply(lambda x:" ".join(x))


In [48]:
Final_Imdb_Movies['tags'][0]

'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. Drama Tim Robbins Morgan Freeman Bob Gunton William Sadler'

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words='english')

In [50]:
!pip install nltk




[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [51]:
import nltk

In [55]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [56]:
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)



In [57]:
Final_Imdb_Movies['tags'] = Final_Imdb_Movies['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_Imdb_Movies['tags'] = Final_Imdb_Movies['tags'].apply(stem)


In [58]:
vectors = cv.fit_transform(Final_Imdb_Movies['tags']).toarray()

In [59]:
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
similarity_matrix = cosine_similarity(vectors)

In [61]:
import pickle

In [62]:
pickle.dump(Final_Imdb_Movies,open('movies.pkl','wb'))

In [63]:
pickle.dump(similarity_matrix,open('similarity_matrix.pkl','wb'))