In [63]:
import numpy as np
import pandas as pd
import ast
import sklearn
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
'''import datasets'''
movies = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')
movies = movies.merge(credit, on='title')
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]
movies.dropna(inplace=True)
movies.head(1)
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4806 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   title     4806 non-null   object
 2   overview  4806 non-null   object
 3   genres    4806 non-null   object
 4   keywords  4806 non-null   object
 5   cast      4806 non-null   object
 6   crew      4806 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.4+ KB


In [4]:
'''preprossing of data '''
'''1) in genres convert
'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}
, {"id": 878, "name": "Science Fiction"}]' this into [Action,.....] same in keywords
'''
def convert(obj) :
    l = []
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [5]:
'''preprossing data '''
'''3) in cast we only want top 3 actors '''
def convert3(obj) :
    l = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter !=3 :
            l.append(i['name'])
            counter += 1
        else :
            break
    return l
movies['cast']=movies['cast'].apply(convert3)

In [6]:
'''preprossing data '''
'''4) in crew we only want only director '''

def convert4(obj) :
    l = []
    for i in ast.literal_eval(obj):
        if i['job'] =='Director' :
            l.append(i['name'])
            break
    return l
movies['crew'] = movies['crew'].apply(convert4)
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [7]:
'''preprossing data '''
'''transformation of data like Science Fiction into ScienceFiction with help of this this will convert into
single entity and for user it will increse accuracy'''

movies['genres'] = movies['genres'].apply(lambda x : [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x : [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x : [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x : [i.replace(" ","") for i in x])

In [8]:
'''preprossing data '''
'''merging cells  '''
movies['tags'] = movies['overview'] + movies['cast'] + movies['crew'] + movies['keywords'] + movies['genres']

In [9]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [10]:
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."


In [11]:
'''convert list into str'''
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."


In [12]:
new['tags'] = new['tags'].apply(lambda x :x.lower())
new.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


In [13]:
new['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. samworthington zoesaldana sigourneyweaver jamescameron cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d action adventure fantasy sciencefiction'

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [35]:
vector = cv.fit_transform(new['tags']).toarray()

In [36]:
vector.shape

(4806, 5000)

In [30]:
ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split():
       y.append(ps.stem(i))
    return " ".join(y)

In [32]:
new['tags'] = new['tags'].apply(stem)

In [33]:
new.iloc[0][2]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. samworthington zoesaldana sigourneyweav jamescameron cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d action adventur fantasi sciencefict'

In [46]:

'''calculating distence of vector from its cosine angle'''
similarity = cosine_similarity(vector)


In [49]:
similarity[2]

array([0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
       0.        ])

In [57]:
def recommend(movie) :
    movie_index = new[new['title'] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)),reverse=True,key = lambda x: x[1])[1:6]
    for i in movie_list :
         print(new.iloc[i[0]].title)


In [65]:
recommend('Batman Begins')
pickle.dump(new,open('movies.pkl','wb'))

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf


In [66]:
pickle.dump(similarity,open('similarity.pkl','wb'))

In [67]:
pickle.dump(new.to_dict(),open('moviedic.pkl','wb'))
