In [80]:
import numpy as np
import pandas as pd

In [81]:
credits = pd.read_csv("tmdb_5000_credits.csv")
movies = pd.read_csv("tmdb_5000_movies.csv")

In [82]:
movies = movies.merge(credits, on='title')

In [83]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew' ]]

In [84]:
movies.isnull().sum()
movies.dropna(inplace=True)

In [85]:
movies.duplicated().sum()

0

In [86]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [87]:
import ast
def convert(obj):
      l=[]
      for i in ast.literal_eval(obj):
            l.append(i['name'])
      return l

def convert3(obj):
      l=[]
      c=0
      for i in ast.literal_eval(obj):
            if c!=3:
                  l.append(i['name'])
                  c+=1
            else:
                  break
      return l

def fetch_director(obj):
      l=[]
      for i in ast.literal_eval(obj):
            if i['job'] == 'Director':
                  l.append(i['name'])
                  break
      return l

In [88]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [89]:
movies['cast'] = movies['cast'].apply(convert3)

In [90]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [91]:
#initially overview is a string we made it into list of words
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [92]:
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [94]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [95]:
df = movies[['movie_id', 'title', 'tags']]

In [96]:
df['tags'] = df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x:" ".join(x))


In [97]:
df['tags'] = df['tags'].apply(lambda x:x.lower())
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x:x.lower())


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


## VECTORIZATION


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [115]:
vectors = cv.fit_transform(df['tags']).toarray()

In [102]:
%pip install nltk



In [104]:
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [105]:
def stem(text):
      y=[]
      for i in text.split():
            y.append(ps.stem(i))
      return " ".join(y)

In [106]:
df['tags'] = df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [117]:
similarity = cosine_similarity(vectors)

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [120]:
def recommend(movie):
      ind = df[df['title'] == movie].index[0]
      dist = similarity[ind]
      movie_list = sorted(list(enumerate(dist)), reverse=True, key=lambda x:x[1])[1:6]
      for i in movie_list:
            print(df.iloc[i[0]].title)

In [122]:
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf


In [125]:
import pickle
pickle.dump(df, open('movies.pkl', 'wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

