In [2]:
import numpy as np
import pandas as pd
import ast
import nltk

In [3]:
movies=pd.read_csv("tmdb_5000_movies.csv")
credits=pd.read_csv("tmdb_5000_credits.csv")
credits.rename(columns={"movie_id":"id"},inplace=True)

In [4]:
movies=movies.merge(credits,on="id")


In [5]:
# movies.columns
# relevant columns
# id 
# genres
# keywords
# title
# overview
# cast
# crew
movies=movies[["id","genres","keywords","title_x","overview","cast","crew"]]
# movies.info()

In [6]:
# now will preprocess the columns
movies.isnull().sum()

id          0
genres      0
keywords    0
title_x     0
overview    3
cast        0
crew        0
dtype: int64

In [7]:
movies.dropna(inplace=True)

In [8]:
movies.duplicated().sum()

0

In [9]:
# now we will put sanitize genres
def convert_one(genre_str):
    genre_final=[]
    for item in ast.literal_eval(genre_str):
        genre_final.append(item["name"])
    
    return genre_final

# movies["genres"]=movies["genres"].apply(convert_genres)
# movies["keywords"]=movies["keywords"].apply(convert_genres)

In [10]:
movies["genres"]=movies["genres"].apply(convert_one)

In [11]:
movies["keywords"]=movies["keywords"].apply(convert_one)

In [12]:
#now we will extract only first 3 actors of each movie
def convert_two(cast):
    list_final=[]
    for item in ast.literal_eval(cast):
        list_final.append(item["name"])
        if(len(list_final)==3): 
            break
    
    return list_final

# print(convert_two(movies.iloc[0].cast))


In [13]:
movies["cast"]=movies["cast"].apply(convert_two)

In [14]:
def get_director(crew):
    list_final=[]
    for c in ast.literal_eval(crew):
        if(c["job"]=="Director"):
            list_final.append(c["name"])
            return list_final

# print(get_director(movies.iloc[0].crew))


In [15]:
movies["crew"]=movies["crew"].apply(get_director)


In [16]:
movies["overview"]=movies["overview"].apply(lambda x:x.split())

In [17]:
movies.head(2)

Unnamed: 0,id,genres,keywords,title_x,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [18]:
movies["genres"]=movies["genres"].apply(lambda x:[i.replace(" ","") for i in x])


In [19]:
movies["keywords"]=movies["keywords"].apply(lambda x:[i.replace(" ","") for i in x])


In [20]:
movies["cast"]=movies["cast"].apply(lambda x:[i.replace(" ","") for i in x])


In [21]:
def convert_three(crew_name):
    list_final=[]
    list_final.append(crew_name[0].replace(" ",""))
    return list_final




In [22]:
movies.dropna(inplace=True)

In [23]:
movies["crew"]=movies["crew"].apply(convert_three)

In [24]:
movies["tags"]=movies["overview"]+movies["genres"]+movies["keywords"]+movies["cast"]+movies["crew"]

In [27]:
movies.head(1)["tags"]

0    [In, the, 22nd, century,, a, paraplegic, Marin...
Name: tags, dtype: object

In [28]:
new_movies=movies[["id","title_x","tags"]]

In [29]:
new_movies["tags"]=new_movies["tags"].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies["tags"]=new_movies["tags"].apply(lambda x:" ".join(x))


In [30]:
new_movies["tags"]=new_movies["tags"].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies["tags"]=new_movies["tags"].apply(lambda x:x.lower())


In [31]:
new_movies["tags"]

0       in the 22nd century, a paraplegic marine is di...
1       captain barbossa, long believed to be dead, ha...
2       a cryptic message from bond’s past sends him o...
3       following the death of district attorney harve...
4       john carter is a war-weary, former military ca...
                              ...                        
4798    el mariachi just wants to play his guitar and ...
4799    a newlywed couple's honeymoon is upended by th...
4800    "signed, sealed, delivered" introduces a dedic...
4801    when ambitious new york attorney sam is sent t...
4802    ever since the second grade when he first saw ...
Name: tags, Length: 4770, dtype: object

In [32]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [33]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [34]:
new_movies["tags"]=new_movies["tags"].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies["tags"]=new_movies["tags"].apply(stem)


In [35]:
new_movies["tags"]

0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
                              ...                        
4798    el mariachi just want to play hi guitar and ca...
4799    a newlyw couple' honeymoon is upend by the arr...
4800    "signed, sealed, delivered" introduc a dedic q...
4801    when ambiti new york attorney sam is sent to s...
4802    ever sinc the second grade when he first saw h...
Name: tags, Length: 4770, dtype: object

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words="english")

In [37]:
vectors=cv.fit_transform(new_movies["tags"]).toarray()

In [43]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [44]:
vectors.shape

(4770, 5000)

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
result=cosine_similarity(vectors)

In [52]:
print(result.shape)
print(result[0])


(4770, 4770)
[1.         0.08346223 0.0860309  ... 0.04499213 0.         0.        ]


In [53]:
def recommend(movie_title):
    movie_index=new_movies[new_movies["title_x"]== movie_title].index[0]
    distances=result[movie_index]
    movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]

    for mov in movies_list:
        print(new_movies.iloc[mov[0]]["title_x"])



In [55]:
recommend("Batman")

Batman & Robin
Batman Begins
Batman Returns
The R.M.
The Dark Knight Rises
