In [28]:
import numpy as np
import pandas as pd

In [60]:
movie = pd.read_csv("datasets/tmdb_5000_movies.csv")
credit = pd.read_csv("datasets/tmdb_5000_credits.csv")

# Merging Datasets

In [61]:
movies = movie.merge(credit, on="title")

# Feature Selection

In [62]:
movies = movies[["id","title","genres","keywords","overview","cast","crew"]]

In [63]:
movies.shape

(4809, 7)

In [64]:
movies.iloc[4807,0]

126186

In [65]:
movies[movies["title"]=="Signed, Sealed, Delivered"]

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
4806,231617,"Signed, Sealed, Delivered","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...","""Signed, Sealed, Delivered"" introduces a dedic...","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."


In [66]:
movies.dropna(inplace=True)
movies.reset_index(drop=True, inplace=True)

In [67]:
movies[movies["title"]=="Signed, Sealed, Delivered"]

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
4803,231617,"Signed, Sealed, Delivered","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...","""Signed, Sealed, Delivered"" introduces a dedic...","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."


In [73]:
movies.iloc[4804,0]

126186

In [74]:
movies.shape

(4806, 7)

# Preprocessing

In [75]:
import json

def convert(obj):
    L = []
    for i in json.loads(obj):
        L.append(i["name"])
    return L

In [76]:
movies["genres"] = movies["genres"].apply(convert)

In [77]:
movies["keywords"] = movies["keywords"].apply(convert)

In [78]:
def convert2(obj):
    L = []
    count = 0
    for i in json.loads(obj):
        if count!=3:
            L.append(i["name"])
            count+=1
        else:
            break
    return L

In [79]:
movies["cast"] = movies["cast"].apply(convert2)

In [80]:
def fetch_director(obj):
    L = []
    for i in json.loads(obj):
        if i['job']=="Director":
            L.append(i["name"])
            break
    return L

In [81]:
movies["crew"] = movies["crew"].apply(fetch_director)

In [82]:
movies.head(3)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


In [83]:
clean_movies_dataset = movies.copy()

In [84]:
clean_movies_dataset[clean_movies_dataset["title"]=="My Date with Drew"]

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
4805,25975,My Date with Drew,[Documentary],"[obsession, camcorder, crush, dream girl]",Ever since the second grade when he first saw ...,"[Drew Barrymore, Brian Herzlinger, Corey Feldman]",[Brian Herzlinger]


In [85]:
clean_movies_dataset.head(3)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


In [113]:
clean_movies_dataset[clean_movies_dataset["title"]=="Everything You Always Wanted to Know About Sex *But Were Afraid to Ask"]

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
4081,11624,Everything You Always Wanted to Know About Sex...,[Comedy],"[transsexuality, perversity, sperm, orgasm, so...","A collection of seven vignettes, which each ad...","[Woody Allen, John Carradine, Lou Jacobi]",[Woody Allen]


In [87]:
clean_movies_dataset.shape , movies.shape

((4806, 7), (4806, 7))

In [88]:
clean_movies_dataset.iloc[-1, 3]

['obsession', 'camcorder', 'crush', 'dream girl']

In [89]:
movies.iloc[-1, 3]

['obsession', 'camcorder', 'crush', 'dream girl']

In [90]:
# this step purpose is to convert text into list, so that we can add all lists
movies["overview"] = movies["overview"].apply(lambda x:x.split())

In [91]:
movies["genres"] = movies["genres"].apply(lambda y:[i.replace(" ","") for i in y])
movies["keywords"] = movies["keywords"].apply(lambda y:[i.replace(" ","") for i in y])
movies["cast"] = movies["cast"].apply(lambda y:[i.replace(" ","") for i in y])
movies["crew"] = movies["crew"].apply(lambda y:[i.replace(" ","") for i in y])

In [92]:
movies["tags"] = movies["overview"]+movies["genres"]+movies["keywords"]+movies["cast"]+movies["crew"]

In [93]:
movies.head(3)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."


In [94]:
n_df = movies[["id","title","tags"]]

In [95]:
n_df["tags"] = n_df["tags"].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_df["tags"] = n_df["tags"].apply(lambda x:" ".join(x))


In [96]:
n_df["tags"] = n_df["tags"].apply(lambda y:y.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_df["tags"] = n_df["tags"].apply(lambda y:y.lower())


# Apply Stemming

In [97]:
import nltk
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [98]:
def stemming_text(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

In [99]:
n_df["tags"] = n_df["tags"].apply(stemming_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_df["tags"] = n_df["tags"].apply(stemming_text)


# Create vectors database

In [100]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words="english")

In [101]:
vectors = cv.fit_transform(n_df["tags"]).toarray()

In [102]:
vectors.shape

(4806, 5000)

# Cosine Similarity

In [103]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

In [104]:
clean_movies_dataset.head(3)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


# Pickle

In [105]:
import pickle as pk

In [106]:
pk.dump(clean_movies_dataset, open("imported from jupyter notebook/movies/detail_of_movie.pkl","wb"))

In [107]:
pk.dump(n_df, open("imported from jupyter notebook/movies/dataset.pkl","wb"))

In [108]:
pk.dump(similarity, open("imported from jupyter notebook/movies/movies_recommender.pkl","wb"))

In [92]:
list = []
list2 = [2,3,4]

def r(list,list2):
    return list,list2
a,b = r(list,list2)

In [95]:
len(a)

0

In [94]:
b

[2, 3, 4]