In [1]:
import numpy as np
import pandas as pd
import ast
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk import sent_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/huzefa_m/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits, on='title')

In [4]:
movies = movies[movies['original_language']=='en']

In [5]:
movies = movies[['movie_id', 'genres', 'keywords', 'title', 'overview', 'cast', 'crew']]

In [6]:
movies.isnull().sum()

movie_id    0
genres      0
keywords    0
title       0
overview    1
cast        0
crew        0
dtype: int64

In [7]:
movies.dropna(inplace=True)

In [8]:
movies.duplicated().sum()

np.int64(0)

In [9]:
movies.reset_index(inplace=True)

In [10]:
def convert(obj):
  l=[]
  for i in ast.literal_eval(obj):
    l.append(i['name'])
  return l

In [11]:
movies['genres'] = movies['genres'].apply(convert)

In [12]:
movies['keywords'] = movies['keywords'].apply(convert)

In [13]:
def convert3(obj):
  l=[]
  for x,i in enumerate(ast.literal_eval(obj)):
    l.append(i['name'])
    if x==2: break
  return l

In [14]:
movies['cast']=movies['cast'].apply(convert3)

In [15]:
def fetch_director(obj):
  l=[]
  for i in ast.literal_eval(obj):
    if i['department']=="Directing":
      l.append(i['name'])
      break
  return l

In [16]:
movies['crew']=movies['crew'].apply(fetch_director)

In [17]:
movies['overview']=movies['overview'].apply(lambda x: x.split())  # Converting from string to list

In [18]:
# Combining names and surnames

movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [19]:
movies['tag'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] +movies['crew']

In [20]:
new_df = movies[[ 'title', 'tag']]

In [21]:
new_df['tag'] = new_df['tag'].apply(lambda x: " ".join(x))   # converting from list to strings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tag'] = new_df['tag'].apply(lambda x: " ".join(x))   # converting from list to strings


In [22]:
new_df['tag']=new_df['tag'].apply(lambda x : x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tag']=new_df['tag'].apply(lambda x : x.lower())


In [23]:
# To convert the words like ['loved', 'loving', 'love'] to ['love','love','love']

ps = PorterStemmer()

def stem (text):
  y=[]
  for i in text.split():

    y.append(ps.stem(i))
  return " ".join(y)

In [24]:
new_df['tag'] = new_df['tag'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tag'] = new_df['tag'].apply(stem)


## Plan of Attack

1. Remove stop words
2. Vectorize /tokenize (word2vec, word embeddings)
3. make every row similar in dimension (and store in pkl file)
4. x= tag and y= movie name 
5. find the similarity by Cosine similarity, now try Nearest neighbor, Manhattan, taccard, spotify wala etc
6. Find the similarity by b/w the test and rest of the movies 
7. suggest the nearest 5 movies
8. make the frontend


## Vectorization

### From CountVectorization or Bag of words Representation

In [25]:
cv = CountVectorizer(max_features=5000,stop_words='english')

In [26]:
cv1 = CountVectorizer(stop_words='english')

In [27]:
cv_vec = cv.fit_transform(new_df['tag']).toarray()

In [28]:
cv_vec1 = cv1.fit_transform(new_df['tag']).toarray()

In [29]:
len(cv.get_feature_names_out())

5000

In [30]:
len(cv1.get_feature_names_out())

31078

In [31]:
cv_vec1.shape

(4509, 31078)

### TFIDF Vectorizer

In [32]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

In [33]:
tfidf1 = TfidfVectorizer(stop_words='english')

In [34]:
tfidf_vec = tfidf.fit_transform(new_df['tag']).toarray()

In [35]:
tfidf_vec1 = tfidf1.fit_transform(new_df['tag']).toarray()

In [36]:
stop_words = tfidf1.get_stop_words()

### Word2Vec

In [37]:
# remove stopwords
def remove(obj):
  for i in stop_words:
    obj = obj.replace(f" {i}","") or obj.replace(f"{i} ","")
  return obj
lo = new_df['tag'].apply(remove)

In [38]:
#training word2vec
story = []
for row in lo:
  raw_sent = sent_tokenize(row)
  for sent in raw_sent:
    story.append(simple_preprocess(sent))

model = gensim.models.Word2Vec(window=3, min_count=1, sg=0, vector_size=50)
model.build_vocab(story)
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(700937, 724935)

In [39]:
# Preprocessing and mean of words for calculating cosine similarity

def preprocess(text):
  result = []
  raw_sent = sent_tokenize(text)
  for sent in raw_sent:
    result.append(simple_preprocess(sent))
  return result
    
def means1(text):
  result = []
  raw_sentence = sent_tokenize(str(text))
  for sent in raw_sentence:
    tokens = preprocess(sent)
    tokens = [w for w in tokens[0] if w in model.wv]
    if len(tokens)==0: continue
    result.append(np.mean(model.wv[tokens], axis =0))
  return np.mean(result, axis =0)

In [40]:
word2vec_vec = lo.apply(means1)
word2vec_vec = np.vstack(word2vec_vec.values)

### Cosine Similarity

In [41]:
cs_cv = cosine_similarity(cv_vec)

In [None]:
cs_cv1 = cosine_similarity(cv_vec1)

In [None]:
cs_tfidf = cosine_similarity(tfidf_vec)

In [None]:
cs_tfidf1 = cosine_similarity(tfidf_vec1)

In [None]:
cs_word2vec = cosine_similarity(word2vec_vec)

In [None]:
cs = cosine_similarity(vectors)

In [None]:
def reccomend(movie, simi):
  index = new_df[new_df['title']==movie].index[0]
  reccomend_movies = sorted(list(enumerate(simi[index])), reverse= True, key = lambda x: x[1])[1:6]
  for i, recc in reccomend_movies:
    print(f"Movies: {new_df['title'].iloc[i]}\t Percentages of Matching: {(recc*100):.2f}%")

In [None]:
reccomend('Spectre', cs_cv)

In [None]:
reccomend('Spectre', cs_cv1)

In [None]:
reccomend('Spectre', cs_tfidf)

In [None]:
reccomend('Spectre', cs_tfidf1)

In [None]:
reccomend('Spectre', cs_word2vec)

In [None]:
import pickle

pickle.dump(cs_cv, open('cos_similarity_countvec.pkl', 'wb'))
pickle.dump(cs_cv1, open('cos_similarity_countvec1.pkl', 'wb'))
pickle.dump(cs_tfidf, open('cos_similarity_tfidf.pkl', 'wb'))
pickle.dump(cs_tfidf1, open('cos_similarity_tfidf1.pkl', 'wb'))
pickle.dump(cs_word2vec, open('cos_similarity_word2vec.pkl', 'wb'))

In [None]:
new_df.to_csv('new_df.csv', index = False) # To save the file

In [31]:
new_df['title']

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4804                                 El Mariachi
4805                                   Newlyweds
4806                   Signed, Sealed, Delivered
4807                            Shanghai Calling
4808                           My Date with Drew
Name: title, Length: 4806, dtype: object