In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
from sklearn.metrics.pairwise import linear_kernel


In [2]:
df = pd.read_csv('mini_movies.csv')
df.shape

(4803, 20)

In [3]:
df.isnull().sum()


budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [4]:
df['overview']=df['overview'].fillna('')


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime as dt

s=dt.now()

vectorizer = TfidfVectorizer()
overview_tfidf = vectorizer.fit_transform(df['overview'])
print("Shape",overview_tfidf.shape)

print("Time taken to run cell:",dt.now() - s)

Shape (4803, 21262)
Time taken to run cell: 0:00:00.254849


In [6]:
similarity_tfidf  = linear_kernel(overview_tfidf,overview_tfidf)
indextitle = pd.Series(df.index, index=df['title']).drop_duplicates()

In [7]:
   def tfidf_based_recommendation(title):
    
    id = indextitle[title]    
    
    similarity_score = list(enumerate(similarity_tfidf[id]))
    
    #pair_distance = pairwise_distances(overview_bow,overview_bow[id])
    indices = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    indices = indices[1:10]
    
    movie_indices = [i[0] for i in indices]
#     print(movie_indices)
    df_indices = list(df.index[movie_indices])
    print("Similar movie of {} are: \n".format(df["title"].loc[id]))
    for i in range(0,len(indices)):
        
        print("{}".format(df['title'].loc[df_indices[i]]))

In [8]:
indextitle

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [9]:
tfidf_based_recommendation('Avatar')


Similar movie of Avatar are: 

Apollo 18
Tears of the Sun
The American
The Inhabited Island
The Matrix
Blood and Chocolate
Semi-Pro
The Adventures of Pluto Nash
The Book of Life


# COUNT VECTORIZER

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
title_bow = vectorizer.fit_transform(df['overview'])
df1 = pd.DataFrame(title_bow.toarray(),columns = vectorizer.get_feature_names())
df1.head()

Unnamed: 0,00,000,007,07am,10,100,1000,101,108,10th,...,zuckerberg,zula,zuzu,zyklon,æon,éloigne,émigré,été,única,über
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
similarity  = linear_kernel(title_bow,title_bow)

In [12]:
indextitle = pd.Series(df.index, index=df['title']).drop_duplicates()
print(indextitle.head())
print(indextitle.size)

title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
dtype: int64
4803


In [13]:
def bag_of_words(title):
    
    id = indextitle[title]    
    
    similarity_score = list(enumerate(similarity[id]))
    print(pd.DataFrame(similarity_score))
    
    indices = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    indices = indices[1:10]
    
    movie_indices = [i[0] for i in indices]
    
    df_indices = list(df.index[movie_indices])
    
    print("Similar movie of {} are: \n".format(df["title"].loc[id]))
    
    for i in range(0,len(indices)):
        print("{}".format(df['title'].loc[df_indices[i]]))

In [14]:
bag_of_words('Toy Story')


         0     1
0        0   9.0
1        1  11.0
2        2  11.0
3        3  18.0
4        4  23.0
...    ...   ...
4798  4798  27.0
4799  4799   6.0
4800  4800  24.0
4801  4801  17.0
4802  4802  14.0

[4803 rows x 2 columns]
Similar movie of Toy Story are: 

The Midnight Meat Train
The Work and the Glory II: American Zion
Semi-Pro
Toy Story
Roadside Romeo
The Little Ponderosa Zoo
Thank You for Smoking
The Book of Mormon Movie, Volume 1: The Journey
Once in a Lifetime: The Extraordinary Story of the New York Cosmos


In [15]:
bag_of_words('Roadside Romeo')


         0      1
0        0   54.0
1        1   85.0
2        2   78.0
3        3  141.0
4        4  131.0
...    ...    ...
4798  4798  149.0
4799  4799   27.0
4800  4800  140.0
4801  4801  103.0
4802  4802  100.0

[4803 rows x 2 columns]
Similar movie of Roadside Romeo are: 

The Midnight Meat Train
Gladiator
The Work and the Glory II: American Zion
Semi-Pro
The Book of Mormon Movie, Volume 1: The Journey
The Thief and the Cobbler
Once in a Lifetime: The Extraordinary Story of the New York Cosmos
The Little Ponderosa Zoo
Thank You for Smoking


# word to vector

In [16]:
from tqdm import tqdm
overview_list = []

for sent in tqdm(df['overview']):
    overview_list.append(sent.split())

100%|██████████████████████████████████████████████████████████████████████████| 4803/4803 [00:00<00:00, 150498.23it/s]


In [17]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings("ignore")

w2vmodel=Word2Vec(overview_list,min_count=5,size=50,workers=4)
w2v_words = list(w2vmodel.wv.vocab)
print(len(w2v_words))

5598


In [18]:
overview_vector=[]
for sent in tqdm(df['overview']):
    sent_vec=np.zeros(50)
    count=0
    for word in sent:
        if word in w2v_words:
            vec = w2vmodel.wv[word]
            sent_vec+=vec
            count+=1
    if count !=0:
        sent_vec/=count
    overview_vector.append(sent_vec)

100%|██████████████████████████████████████████████████████████████████████████████| 4803/4803 [01:48<00:00, 44.17it/s]


In [14]:
similarity_w2vec  = linear_kernel(overview_vector,overview_vector)


In [15]:
def w2v_based_recommendation(title):
    
    id = indextitle[title]    
    
    similarity_score = list(enumerate(similarity_w2vec[id]))
    
    #pair_distance = pairwise_distances(overview_bow,overview_bow[id])
    indices = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    indices = indices[1:10]
    
    movie_indices = [i[0] for i in indices]
#     print(movie_indices)
    df_indices = list(df.index[movie_indices])
    print("Similar movie of {} are: \n".format(df["title"].loc[id]))
    for i in range(0,len(indices)):
        
        print("{}".format(df['title'].loc[df_indices[i]]))

In [17]:
w2v_based_recommendation('Avatar')


Similar movie of Avatar are: 

Jesus' Son
Harrison Montgomery
Gone with the Wind
Basic
Good bye, Lenin!
The Man
The Island of Dr. Moreau
The Great Debaters
Torn Curtain


In [34]:
w2v_based_recommendation('Toy Story')

Similar movie of Toy Story are: 

Jesus' Son
Harrison Montgomery
Gone with the Wind
Basic
The Man
Good bye, Lenin!
The Island of Dr. Moreau
The Great Debaters
Torn Curtain
