In [2]:
import numpy as np
import pandas as pd

In [5]:
# Load the movie-plots CSV; the path is relative, so the file must sit in the
# notebook's working directory.
df = pd.read_csv('wiki_movie_plots_deduped.csv')
df.head(3)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."


In [6]:
len(df)

34886

In [7]:
np.unique(df['Origin/Ethnicity'])

array(['American', 'Assamese', 'Australian', 'Bangladeshi', 'Bengali',
       'Bollywood', 'British', 'Canadian', 'Chinese', 'Egyptian',
       'Filipino', 'Hong Kong', 'Japanese', 'Kannada', 'Malayalam',
       'Malaysian', 'Maldivian', 'Marathi', 'Punjabi', 'Russian',
       'South_Korean', 'Tamil', 'Telugu', 'Turkish'], dtype=object)

In [8]:
len(df[df['Origin/Ethnicity'] == 'American'])

17377

In [9]:
len(df[df['Origin/Ethnicity'] == 'British'])

3670

In [10]:
# Restrict the corpus to movies whose origin is American or British.
df1 = df.loc[df['Origin/Ethnicity'] == 'American']
df2 = df.loc[df['Origin/Ethnicity'] == 'British']

# Stack the American rows first, then the British ones, renumbering rows from 0.
df_final = pd.concat((df1, df2), ignore_index=True)

In [11]:
df_final.head(3)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."


In [12]:
len(df_final)

21047

In [13]:
# Keep only the columns the recommender needs. The explicit .copy() makes
# df_final an independent frame, so assigning a new column to it afterwards
# does not raise pandas' SettingWithCopyWarning.
df_final = df_final[['Title','Plot']].copy()

In [14]:
df_final.head(5)

Unnamed: 0,Title,Plot
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,The earliest known adaptation of the classic f...


In [53]:
df_final['Plot']

0        A bartender is working at a saloon, serving dr...
1        The moon, painted with a smiling face hangs ov...
2        The film, just over a minute long, is composed...
3        Lasting just 61 seconds and consisting of two ...
4        The earliest known adaptation of the classic f...
                               ...                        
21042    In 1934, famous Belgian detective Hercule Poir...
21043    Paddington, having settled with the Brown fami...
21044    ‘Lady’ Sandra Abbott (Imelda Staunton) discove...
21045    In 1973, 16-year-old John Paul Getty III (Paul...
21046    Olivia, a career lawyer in her 40's, feels rea...
Name: Plot, Length: 21047, dtype: object

In [25]:
## Preprocessing using NLP
# Fetch the NLTK resources used by the tokenizing / tagging / lemmatizing code
# further down. When a package is already present the call only logs that it
# is up to date.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
import nltk
nltk.download('punkt')  # sentence tokenizer
nltk.download('averaged_perceptron_tagger')  # POS tagger
nltk.download('wordnet')  # lemmatizer(database)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# English stop words held as a set; preprocess_sentences tests each lemma
# for membership in it.
stop_words = set(stopwords.words('english'))
print(stop_words)

{'then', 'that', "that'll", 'themselves', 'them', 'now', "should've", 'down', 'by', 'you', 'is', 'very', 'only', 'for', 's', 'with', 'their', 'theirs', 'those', 'himself', 'under', 'they', 'y', 'so', 'too', 'between', "you'd", 'him', 'itself', 'didn', 'will', "needn't", 't', "it's", 'until', 'as', 'there', 'each', 'just', 'its', "don't", 'some', "isn't", 'needn', 'does', 'below', 'most', 'mightn', 'over', 'ain', 'wasn', 'more', 'has', 'no', 'i', 'aren', 'about', 'and', 'doesn', 'other', 'been', "wouldn't", "mightn't", "mustn't", "you're", 'our', 'during', 'the', 'myself', 'off', 'can', 'isn', 'own', 'here', 'how', 'o', 'my', 'why', "haven't", 'me', 'from', 'few', 'having', "didn't", 'doing', 'all', 'had', 'what', 'd', 're', 'when', 'who', 'wouldn', 'through', 'yourselves', 'on', 'where', 'should', 'out', 'before', 've', 'do', 'such', "hasn't", 'whom', 'against', 'be', 'up', "won't", 'but', 'weren', 'hadn', 'we', 'll', 'yours', 'while', 'which', 'yourself', 'm', 'hers', 'once', 'a', "do

In [28]:
# POS tags treated as verbs: a token whose nltk.pos_tag tag is in this set is
# lemmatized with pos='v' in preprocess_sentences.
VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

In [32]:
from nltk.tokenize import word_tokenize

In [61]:
from dataclasses import replace
from re import T
from tempfile import tempdir
from typing import final


def preprocess_sentences(text):
    """Normalize one plot summary into a space-separated string of lemmas.

    The text is lower-cased, tokenized and POS-tagged with NLTK. Tokens tagged
    as verbs (see ``VERB_CODES``) are lemmatized as verbs; every other token is
    lemmatized with the lemmatizer's default part of speech. A lemma is kept
    only if it is not a stop word and consists solely of alphabetic
    characters, so numbers, punctuation and contraction fragments such as
    ``n't`` or ``'s`` are dropped.

    Args:
        text: a single plot as a string. Any non-string value (e.g. ``None``
            or the float NaN of a missing cell) is treated as an empty plot.

    Returns:
        str: the kept lemmas joined by single spaces; ``""`` if none are kept.
    """
    # A missing cell has no .lower(); map it to an empty document instead of
    # aborting a whole DataFrame.apply() run.
    if not isinstance(text, str):
        return ""

    words = nltk.word_tokenize(text.lower())

    kept = []
    # pos_tag returns (token, tag) pairs in token order.
    for word, tag in nltk.pos_tag(words):
        if tag in VERB_CODES:
            lemmatized = lemmatizer.lemmatize(word, pos='v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        if lemmatized not in stop_words and lemmatized.isalpha():
            kept.append(lemmatized)

    # No contraction expansion is done on the joined string: the isalpha()
    # filter above has already removed every token containing an apostrophe,
    # so replacements such as "n't" -> "not" could never match.
    return " ".join(kept)

        

In [62]:
# Run preprocess_sentences on every plot and store the cleaned text in a new column.
df_final['processed_plot'] = df_final['Plot'].apply(preprocess_sentences)

In [63]:
#words = nltk.word_tokenize(text)
#print(words)

In [64]:
#tags = nltk.pos_tag(words)
#tags[:5]

In [65]:
#for i,word in enumerate(words):
#    print(i,word)

In [66]:
df_final.head(5)

Unnamed: 0,Title,Plot,processed_plot
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr...",bartender work saloon serve drink customer fil...
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov...",moon paint smile face hang park night young co...
2,The Martyred Presidents,"The film, just over a minute long, is composed...",film minute long compose two shot first girl s...
3,"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...,last second consist two shot first shot set wo...
4,Jack and the Beanstalk,The earliest known adaptation of the classic f...,earliest known adaptation classic fairytale fi...


In [68]:
### tf-idf vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizing movie plots using Tf- idf
# The vectorizer is used with its default settings; fit_transform both learns
# the vocabulary from the processed plots and returns their TF-IDF matrix
# (one row per row of df_final).
tfidfvec = TfidfVectorizer()
tfidf_movieid = tfidfvec.fit_transform(df_final['processed_plot'])

# Prints the stored entries as "(row, column)  weight" lines.
print(tfidf_movieid)

  (0, 41188)	0.04634704931499867
  (0, 23684)	0.1468986210808462
  (0, 52331)	0.0666258277784378
  (0, 2886)	0.07781145134640544
  (0, 56016)	0.12220057794453736
  (0, 24208)	0.08225124677721743
  (0, 78178)	0.09686491882482405
  (0, 64455)	0.2373880269323476
  (0, 68079)	0.1596013770939067
  (0, 59753)	0.16122349646456527
  (0, 11450)	0.11974965521177468
  (0, 8965)	0.06664892759943154
  (0, 47646)	0.13460473251216654
  (0, 25647)	0.20843836365669768
  (0, 66765)	0.13199645042648225
  (0, 5148)	0.0989847442256418
  (0, 79905)	0.13943959091595828
  (0, 6094)	0.05867821377061008
  (0, 30206)	0.15619074374948191
  (0, 31885)	0.06905494102367965
  (0, 21257)	0.12464085646561647
  (0, 24130)	0.10003008204731886
  (0, 31666)	0.12027494408301305
  (0, 57672)	0.08942092143018524
  (0, 3781)	0.11400862927243287
  :	:
  (21046, 60687)	0.11163073728486274
  (21046, 49005)	0.10248605516144894
  (21046, 24869)	0.11303355441197084
  (21046, 80529)	0.13565865500079005
  (21046, 41047)	0.143396706560

In [69]:
# Pairwise cosine similarity between all movie plot vectors.
from sklearn.metrics.pairwise import cosine_similarity

# Passing a single matrix compares it with itself, giving one row and one
# column per movie.
cos_sim = cosine_similarity(tfidf_movieid)

print(cos_sim)

[[1.         0.02868374 0.00821845 ... 0.01498438 0.00623665 0.00442621]
 [0.02868374 1.         0.03713543 ... 0.         0.00551872 0.01444205]
 [0.00821845 0.03713543 1.         ... 0.01006554 0.00616888 0.01287415]
 ...
 [0.01498438 0.         0.01006554 ... 1.         0.00974492 0.01369473]
 [0.00623665 0.00551872 0.00616888 ... 0.00974492 1.         0.01066113]
 [0.00442621 0.01444205 0.01287415 ... 0.01369473 0.01066113 1.        ]]


In [70]:
df_final.head(3)

Unnamed: 0,Title,Plot,processed_plot
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr...",bartender work saloon serve drink customer fil...
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov...",moon paint smile face hang park night young co...
2,The Martyred Presidents,"The film, just over a minute long, is composed...",film minute long compose two shot first girl s...


In [72]:
# Use the movie title as the row index. The guard keeps the cell re-runnable:
# once 'Title' has become the index it is no longer a column, and calling
# set_index('Title') a second time would raise KeyError.
if 'Title' in df_final.columns:
    df_final = df_final.set_index('Title')   #setting title as index

df_final.head(3)

Unnamed: 0_level_0,Plot,processed_plot
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr...",bartender work saloon serve drink customer fil...
Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov...",moon paint smile face hang park night young co...
The Martyred Presidents,"The film, just over a minute long, is composed...",film minute long compose two shot first girl s...


In [76]:
# Lookup table from integer row position to movie title, in df_final's row
# order; used to translate a title into its row of the similarity matrix.
indices = pd.Series(df_final.index)
print(indices)

0                  Kansas Saloon Smashers
1           Love by the Light of the Moon
2                 The Martyred Presidents
3        Terrible Teddy, the Grizzly King
4                  Jack and the Beanstalk
                       ...               
21042        Murder on the Orient Express
21043                        Paddington 2
21044                   Finding Your Feet
21045          All the Money in the World
21046                     You, Me and Him
Name: Title, Length: 21047, dtype: object


In [81]:
indices[indices == "Harry Potter and the Chamber of Secrets"].index[0]

14063

In [93]:
# Resolve the row position from the title instead of hard-coding the number
# shown by an earlier cell, so this cell stays correct if the data or its row
# order changes.
hp_index = indices[indices == "Harry Potter and the Chamber of Secrets"].index[0]
sim_scr = cos_sim[hp_index]  # cosine similarity of harry potter and the chamber of secrets
# Sort so the most similar movies come first; the Series index keeps each
# movie's row position.
sim_scr = pd.Series(sim_scr).sort_values(ascending = False)

In [94]:
sim_scr

14063    1.000000
13862    0.555194
20118    0.555194
15845    0.489120
20410    0.489120
           ...   
18396    0.000000
5249     0.000000
17471    0.000000
1653     0.000000
562      0.000000
Length: 21047, dtype: float64

In [99]:
list(sim_scr.iloc[1:11].index)

[13862, 20118, 15845, 20410, 15648, 16040, 20477, 15225, 14667, 14470]

In [100]:
# Building recommendation function which recommends top 10 similar movies


def recommendations(title, cosine_sim = cos_sim, top_n = 10):
    """Return the titles of the movies whose plots are most similar to ``title``.

    Args:
        title: exact movie title to look up in ``indices``. If several movies
            share the title, the first match is used.
        cosine_sim: similarity matrix indexed by row position; row ``i`` must
            describe the movie at position ``i`` of ``indices`` / ``df_final``.
            Defaults to the global ``cos_sim``.
        top_n: how many recommendations to return (default 10).

    Returns:
        list: up to ``top_n`` titles, most similar first.

    Raises:
        IndexError: if ``title`` does not occur in ``indices``.
    """
    matches = indices[indices == title].index
    if len(matches) == 0:
        # Same exception type the bare ``.index[0]`` lookup raised, but with a
        # message that names the missing title.
        raise IndexError(f"no movie titled {title!r} found in indices")
    index = matches[0]

    # Read from the matrix that was passed in, not the global one.
    similarity_scores = pd.Series(cosine_sim[index])

    # Remove the query movie by position instead of assuming it sorts first:
    # a movie with an identical plot also scores 1.0, and a plot with an empty
    # vector scores 0 against itself.
    similarity_scores = similarity_scores.drop(index)

    # nlargest returns the best scores in descending order, keeping the
    # earliest row on ties.
    top_movies = list(similarity_scores.nlargest(top_n).index)

    return list(df_final.index[top_movies])

In [101]:

recommendations("Harry Potter and the Chamber of Secrets")

["Harry Potter and the Sorcerer's Stone",
 "Harry Potter and the Philosopher's Stone",
 'Harry Potter and the Deathly Hallows: Part 1',
 'Harry Potter and the Deathly Hallows: Part I',
 'Harry Potter and the Half-Blood Prince',
 'Harry Potter and the Deathly Hallows: Part 2',
 'Harry Potter and the Deathly Hallows: Part II',
 'Harry Potter and the Order of the Phoenix',
 'Harry Potter and the Goblet of Fire',
 'Harry Potter and the Prisoner of Azkaban']

In [102]:

recommendations("Ice Age")

['Ice Age: The Meltdown',
 'Ice Age: Dawn of the Dinosaurs',
 'The Wrong Man',
 'Ice Age: Continental Drift',
 'The Buttercup Chain',
 'Ice Age: Collision Course',
 'Runaway Train',
 'Corrina, Corrina',
 'Sid and Nancy',
 'Zorro, the Gay Blade']

In [103]:

recommendations("Blackmail")

['Checkpoint',
 'Odds Against Tomorrow',
 'The Beast with Five Fingers',
 'Fruitvale Station',
 'The Exile',
 'The Black Swan',
 'Small Town Gay Bar',
 'Eye of the Cat',
 'Blown Away',
 'Brenda Starr, Reporter']