In [66]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [106]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/angelicablancogarcia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/angelicablancogarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/angelicablancogarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/angelicablancogarcia/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [69]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/angelicablancogarcia/nltk_data...


True

# EDA + Data Cleaning

In [2]:
data = pd.read_csv("movies.csv")
data.head()

Unnamed: 0,title,year,certificate,runtime,genre,rating,metascore,synopsis,director,votes,gross,cast1,cast2,cast3,cast4
0,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,84.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,2669470,$534.86M,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine
1,The Lord of the Rings: The Return of the King,2003,U,201 min,"Action, Adventure, Drama",9.0,94.0,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,1856911,$377.85M,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom
2,Inception,2010,UA,148 min,"Action, Adventure, Sci-Fi",8.8,74.0,A thief who steals corporate secrets through t...,Christopher Nolan,2368139,$292.58M,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe
3,The Lord of the Rings: The Fellowship of the Ring,2001,U,178 min,"Action, Adventure, Drama",8.8,92.0,A meek Hobbit from the Shire and eight compani...,Peter Jackson,1886353,$315.54M,Elijah Wood,Ian McKellen,Orlando Bloom,Sean Bean
4,The Lord of the Rings: The Two Towers,2002,UA,179 min,"Action, Adventure, Drama",8.8,87.0,While Frodo and Sam edge closer to Mordor with...,Peter Jackson,1676766,$342.55M,Elijah Wood,Ian McKellen,Viggo Mortensen,Orlando Bloom


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10067 entries, 0 to 10066
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        10067 non-null  object 
 1   year         10067 non-null  object 
 2   certificate  8292 non-null   object 
 3   runtime      10064 non-null  object 
 4   genre        10067 non-null  object 
 5   rating       10067 non-null  float64
 6   metascore    8049 non-null   float64
 7   synopsis     10067 non-null  object 
 8   director     10067 non-null  object 
 9   votes        10067 non-null  object 
 10  gross        7252 non-null   object 
 11  cast1        10064 non-null  object 
 12  cast2        10063 non-null  object 
 13  cast3        10057 non-null  object 
 14  cast4        10047 non-null  object 
dtypes: float64(2), object(13)
memory usage: 1.2+ MB


In [4]:
# Checking nan
data.isna().sum()

title             0
year              0
certificate    1775
runtime           3
genre             0
rating            0
metascore      2018
synopsis          0
director          0
votes             0
gross          2815
cast1             3
cast2             4
cast3            10
cast4            20
dtype: int64

In [5]:
#dropping duplicates --> There were no duplicates
data.drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10067 entries, 0 to 10066
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        10067 non-null  object 
 1   year         10067 non-null  object 
 2   certificate  8292 non-null   object 
 3   runtime      10064 non-null  object 
 4   genre        10067 non-null  object 
 5   rating       10067 non-null  float64
 6   metascore    8049 non-null   float64
 7   synopsis     10067 non-null  object 
 8   director     10067 non-null  object 
 9   votes        10067 non-null  object 
 10  gross        7252 non-null   object 
 11  cast1        10064 non-null  object 
 12  cast2        10063 non-null  object 
 13  cast3        10057 non-null  object 
 14  cast4        10047 non-null  object 
dtypes: float64(2), object(13)
memory usage: 1.2+ MB


In [6]:
# Checking unique titles, to see if it can be unique identifier
data["title"].nunique()

9765

In [7]:
# Checking what are the repeated titles
title_counts = data['title'].value_counts()
repeated_titles = title_counts[title_counts > 1]
repeated_titles_df = pd.DataFrame({'Title': repeated_titles.index, 'Count': repeated_titles.values})
repeated_titles_df

Unnamed: 0,Title,Count
0,A Star Is Born,4
1,Dracula,4
2,The Mummy,4
3,Halloween,3
4,Hamlet,3
...,...,...
276,Fantastic Four,2
277,The Rookie,2
278,The Invisible Man,2
279,The Sentinel,2


### Checking why althoug the title is the same, each row is a different film

In [8]:
filtered_data = data[data['title']== "A Star Is Born"]
filtered_data

Unnamed: 0,title,year,certificate,runtime,genre,rating,metascore,synopsis,director,votes,gross,cast1,cast2,cast3,cast4
4802,A Star Is Born,2018,A,136 min,"Drama, Music, Romance",7.6,88.0,A musician helps a young singer find fame as a...,Bradley Cooper,389747,$215.29M,Lady Gaga,Bradley Cooper,Sam Elliott,Greg Grunberg
6486,A Star Is Born,1954,U,154 min,"Drama, Musical, Romance",7.5,89.0,A film star helps a young singer and actress f...,George Cukor,18585,$14.93M,Judy Garland,James Mason,Jack Carson,Charles Bickford
6810,A Star Is Born,1937,,111 min,"Drama, Romance",7.3,77.0,A young woman comes to Hollywood with dreams o...,William A. Wellman,10148,$4.36M,Janet Gaynor,Fredric March,Adolphe Menjou,May Robson
8681,A Star Is Born,1976,U,139 min,"Drama, Music, Romance",6.1,59.0,A has-been rock star falls in love with a youn...,Frank Pierson,12120,$80.00M,Barbra Streisand,Kris Kristofferson,Gary Busey,Oliver Clark


### Dropping the columns that are not of our interest

In [9]:
data = data.drop(["certificate", "metascore", "gross", "cast3", "cast4"], axis = 1)
data

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2
0,The Dark Knight,2008,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,2669470,Christian Bale,Heath Ledger
1,The Lord of the Rings: The Return of the King,2003,201 min,"Action, Adventure, Drama",9.0,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,1856911,Elijah Wood,Viggo Mortensen
2,Inception,2010,148 min,"Action, Adventure, Sci-Fi",8.8,A thief who steals corporate secrets through t...,Christopher Nolan,2368139,Leonardo DiCaprio,Joseph Gordon-Levitt
3,The Lord of the Rings: The Fellowship of the Ring,2001,178 min,"Action, Adventure, Drama",8.8,A meek Hobbit from the Shire and eight compani...,Peter Jackson,1886353,Elijah Wood,Ian McKellen
4,The Lord of the Rings: The Two Towers,2002,179 min,"Action, Adventure, Drama",8.8,While Frodo and Sam edge closer to Mordor with...,Peter Jackson,1676766,Elijah Wood,Ian McKellen
...,...,...,...,...,...,...,...,...,...,...
10062,Dudley Do-Right,1999,77 min,"Comedy, Family, Romance",3.9,Inept Canadian mountie Dudley Do-Right chases ...,Hugh Wilson,10928,Brendan Fraser,Sarah Jessica Parker
10063,Tubelight,2017,136 min,"Drama, War",3.9,A story of two brothers set during the Sino-In...,Kabir Khan,20743,Salman Khan,Sohail Khan
10064,The Disappointments Room,2016,91 min,"Drama, Horror, Thriller",3.9,A mother and her young son release unimaginabl...,D.J. Caruso,10081,Kate Beckinsale,Mel Raido
10065,Material Girls,2006,98 min,"Comedy, Family, Romance",3.9,"Two wealthy sisters, both heiresses to their f...",Martha Coolidge,22415,Hilary Duff,Haylie Duff


In [10]:
!pip install pandas_profiling  # Run this from Jupytor notebook Ignore the warnings if any 
from pandas_profiling import ProfileReport #restart the kernel if throws error
ProfileReport(data)



  from pandas_profiling import ProfileReport #restart the kernel if throws error


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [11]:
# EDA Package
!pip install jupyter notebook
from ydata_profiling import ProfileReport
#ProfileReport
ProfileReport(data)





Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



### Filling the NaNs

In [12]:
data.isna().sum()

title       0
year        0
runtime     3
genre       0
rating      0
synopsis    0
director    0
votes       0
cast1       3
cast2       4
dtype: int64

In [13]:
data.loc[pd.isna(data["runtime"]), :]

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2
4032,Mantus,2014,,"Comedy, Drama, Thriller",4.3,Madame Mary is the mother of Mantus (Satan) an...,Enzo Zelocchi,28697,Enzo Zelocchi,Miryam Negrin
5564,My Little Princess,2010,,"Romance, Drama, Family",3.1,"A young Jewish man, Aaron, mid twenties, whos ...",Enzo Zelocchi,43565,Enzo Zelocchi,Charlotte Labadie
8666,No War,II 2022,,"Action, Drama",6.2,John a CIA contractor goes to Ukraine with his...,Enzo Zelocchi,16022,Enzo Zelocchi,Emilia Nimak


In [14]:
missing_runtime = {
    "Mantus": "120 min",
    "My Little Princess": "21 min",
    "No War": "19 min"
}

for title, runtime in missing_runtime.items():
    data.loc[data['title'] == title, 'runtime'] = runtime

In [15]:
# Checking that the replacement was correct
data.loc[data["title"]== "My Little Princess", :]

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2
5564,My Little Princess,2010,21 min,"Romance, Drama, Family",3.1,"A young Jewish man, Aaron, mid twenties, whos ...",Enzo Zelocchi,43565,Enzo Zelocchi,Charlotte Labadie


In [16]:
data.loc[pd.isna(data["cast1"]), :]

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2
5951,Kimetsu no Yaiba: Tsuzumi Yashiki Hen,2021,87 min,"Animation, Action, Fantasy",8.9,Tanjiro ventures to the south-southeast where ...,Haruo Sotozaki,15734,,
5957,Kimetsu no Yaiba: Natagumo Yama Hen,2020,138 min,"Animation, Action, Fantasy",8.8,Tanjiro teams up with Zenitsu and Inosuke to i...,Haruo Sotozaki,12076,,
6041,It's Such a Beautiful Day,2012,62 min,"Animation, Comedy, Drama",8.2,Bill struggles to put together his shattered p...,Don Hertzfeldt,14288,,


In [17]:
missing_cast1 = {
    "Kimetsu no Yaiba: Tsuzumi Yashiki Hen": "Akari Kito",
    "Kimetsu no Yaiba: Natagumo Yama Hen": "Natsuki Hanae",
    "It's Such a Beautiful Day": "Sara Cushman"
}

for title, cast1 in missing_cast1.items():
    data.loc[data['title'] == title, 'cast1'] = cast1


In [18]:
data.loc[data["title"]== "Kimetsu no Yaiba: Tsuzumi Yashiki Hen", :]

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2
5951,Kimetsu no Yaiba: Tsuzumi Yashiki Hen,2021,87 min,"Animation, Action, Fantasy",8.9,Tanjiro ventures to the south-southeast where ...,Haruo Sotozaki,15734,Akari Kito,


In [19]:
data.loc[pd.isna(data["cast2"]), :]

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2
596,All Is Lost,2013,106 min,"Action, Adventure, Drama",6.9,After a collision with a shipping container at...,J.C. Chandor,81353,Robert Redford,
5951,Kimetsu no Yaiba: Tsuzumi Yashiki Hen,2021,87 min,"Animation, Action, Fantasy",8.9,Tanjiro ventures to the south-southeast where ...,Haruo Sotozaki,15734,Akari Kito,
5957,Kimetsu no Yaiba: Natagumo Yama Hen,2020,138 min,"Animation, Action, Fantasy",8.8,Tanjiro teams up with Zenitsu and Inosuke to i...,Haruo Sotozaki,12076,Natsuki Hanae,
6041,It's Such a Beautiful Day,2012,62 min,"Animation, Comedy, Drama",8.2,Bill struggles to put together his shattered p...,Don Hertzfeldt,14288,Sara Cushman,


In [20]:
missing_cast2 = {
    "All Is Lost": "n.d.",
    "Kimetsu no Yaiba: Tsuzumi Yashiki Hen": "n.d.",
    "Kimetsu no Yaiba: Natagumo Yama Hen": "Akari Kito",
    "It's Such a Beautiful Day": "Don Hertzfeldt"
}

for title, cast2 in missing_cast2.items():
    data.loc[data['title'] == title, 'cast2'] = cast2

In [21]:
data.loc[data["title"]== "Kimetsu no Yaiba: Tsuzumi Yashiki Hen", :]

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2
5951,Kimetsu no Yaiba: Tsuzumi Yashiki Hen,2021,87 min,"Animation, Action, Fantasy",8.9,Tanjiro ventures to the south-southeast where ...,Haruo Sotozaki,15734,Akari Kito,n.d.


In [22]:
data.isna().sum()

title       0
year        0
runtime     0
genre       0
rating      0
synopsis    0
director    0
votes       0
cast1       0
cast2       0
dtype: int64

#### Changing year to a integer

In [48]:
data["year"] = data["year"].astype(str)
data["year"] = data["year"].str.replace("I ", "")
data["year"] = data["year"].str.replace("I", "")
data["year"] = data["year"].str.replace("V ", "")
data["year"] = data["year"].str.replace("V", "")
data["year"] = data["year"].str.replace("X ", "")
data["year"] = data["year"].str.replace("X", "")

In [49]:
data.loc[data["year"]== "I2015",:]

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2


In [54]:
data["year"] = data["year"].astype(int)

In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10067 entries, 0 to 10066
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   title     10067 non-null  object 
 1   year      10067 non-null  int64  
 2   runtime   10067 non-null  object 
 3   genre     10067 non-null  object 
 4   rating    10067 non-null  float64
 5   synopsis  10067 non-null  object 
 6   director  10067 non-null  object 
 7   votes     10067 non-null  object 
 8   cast1     10067 non-null  object 
 9   cast2     10067 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 786.6+ KB


#### Changing votes to a integer

In [58]:
data["votes"] = data["votes"].astype(str)
data["votes"] = data["votes"].str.replace(",", "")

In [59]:
data["votes"] = data["votes"].astype(int)

In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10067 entries, 0 to 10066
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   title     10067 non-null  object 
 1   year      10067 non-null  int64  
 2   runtime   10067 non-null  object 
 3   genre     10067 non-null  object 
 4   rating    10067 non-null  float64
 5   synopsis  10067 non-null  object 
 6   director  10067 non-null  object 
 7   votes     10067 non-null  int64  
 8   cast1     10067 non-null  object 
 9   cast2     10067 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 786.6+ KB


##### Separating genre into different columns

In [61]:
split_genres = data["genre"].str.split(",", expand=True)

# Create dummy variables for each genre
genre_dummies = pd.get_dummies(split_genres, prefix="", prefix_sep="")

# Concatenate the dummy variables with the original DataFrame
data = pd.concat([data, genre_dummies], axis=1)

In [62]:
data

Unnamed: 0,title,year,runtime,genre,rating,synopsis,director,votes,cast1,cast2,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,The Dark Knight,2008,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,2669470,Christian Bale,Heath Ledger,...,0,0,0,0,0,0,0,0,0,0
1,The Lord of the Rings: The Return of the King,2003,201 min,"Action, Adventure, Drama",9.0,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,1856911,Elijah Wood,Viggo Mortensen,...,0,0,0,0,0,0,0,0,0,0
2,Inception,2010,148 min,"Action, Adventure, Sci-Fi",8.8,A thief who steals corporate secrets through t...,Christopher Nolan,2368139,Leonardo DiCaprio,Joseph Gordon-Levitt,...,0,0,0,0,0,1,0,0,0,0
3,The Lord of the Rings: The Fellowship of the Ring,2001,178 min,"Action, Adventure, Drama",8.8,A meek Hobbit from the Shire and eight compani...,Peter Jackson,1886353,Elijah Wood,Ian McKellen,...,0,0,0,0,0,0,0,0,0,0
4,The Lord of the Rings: The Two Towers,2002,179 min,"Action, Adventure, Drama",8.8,While Frodo and Sam edge closer to Mordor with...,Peter Jackson,1676766,Elijah Wood,Ian McKellen,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10062,Dudley Do-Right,1999,77 min,"Comedy, Family, Romance",3.9,Inept Canadian mountie Dudley Do-Right chases ...,Hugh Wilson,10928,Brendan Fraser,Sarah Jessica Parker,...,0,0,0,0,1,0,0,0,0,0
10063,Tubelight,2017,136 min,"Drama, War",3.9,A story of two brothers set during the Sino-In...,Kabir Khan,20743,Salman Khan,Sohail Khan,...,0,0,0,0,0,0,0,0,0,0
10064,The Disappointments Room,2016,91 min,"Drama, Horror, Thriller",3.9,A mother and her young son release unimaginabl...,D.J. Caruso,10081,Kate Beckinsale,Mel Raido,...,0,0,0,0,0,0,0,1,0,0
10065,Material Girls,2006,98 min,"Comedy, Family, Romance",3.9,"Two wealthy sisters, both heiresses to their f...",Martha Coolidge,22415,Hilary Duff,Haylie Duff,...,0,0,0,0,1,0,0,0,0,0


In [63]:
data = data.drop(["genre"], axis = 1)
data

Unnamed: 0,title,year,runtime,rating,synopsis,director,votes,cast1,cast2,Action,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,The Dark Knight,2008,152 min,9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,2669470,Christian Bale,Heath Ledger,1,...,0,0,0,0,0,0,0,0,0,0
1,The Lord of the Rings: The Return of the King,2003,201 min,9.0,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,1856911,Elijah Wood,Viggo Mortensen,1,...,0,0,0,0,0,0,0,0,0,0
2,Inception,2010,148 min,8.8,A thief who steals corporate secrets through t...,Christopher Nolan,2368139,Leonardo DiCaprio,Joseph Gordon-Levitt,1,...,0,0,0,0,0,1,0,0,0,0
3,The Lord of the Rings: The Fellowship of the Ring,2001,178 min,8.8,A meek Hobbit from the Shire and eight compani...,Peter Jackson,1886353,Elijah Wood,Ian McKellen,1,...,0,0,0,0,0,0,0,0,0,0
4,The Lord of the Rings: The Two Towers,2002,179 min,8.8,While Frodo and Sam edge closer to Mordor with...,Peter Jackson,1676766,Elijah Wood,Ian McKellen,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10062,Dudley Do-Right,1999,77 min,3.9,Inept Canadian mountie Dudley Do-Right chases ...,Hugh Wilson,10928,Brendan Fraser,Sarah Jessica Parker,0,...,0,0,0,0,1,0,0,0,0,0
10063,Tubelight,2017,136 min,3.9,A story of two brothers set during the Sino-In...,Kabir Khan,20743,Salman Khan,Sohail Khan,0,...,0,0,0,0,0,0,0,0,0,0
10064,The Disappointments Room,2016,91 min,3.9,A mother and her young son release unimaginabl...,D.J. Caruso,10081,Kate Beckinsale,Mel Raido,0,...,0,0,0,0,0,0,0,1,0,0
10065,Material Girls,2006,98 min,3.9,"Two wealthy sisters, both heiresses to their f...",Martha Coolidge,22415,Hilary Duff,Haylie Duff,0,...,0,0,0,0,1,0,0,0,0,0


# Synopsis Text Preprocessing

1. Removing punctuation.
2. Remove Stopwords
3. Remove Additional Spaces and Digits.
4. And Lemmatize the Text.
5. Returns cleaned list.

In [70]:
#Text pre-processing
"""removes punctuation, stopwords, and returns a list of the remaining words, or tokens"""

import string
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Return the cleaned text as a list of words
    4. Remove words
    '''
    stemmer = WordNetLemmatizer()
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join([i for i in nopunc if not i.isdigit()])
    nopunc =  [word.lower() for word in nopunc.split() if word not in stopwords.words('english')]
    return [stemmer.lemmatize(word) for word in nopunc]

#testing the function with a sample text#
sample_text = "Hey There! This is a Sample review, which 123happens {blah}%456 to contain happened punctuations universal rights of right contained."
print(text_process(sample_text))

['hey', 'there', 'this', 'sample', 'review', 'happens', 'blah', 'contain', 'happened', 'punctuation', 'universal', 'right', 'right', 'contained']


In [75]:
tokens = text_process(data["synopsis"])

In [100]:
cleaned_tokens = []
for token in tokens:
    cleaned_token = re.sub(r'[^\w\s]', '', token)  # Remove punctuation
    words = re.split(r'\s+', cleaned_token)       # Split into separate words
    cleaned_tokens.extend(words)
tokens=cleaned_tokens

In [102]:
# Checking the total of unique words
unique_tokens = set(tokens)
len(unique_tokens)

27219

In [101]:
wordfreq = {}
for word in tokens:
    if (word not in wordfreq.keys() ): ## first time appearnce in the sentence
        wordfreq[word] = 1 # We initialize the corresponding counter
    else: ## if the world is already existed in the dictionalry 
        wordfreq[word] += 1 # We increase the corresponding counter
wordfreq

{'when': 115,
 'menace': 11,
 'known': 78,
 'joker': 4,
 'wreaks': 6,
 'havoc': 20,
 'chaos': 31,
 'people': 219,
 'gotham': 13,
 'batman': 10,
 'must': 610,
 'accept': 17,
 'one': 604,
 'greatest': 29,
 'psychological': 13,
 'physical': 25,
 'test': 37,
 'ability': 35,
 'fight': 228,
 'injusticegandalf': 1,
 'aragorn': 1,
 'lead': 192,
 'world': 475,
 'men': 157,
 'saurons': 3,
 'army': 109,
 'draw': 13,
 'gaze': 2,
 'frodo': 2,
 'sam': 18,
 'approach': 10,
 'mount': 8,
 'doom': 8,
 'ringa': 3,
 'thief': 79,
 'steal': 70,
 'corporate': 22,
 'secret': 238,
 'use': 59,
 'dreamsharing': 1,
 'technology': 15,
 'given': 42,
 'inverse': 1,
 'task': 35,
 'planting': 1,
 'idea': 22,
 'mind': 27,
 'ceo': 8,
 'tragic': 40,
 'past': 165,
 'may': 100,
 'project': 23,
 'team': 305,
 'disastera': 6,
 'meek': 3,
 'hobbit': 2,
 'shire': 1,
 'eight': 18,
 'companion': 19,
 'set': 322,
 'journey': 147,
 'destroy': 60,
 'powerful': 84,
 'ring': 27,
 'save': 266,
 'middleearth': 1,
 'dark': 103,
 'lord':

In [103]:
len(wordfreq) # 

27219

In [104]:
sorted_wordfreq = dict(sorted(wordfreq.items(), key=lambda item: item[1], reverse=True))
sorted_wordfreq


{'young': 1158,
 'life': 1115,
 'find': 990,
 'new': 849,
 'man': 846,
 'woman': 738,
 'family': 616,
 'must': 610,
 'one': 604,
 'love': 573,
 'friend': 516,
 'take': 509,
 'two': 504,
 'get': 497,
 'story': 478,
 'world': 475,
 'girl': 452,
 'group': 436,
 'go': 411,
 'help': 391,
 'war': 384,
 'year': 371,
 '': 370,
 'try': 370,
 'becomes': 344,
 'school': 342,
 'set': 322,
 'the': 322,
 'mysterious': 315,
 'father': 315,
 'boy': 313,
 'fall': 306,
 'team': 305,
 'wife': 301,
 'home': 299,
 'come': 290,
 'murder': 289,
 'son': 287,
 'city': 286,
 'make': 281,
 'turn': 279,
 'american': 271,
 'back': 270,
 'meet': 267,
 'save': 266,
 'time': 264,
 'begin': 259,
 'daughter': 259,
 'police': 257,
 'town': 257,
 'york': 247,
 'return': 247,
 'order': 245,
 'death': 245,
 'former': 242,
 'secret': 238,
 'mother': 233,
 'day': 233,
 'become': 233,
 'high': 232,
 'fight': 228,
 'couple': 223,
 'student': 223,
 'small': 220,
 'people': 219,
 'discovers': 218,
 'attempt': 217,
 'work': 215,


# Vectorization

In [108]:
from nltk.corpus import stopwords

stop_words = list(stopwords.words('english')) 


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer

tidf = TfidfVectorizer(max_features = 30000, ngram_range = (1,3), analyzer = 'word',
                       stop_words = stop_words )

In [129]:
synopsis_df = data["synopsis"]
X = tidf.fit_transform(synopsis_df)

In [130]:
X

<10067x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 189262 stored elements in Compressed Sparse Row format>