In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
# Load the IMDB top-1000 dataset; the path is relative to the notebook's
# working directory.
DATA_PATH = 'imdb_top_1000.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [61]:
# select the relevant columns
# Only the plot summary (model input) and the genre string (label source)
# are used by the cells that follow.
selected_columns = ['Overview', 'Genre']
df_select = df[selected_columns]
df_select.head()

Unnamed: 0,Overview,Genre
0,Two imprisoned men bond over a number of years...,Drama
1,An organized crime dynasty's aging patriarch t...,"Crime, Drama"
2,When the menace known as the Joker wreaks havo...,"Action, Crime, Drama"
3,The early life and career of Vito Corleone in ...,"Crime, Drama"
4,A jury holdout attempts to prevent a miscarria...,"Crime, Drama"


In [84]:
# select unique genres
# Each 'Genre' value is a ', '-separated string such as "Crime, Drama".
# A set keeps every genre exactly once (even when a single row repeats one),
# and dropna() skips rows with a missing genre, which have no .split().
genre_set = set()
for row in df_select['Genre'].dropna():
    genre_set.update(row.split(', '))

# Sorted list: its order defines the order of the one-hot columns later on.
unique_genres = sorted(genre_set)
print(unique_genres)

['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


In [87]:
# One hot encode the genres
def one_hot_encode(genres, genre):
    """Return 1 if ``genre`` is contained in ``genres``, otherwise 0.

    Containment is decided with the ``in`` operator, so ``genres`` may be any
    container that supports it (the notebook passes a list of genre names).
    """
    return int(genre in genres)
    
# Work on a copy so df_select keeps the raw genre strings.
df_data = df_select.copy()

# Turn "Crime, Drama" into ['Crime', 'Drama']; missing values stay NaN.
df_data['Genre'] = df_data['Genre'].str.split(', ')
# Drop *rows* that have a missing value. axis=1 would drop whole columns, so
# a single NaN would remove 'Overview' or 'Genre' entirely and the lookups
# below would fail with a KeyError.
df_data = df_data.dropna(axis=0)

# One 0/1 indicator column per genre, in the order of unique_genres.
for genre in unique_genres:
    df_data[genre] = df_data['Genre'].apply(lambda x: one_hot_encode(x, genre))

df_data.head()

Unnamed: 0,Overview,Genre,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,Two imprisoned men bond over a number of years...,[Drama],0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,An organized crime dynasty's aging patriarch t...,"[Crime, Drama]",0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,When the menace known as the Joker wreaks havo...,"[Action, Crime, Drama]",1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,The early life and career of Vito Corleone in ...,"[Crime, Drama]",0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,A jury holdout attempts to prevent a miscarria...,"[Crime, Drama]",0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
# drop unnecessary 'Genre' column
# Rebind instead of mutating in place, and ignore an already-removed column,
# so re-running this cell does not raise a KeyError.
df_data = df_data.drop(columns=['Genre'], errors='ignore')
df_data.head()

Unnamed: 0,Overview,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,Two imprisoned men bond over a number of years...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,An organized crime dynasty's aging patriarch t...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,When the menace known as the Joker wreaks havo...,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The early life and career of Vito Corleone in ...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A jury holdout attempts to prevent a miscarria...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
# create corpus for CountVectorizer
# One document per row, in the row order of df_data['Overview'].
corpus = list(df_data['Overview'])

In [100]:
# fit the vectorizer
# Learn the vocabulary from the overview corpus, using scikit-learn's
# built-in English stop-word list. fit() returns the estimator itself, so the
# fitted vectorizer can be bound directly and displayed as the cell output.
vectorizer = CountVectorizer(stop_words='english').fit(corpus)
vectorizer

In [92]:
# TODO: vectorize function (unfinished stub, left commented out):
#def vectorize_text(text):
#   return 
    

In [102]:
# vectorize the input text
# Each overview is transformed on its own as a one-document batch, so every
# cell of the new 'Vectorized' column holds a dense 2-D array with one row.
overview_vectors = df_data['Overview'].apply(
    lambda overview: vectorizer.transform([overview]).toarray()
)

# Attach the vectors to a copy so df_data itself is left unchanged.
df_vectorize = df_data.copy()
df_vectorize['Vectorized'] = overview_vectors
df_vectorize.head()

Unnamed: 0,Overview,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,...,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Vectorized
0,Two imprisoned men bond over a number of years...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,An organized crime dynasty's aging patriarch t...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,When the menace known as the Joker wreaks havo...,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,The early life and career of Vito Corleone in ...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,A jury holdout attempts to prevent a miscarria...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [104]:
# drop unnecessary column 'Overview'
# Rebind instead of mutating in place, and ignore an already-removed column,
# so re-running this cell does not raise a KeyError.
df_vectorize = df_vectorize.drop(columns=['Overview'], errors='ignore')
df_vectorize.head()

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,...,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Vectorized
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
