
# Movie Data Pre-Processing

### Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

pd.set_option("display.max_columns",350)

In [2]:
df = pd.read_csv('./data/df_to_preprocess.csv')
df.drop(columns = 'Unnamed: 0', inplace = True)
print(df.shape)
df.head()

(13173, 41)


Unnamed: 0,rt_id,critic_name,publisher,review_type,review_date,review_content,title,plot,critics_consensus,content_rating,genre,director,writer,cast,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,critic_id,review_score,year,imdb_title,country,imdb_plot,imdb_score,imdb_count,review_negative,review_neutral,review_positive,review_compound,consensus_negative,consensus_neutral,consensus_positive,consensus_compound,score
0,m/0814255,Ben McEachen,Sunday Mail (Australia),1,2010-02-09,Whether audiences will get behind The Lightnin...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,0,49.0,149.0,0,53.0,254421.0,1022,70.0,2010,tt0814255,"UK, Canada, USA",A teenager discovers he's the descendant of a ...,59.0,174198.0,0.08,0.727,0.193,0.705,0.0,0.59,0.41,0.9274,53.95
1,m/0814255,Nick Schager,Slant Magazine,0,2010-02-10,Harry Potter knockoffs don't come more transpa...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,0,49.0,149.0,0,53.0,254421.0,7475,25.0,2010,tt0814255,"UK, Canada, USA",A teenager discovers he's the descendant of a ...,59.0,174198.0,0.0,1.0,0.0,0.0,0.0,0.59,0.41,0.9274,53.95
2,m/0878835,Erik Childress,eFilmCritic.com,1,2010-01-31,Holofcener always gives us more to chew on tha...,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2012-09-04,90.0,Sony Pictures Classics,1,87.0,142.0,1,64.0,11574.0,3177,75.0,2010,tt0878835,USA,"In New York City, a husband and wife butt head...",66.0,10928.0,0.0,0.928,0.072,0.4767,0.0,0.885,0.115,0.5023,73.53
3,m/0878835,Richard Mowe,Boxoffice Magazine,1,2010-03-24,"The tone is low-key but very funny. Yet, at th...",Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2012-09-04,90.0,Sony Pictures Classics,1,87.0,142.0,1,64.0,11574.0,8239,60.0,2010,tt0878835,USA,"In New York City, a husband and wife butt head...",66.0,10928.0,0.0,0.816,0.184,0.6474,0.0,0.885,0.115,0.5023,73.53
4,m/10,Scott Weinberg,eFilmCritic.com,1,2002-07-25,Obvious but entertaining portrayal of midlife ...,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,2014-07-24,122.0,Waner Bros.,1,67.0,24.0,0,53.0,14684.0,8874,80.0,1979,tt0078721,USA,A Hollywood composer goes through a mid-life c...,61.0,14946.0,0.486,0.332,0.182,-0.8126,0.262,0.547,0.191,-0.4676,60.03


In [5]:
# Making dataset smaller to avoid pre-processing and modeling issues.
df = df.groupby('title').head(1)
df.shape

(6587, 41)

#### One Hot Encode Content Rating

In [6]:
rating_dummy = pd.get_dummies(df['content_rating'])
rating_dummy.drop(columns = 'NC17', inplace = True)
print(rating_dummy.shape)
rating_dummy.head()

(6587, 5)


Unnamed: 0,G,NR,PG,PG-13,R
0,0,0,1,0,0
2,0,0,0,0,1
4,0,0,0,0,1
6,0,1,0,0,0
8,0,0,0,0,1


In [7]:
# Concatenating df and rating dummy
df = pd.concat([df,rating_dummy], axis = 1)
df.drop(columns = 'content_rating', inplace = True)

CountVectorizer EDA

In [8]:
# Instantiating cvec
cvec = CountVectorizer(stop_words = 'english')

In [9]:
# Removing duplicates
df_plot  = df[['plot']].drop_duplicates(keep = 'first')
print(df_plot.shape)
df_plot.head()

(6587, 1)


Unnamed: 0,plot
0,"Always trouble-prone, the life of teenager Per..."
2,Kate (Catherine Keener) and her husband Alex (...
4,"A successful, middle-aged Hollywood songwriter..."
6,"While on vacation in London, Canadian Richard ..."
8,"Fico Fellove (Andy Garcia), an apolitical Hava..."


In [10]:
# CVEC Fit Transform Movie Info
df_plot = df_plot['plot']
cvec.fit(df_plot)
df_plot = cvec.transform(df_plot).toarray()
plot_df = pd.DataFrame(df_plot, columns = cvec.get_feature_names())



In [34]:
# Movie Info Top 20 Words
plot_df.sum().sort_values(ascending = False).head(20)

life       1211
new        1162
young       906
man         816
family      796
world       696
father      633
finds       625
love        601
wife        579
john        570
soon        565
help        552
home        552
woman       546
old         530
time        524
friends     522
way         493
mother      487
dtype: int64

Top 5 words: Life, new, young, man, family

In [11]:
# Removing duplicates
df_critic  = df[['critics_consensus']].drop_duplicates(keep = 'first')
print(df_critic.shape)
df_critic.head()

(6587, 1)


Unnamed: 0,critics_consensus
0,Though it may seem like just another Harry Pot...
2,Nicole Holofcener's newest might seem slight i...
4,Blake Edwards' bawdy comedy may not score a pe...
6,"Packed with twists and turns, this essential e..."
8,"Its heart is in the right place, but what star..."


In [12]:
# CVEC Fit Transform Critics Consensus
df_critic = df['critics_consensus']
cvec.fit(df_critic)
df_critic = cvec.transform(df_critic).toarray()
critic_df = pd.DataFrame(df_critic, columns = cvec.get_feature_names())



In [13]:
# Critic Concensus Top 20 Words
critic_df.sum().sort_values(ascending = False).head(20)

story           600
cast            598
performances    561
performance     546
director        500
movie           471
comedy          430
film            404
thriller        350
drama           344
action          337
plot            313
characters      308
life            281
funny           271
offers          264
make            258
script          251
strong          242
humor           234
dtype: int64

Top 5 words: Story, Cast, Performances, Performance, Director.

In [14]:
# CVEC Fit Transform Review Content - Each is unique, so not dropping any duplicates
df_review = df['review_content']
cvec.fit(df_review)
df_review = cvec.transform(df_review).toarray()
review_df = pd.DataFrame(df_review, columns = cvec.get_feature_names())

In [15]:
# Review Content Top 20 Words
review_df.sum().sort_values(ascending = False).head(20)

film          1162
movie          821
like           445
story          359
just           308
comedy         303
good           298
best           259
time           242
way            227
make           197
director       194
characters     186
funny          182
fun            181
little         181
action         181
great          177
films          174
love           168
dtype: int64

Top 5 Words: Film, Review, Content, Movie, Like. Overlap with top 20 critic consensus: Story, Director, Movie, Comedy, Film, Drama, Life and Characters

In [16]:
# CVEC Fit Transform Genres
df_genre = df['genre']
cvec.fit(df_genre)
df_genre = cvec.transform(df_genre).toarray()
genre_df = pd.DataFrame(df_genre, columns = cvec.get_feature_names())

In [17]:
# Top 20 Genres
genre_df.sum().sort_values(ascending = False).head(20)

drama            3875
comedy           2476
action           1578
adventure        1578
suspense         1569
mystery          1569
science           869
fiction           869
fantasy           869
romance           830
horror            728
kids              505
family            505
house             468
international     468
art               468
classics          385
animation         266
arts              234
performing        234
dtype: int64

Top 5 Genres: Drama, Action, Comedy, Adventure, Suspense

In [19]:
# Removing duplicates
df_imdb  = df[['imdb_plot']].drop_duplicates(keep = 'first')
print(df_critic.shape)
df_imdb.head()

(6587, 14178)


Unnamed: 0,imdb_plot
0,A teenager discovers he's the descendant of a ...
2,"In New York City, a husband and wife butt head..."
4,A Hollywood composer goes through a mid-life c...
6,A man in London tries to help a counter-espion...
8,An evil scientist plots to take over the world...


In [20]:
# CVEC Fit Transform Critics Consensus
df_imdb = df_imdb['imdb_plot']
cvec.fit(df_imdb)
df_imdb = cvec.transform(df_imdb).toarray()
df_imdb = pd.DataFrame(df_imdb, columns = cvec.get_feature_names())



In [21]:
# Top 20 Genres
df_imdb.sum().sort_values(ascending = False).head(20)

young      775
life       698
man        644
new        615
woman      495
family     466
world      384
love       377
story      311
school     298
finds      290
group      285
friends    283
old        279
father     273
wife       268
city       265
town       257
girl       257
war        249
dtype: int64

Pre-Process for Modeling

In [22]:
# Setting parameters to avoid crash 
cvec = CountVectorizer(stop_words = 'english', lowercase = True, min_df = 2, max_df = .9, max_features = 3000)

In [23]:
# Creating text column
def create_text(x):
    return (''.join(x['title']) + ' ' + ''.join(x['plot']) + ' ' + ''.join(x['critics_consensus']) +
            ' ' + ''.join(x['genre']) + ' ' + ''.join(x['director']) + ' ' + ''.join(x['cast'] + 
            ' ' + ''.join(x['imdb_plot'])))

df['text'] = df.apply(create_text, axis = 1)

df[['text']].head(2)

Unnamed: 0,text
0,Percy Jackson & the Olympians: The Lightning T...
2,Please Give Kate (Catherine Keener) and her hu...


In [24]:
# Fit & Transform text column
df_text = df['text']
cvec.fit(df_text)
df_text = cvec.transform(df_text).toarray()
text_df = pd.DataFrame(df_text, columns = cvec.get_feature_names(), index = df.index)



In [25]:
# Getting numeric columns 
df_numeric = df._get_numeric_data()
df_numeric.shape

(6587, 27)

In [26]:
# Concatenating numeric df and text df 
df = pd.concat([df_numeric, text_df], axis = 1)
df.shape

(6587, 3027)

In [27]:
# Saving to new DF to model
df.to_csv('./data/df_model.csv')