# IMDB 5000 Movie Dataset Data Cleaning

================================================================================================================================

**AUTHOR**: Mengshan Jin

**CREATION DATE**: 07/31/2017

================================================================================================================================


**PROGRAM DESCRIPTION**: Data cleaning on IMDB 5000 Movie Dataset

**INPUT DATASETS**: 01_Data/movie_metadata.csv

**OUTPUT DATASETS**: 01_Data/movie_metadata_modern.csv


================================================================================================================================
**PROGRAM CHANGE HISTORY**

Date|Author|Change|
----|------|------|

# Section 0: Import packages

In [69]:
# Data structure
import numpy as np
import pandas as pd
from itertools import chain
import copy
from collections import Counter

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set(style="ticks", color_codes=True)

# Custom support functions
import sys    
sys.path.insert(0,"../03_Feature_Engineering/sklearn-support/")
import support as sup

# sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Imputer, FunctionTransformer
from sklearn.pipeline import Pipeline

# Section 1: Read data

In [2]:
# Load the raw IMDB 5000 movie metadata from the input CSV (relative path).
imdb = pd.read_csv("../01_Data/movie_metadata.csv")

In [3]:
# Strip leading/trailing whitespace from the `color` values.
imdb = imdb.assign(color=imdb['color'].str.strip())

# Section 2: Impute missing values

## Part 1: Inspection

In [5]:
# (rows, columns) of the raw dataset.
imdb.shape

(5043, 28)

In [4]:
# Per-column non-null counts and dtypes, used to spot columns with missing values.
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-

In [6]:
# Keep only films released after 1980; rows with a missing title_year fail the
# comparison and are dropped as well. `.copy()` makes the subset an independent
# frame so later column assignments do not trigger SettingWithCopyWarning.
imdb_modern = imdb.loc[imdb['title_year'] > 1980].copy()

In [7]:
# (rows, columns) left after the title_year filter.
imdb_modern.shape

(4650, 28)

In [8]:
# Per-column non-null counts and dtypes for the filtered subset.
imdb_modern.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4650 entries, 0 to 5042
Data columns (total 28 columns):
color                        4635 non-null object
director_name                4650 non-null object
num_critic_for_reviews       4609 non-null float64
duration                     4638 non-null float64
director_facebook_likes      4650 non-null float64
actor_3_facebook_likes       4634 non-null float64
actor_2_name                 4640 non-null object
actor_1_facebook_likes       4643 non-null float64
gross                        4040 non-null float64
genres                       4650 non-null object
actor_1_name                 4643 non-null object
movie_title                  4650 non-null object
num_voted_users              4650 non-null int64
cast_total_facebook_likes    4650 non-null int64
actor_3_name                 4634 non-null object
facenumber_in_poster         4637 non-null float64
plot_keywords                4510 non-null object
movie_imdb_link              4650 non-

In [16]:
# Persist the post-1980 subset; index=False leaves the row index out of the file.
imdb_modern.to_csv("../01_Data/movie_metadata_modern.csv", index=False)

In [107]:
# Reload the saved subset. The index was not written to the file, so the rows
# get a fresh default index here.
imdb_modern = pd.read_csv("../01_Data/movie_metadata_modern.csv")

### Question: Does missingness depend on title_year?

In [28]:
# Release years of the films whose aspect_ratio is missing, most frequent first.
# It looks pretty random, can impute with median
aspect_ratio_missing = imdb_modern['aspect_ratio'].isnull()
imdb_modern.loc[aspect_ratio_missing, 'title_year'].value_counts()

2014.0    55
2015.0    52
2013.0    30
2016.0    21
2012.0    20
2011.0    17
2009.0    15
2010.0    15
2004.0    10
2005.0     9
2006.0     8
2007.0     8
1997.0     6
2000.0     5
1998.0     5
1999.0     5
2008.0     4
2003.0     4
2002.0     3
1982.0     2
1983.0     2
1990.0     2
2001.0     1
1995.0     1
1987.0     1
1996.0     1
1994.0     1
1985.0     1
Name: title_year, dtype: int64

In [29]:
# Release years of the films whose color is missing, most frequent first.
# same here
color_missing = imdb_modern['color'].isnull()
imdb_modern.loc[color_missing, 'title_year'].value_counts()

2014.0    4
2012.0    2
2011.0    2
2015.0    2
1990.0    1
2013.0    1
2016.0    1
2009.0    1
2010.0    1
Name: title_year, dtype: int64

## Part 2: Preprocessing
1. aspect_ratio: replace 16 with 1.85
2. content_rating: replace "Not Rated" and NaN with "Unrated"
3. remove movie_imdb_link

In [108]:
# Recode the aspect_ratio value 16 to 1.85; every other value (including NaN)
# is left untouched.
# NOTE(review): presumably 16 is a mis-entered 16:9 ratio — confirm.
imdb_modern['aspect_ratio'] = imdb_modern['aspect_ratio'].replace(16, 1.85)

In [109]:
# Summary statistics of aspect_ratio after the recode above.
imdb_modern['aspect_ratio'].describe()

count    4346.000000
mean        2.102609
std         0.269304
min         1.180000
25%         1.850000
50%         2.350000
75%         2.350000
max         2.760000
Name: aspect_ratio, dtype: float64

In [110]:
# Fold both "Not Rated" and missing ratings into the single label "Unrated".
rating = imdb_modern['content_rating']
imdb_modern['content_rating'] = rating.mask(rating.isnull() | rating.eq("Not Rated"), "Unrated")

In [111]:
# Frequency of each content_rating label after the relabelling above.
imdb_modern['content_rating'].value_counts()

R          2060
PG-13      1457
PG          631
Unrated     389
G            89
X             9
NC-17         6
TV-14         3
TV-G          3
TV-PG         3
Name: content_rating, dtype: int64

In [91]:
# Remove the movie_imdb_link column. The column is named via `columns=`
# because passing the axis positionally (`drop(label, 1)`) is deprecated and
# rejected by pandas 2.x.
imdb_modern = imdb_modern.drop(columns='movie_imdb_link')

## Part 3: Handle genres and plot_keywords

In [112]:
# One 0/1 indicator column per genre, splitting each `genres` string on '|'.
# The earlier per-row `'|'.join(pd.Series(x))` returned the string unchanged
# while building a Series per row, so get_dummies is called directly.
dtmp = imdb_modern['genres'].str.get_dummies(sep='|')

In [115]:
# Append the genre indicator columns, then remove the raw `genres` string
# column. `columns=` is used because the positional axis argument of `drop`
# is deprecated and rejected by pandas 2.x.
imdb_modern = pd.concat([imdb_modern, dtmp], axis=1)
imdb_modern = imdb_modern.drop(columns='genres')

### ==>Task to do<==
Prefix each keyword dummy column with `keyword_`, e.g. the column for "love" should be named `keyword_love`.

In [99]:
# Split each '|'-separated plot_keywords string into a list of keywords;
# rows with no keywords stay missing.
imdb_modern['keywords_list'] = imdb_modern['plot_keywords'].str.split('|')

In [102]:
# Count how often each plot keyword occurs across all films. Rows without
# keywords are removed with dropna() rather than an `is not np.nan` identity
# check, which only works if the missing marker is that exact object.
keyword_counts = Counter(
    keyword
    for keywords in imdb_modern['plot_keywords'].dropna().str.split('|')
    for keyword in keywords
)
# Keywords form the index, ordered from most to least frequent; naming the
# columns up front also keeps this working when there are no keywords at all.
plot_keywords = pd.DataFrame(keyword_counts.most_common(), columns=['keyword', 'Count']).set_index('keyword')
# Clear the index name by assignment: `del plot_keywords.index.name` raises
# AttributeError on pandas versions where Index.name is a property.
plot_keywords.index.name = None

In [106]:
# Show only the keywords that occur more than 30 times.
plot_keywords[plot_keywords['Count'].gt(30)]

Unnamed: 0,Count
love,182
friend,153
murder,146
death,121
police,114
high school,84
new york city,84
alien,74
school,71
boy,70


## Part 4: Use Transformers to impute missing values

In [48]:
# Column groups consumed by the custom transformers from the `support` module.
# NOTE(review): 'genres' is listed here although another cell drops that
# column from imdb_modern — confirm the intended run order.
unknown_fill_cols = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'country', 'director_name',
                     'genres', 'language', 'movie_title', 'plot_keywords']
small_category_cols = ['country', 'language', 'actor_1_name', 'actor_2_name',
                       'actor_3_name', 'director_name']
category_cols = ['color', 'content_rating'] + small_category_cols

# Cleaning pipeline; the step names describe each stage. The transformers'
# exact behaviour lives in `support` and is not visible from this notebook.
# NOTE(review): `Imputer` was removed from scikit-learn in 0.22 in favour of
# `sklearn.impute.SimpleImputer` — confirm the pinned scikit-learn version.
clean_pipe = Pipeline([
    ('impute_with_unknown', sup.UnknownImputer(unknown_fill_cols)),
    ('combine_small_categories', sup.SmallCategoryCombiner(small_category_cols)),
    ('convert_to_category', sup.DtypeConverter(category_cols)),
    ('create_dummies', sup.CategoricalTransformer()),
    ('impute_missings', Imputer(strategy='median')),
])

In [58]:
# Work on a deep copy so imdb_modern itself is not rebound, then apply every
# pipeline step except the last one (the median imputer) one at a time,
# feeding each step's output into the next.
imdb_modern_cleaned = copy.deepcopy(imdb_modern)
for _step_name, transformer in clean_pipe.steps[:-1]:
    imdb_modern_cleaned = transformer.fit_transform(imdb_modern_cleaned)
# Known gap: genres, movie_title and plot_keywords were not handled appropriately

In [59]:
# (rows, columns) after all pipeline steps except the final imputer.
imdb_modern_cleaned.shape

(4650, 188)

In [60]:
# Preview the first rows of the partially cleaned frame.
imdb_modern_cleaned.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,genres,movie_title,num_voted_users,cast_total_facebook_likes,...,content_rating_G,content_rating_NC-17,content_rating_PG,content_rating_PG-13,content_rating_R,content_rating_TV-14,content_rating_TV-G,content_rating_TV-PG,content_rating_Unrated,content_rating_X
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,0.0,0.0,886204,4834,...,0,0,0,1,0,0,0,0,0,0
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,0.0,0.0,471220,48350,...,0,0,0,1,0,0,0,0,0,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,0.0,0.0,275868,11700,...,0,0,0,1,0,0,0,0,0,0
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,0.0,0.0,1144337,106759,...,0,0,0,1,0,0,0,0,0,0
4,462.0,132.0,475.0,530.0,640.0,73058679.0,0.0,0.0,212204,1873,...,0,0,0,1,0,0,0,0,0,0


In [63]:
# Run the pipeline's last step (the median imputer) on the partially cleaned
# frame. The step is addressed as "last" rather than by the hard-coded
# position 4, matching the loop that applies every step but the last.
final_imputer = clean_pipe.steps[-1][1]
# Rebuild a DataFrame from the imputer output, restoring the column labels
# and the original row index so rows stay aligned with imdb_modern_cleaned.
imdb_modern_cleaned_final = pd.DataFrame(
    final_imputer.fit_transform(imdb_modern_cleaned),
    columns=imdb_modern_cleaned.columns,
    index=imdb_modern_cleaned.index,
)