In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [351]:
# Load the movie data from the pickle dump.
# NOTE(review): unpickling can execute arbitrary code -- only load dumps from a trusted source.
import pandas as pd

MOVIE_DATA_PATH = '../dump/movie_data'
movie_data_df = pd.read_pickle(MOVIE_DATA_PATH)

In [352]:
# Drop the columns that are not carried forward and rename
# 'certificate' to 'MPAA', producing a new frame in a single chain.
unused_columns = ['rating', 'vote', 'metascore', 'keywords',
                  'link_d', 'link_s', 'link_w']
movie_df = (
    movie_data_df
    .drop(columns=unused_columns)
    .rename(columns={'certificate': 'MPAA'})
)

In [353]:
# Display the frame after the column drop and the certificate -> MPAA rename.
movie_df

Unnamed: 0,movie_title,MPAA,genre,release_date,budget,opening_weekend_usa,gross_usa,gross_world,runtime,director,writer,star,distributor,language,country
0,After We Collided,R,"[Drama, Romance]",2020-10-23,,422899.0,2386483.0,4.799041e+07,105.0,Roger Kumble,"[Anna Todd, Mario Celaya]","[Josephine Langford, Hero Fiennes Tiffin, Dyla...",CalMaple,"[English, Chinese]",[USA]
1,The Godfather,R,"[Crime, Drama]",1972-03-24,6000000.0,302393.0,134966411.0,2.461210e+08,175.0,Francis Ford Coppola,"[Mario Puzo, Francis Ford Coppola]","[Marlon Brando, Al Pacino, James Caan]",ParamountPictures,"[English, Italian, Latin]",[USA]
2,Harry Potter and the Sorcerer's Stone,PG,"[Adventure, Family, Fantasy]",2001-11-16,125000000.0,90294621.0,318087620.0,1.006918e+09,159.0,Chris Columbus,"[J.K. Rowling, Steve Kloves]","[Daniel Radcliffe, Rupert Grint, Richard Harris]",WarnerBros.,[English],"[UK, USA]"
3,Unknown,PG-13,"[Action, Thriller]",2011-02-18,30000000.0,21856389.0,63686397.0,1.357100e+08,113.0,Jaume Collet-Serra,"[Oliver Butcher, Stephen Cornwell]","[Liam Neeson, Diane Kruger, January Jones]",DarkCastleEntertainment,"[English, German, Turkish, Arabic]","[UK, Germany, France, USA]"
4,The Lord of the Rings: The Fellowship of the Ring,PG-13,"[Action, Adventure, Drama]",2001-12-19,93000000.0,47211490.0,315544750.0,8.881591e+08,171.0,Peter Jackson,"[J.R.R. Tolkien, Fran Walsh, 2 more credits]","[Elijah Wood, Ian McKellen, Orlando Bloom]",NewLineCinema,"[English, Sindarin]","[NewZealand, USA]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,From the Terrace,Passed,"[Drama, Romance]",1960-09-09,,,11336000.0,,149.0,Mark Robson,"[John O'Hara, Ernest Lehman]","[Paul Newman, Joanne Woodward, Myrna Loy]",Linebrook,[English],[USA]
1996,V.I. Warshawski,R,"[Action, Comedy, Crime]",1991-07-26,,3603338.0,11128309.0,1.112831e+07,89.0,Jeff Kanew,"[Sara Paretsky, Edward Taylor, 3 more credits]","[Kathleen Turner, Jay O. Sanders, Charles Durn...",HollywoodPictures,[English],[USA]
1997,Anne of Green Gables,Passed,"[Comedy, Drama, Family]",1934-11-23,226000.0,,,,78.0,George Nichols Jr.,"[Sam Mintz, Lucy Maud Montgomery]","[Anne Shirley, Tom Brown, O.P. Heggie]",RKORadioPictures,[English],[USA]
1998,Odds Against Tomorrow,Approved,"[Crime, Drama, Thriller]",1959-11-01,,,,,100.0,Robert Wise,"[William P. McGivern, Abraham Polonsky, 2 more...","[Harry Belafonte, Robert Ryan, Gloria Grahame]",HarBelProductions,[English],[USA]


In [354]:
# Remove every row that has a missing value in any column (mutates movie_df in place).
movie_df.dropna(inplace=True)

In [355]:
# (rows, columns) remaining after the incomplete rows were dropped.
movie_df.shape

(960, 15)

In [356]:
# Column names available for cleaning and modelling.
movie_df.columns

Index(['movie_title', 'MPAA', 'genre', 'release_date', 'budget',
       'opening_weekend_usa', 'gross_usa', 'gross_world', 'runtime',
       'director', 'writer', 'star', 'distributor', 'language', 'country'],
      dtype='object')

## Clean each column

### Start with single-label categorical variables
### 1. MPAA rating
Find the distribution and keep only the most frequent categories; the remaining categories are grouped into `Other`.

In [357]:
# Distinct values present in the MPAA column.
movie_df.MPAA.unique()

array(['R', 'PG', 'PG-13', 'G', 'Passed', 'NC-17', 'Approved', 'X',
       'M/PG', 'Not'], dtype=object)

In [358]:
# Number of movies per MPAA value, most frequent first.
movie_df.MPAA.value_counts()

R           434
PG-13       301
PG          179
G            36
Passed        4
Not           2
X             1
M/PG          1
Approved      1
NC-17         1
Name: MPAA, dtype: int64

In [359]:
# Keep only the top 3 ratings (R, PG-13, PG); every other rating is
# collapsed into a single 'Other' category.
MPAA_discard = ['G','Passed','Not','X','M/PG','Approved','NC-17']
# Assign the result back to the column instead of calling
# replace(inplace=True) on the attribute accessor: the chained in-place form
# operates on an intermediate Series and does not update movie_df when
# pandas copy-on-write is enabled.
movie_df['MPAA'] = movie_df['MPAA'].replace(MPAA_discard, 'Other')

In [403]:
# Re-check the MPAA distribution after the rare ratings were replaced by 'Other'.
movie_df.MPAA.value_counts()

R        434
PG-13    301
PG       179
Other     46
Name: MPAA, dtype: int64

### 2. distributor

In [396]:
# Five most frequent distributors and their movie counts.
movie_df.distributor.value_counts().head(5)

WarnerBros.            72
ParamountPictures      59
ColumbiaPictures       54
UniversalPictures      49
TwentiethCenturyFox    41
Name: distributor, dtype: int64

In [408]:
# Keep only the top 5 distributors; every other distributor is collapsed
# into a single 'Other' category.
distributor_keep = ['WarnerBros.','ParamountPictures','ColumbiaPictures','UniversalPictures',
                    'TwentiethCenturyFox']
# Everything present in the column that is not in the keep list gets discarded.
# (The keep list must be distributor_keep; no MPAA_keep is ever defined.)
distributor_discard = list(set(movie_df.distributor.unique()).difference(distributor_keep))
# Assign back rather than using a chained replace(inplace=True), which does
# not update movie_df when pandas copy-on-write is enabled.
movie_df['distributor'] = movie_df['distributor'].replace(distributor_discard, 'Other')

In [409]:
# Distributor distribution after the less frequent distributors were replaced by 'Other'.
movie_df.distributor.value_counts()

Other                  685
WarnerBros.             72
ParamountPictures       59
ColumbiaPictures        54
UniversalPictures       49
TwentiethCenturyFox     41
Name: distributor, dtype: int64

### Multilabel categorical variables
### 3. genre
Find the top frequent genres and convert the column from list to set in preparation for `MultiLabelBinarizer()`.

In [361]:
# Count how often each genre occurs across all movies, most frequent first.
# The top 10 are read off this listing by hand in the next cell.
from collections import Counter

# Flatten the per-movie genre lists into one list of genre labels.
genre_all = [genre for row in movie_df.genre for genre in row]
# Counter tallies in a single pass (instead of one list.count() scan per
# distinct genre); most_common() returns the pairs sorted by descending count.
genre_count = [{'genre': genre, 'count': count}
               for genre, count in Counter(genre_all).most_common()]
genre_count

[{'genre': 'Drama', 'count': 630},
 {'genre': 'Comedy', 'count': 234},
 {'genre': 'Adventure', 'count': 232},
 {'genre': 'Action', 'count': 207},
 {'genre': 'Romance', 'count': 197},
 {'genre': 'Crime', 'count': 185},
 {'genre': 'Thriller', 'count': 170},
 {'genre': 'Family', 'count': 123},
 {'genre': 'Mystery', 'count': 117},
 {'genre': 'Fantasy', 'count': 116},
 {'genre': 'Horror', 'count': 75},
 {'genre': 'Sci-Fi', 'count': 74},
 {'genre': 'Animation', 'count': 43},
 {'genre': 'Biography', 'count': 37},
 {'genre': 'History', 'count': 26},
 {'genre': 'War', 'count': 25},
 {'genre': 'Music', 'count': 13},
 {'genre': 'Sport', 'count': 11},
 {'genre': 'Western', 'count': 8},
 {'genre': 'Musical', 'count': 3},
 {'genre': 'Film-Noir', 'count': 2}]

In [362]:
# The ten most frequent genres are kept as-is;
# any other genre is later mapped to "Other".
genre_list = [
    'Drama', 'Adventure', 'Comedy', 'Action', 'Crime',
    'Romance', 'Thriller', 'Mystery', 'Family', 'Fantasy',
]

In [363]:
def _bucket_rare_genres(genres):
    """Return ``genres`` with every label not in ``genre_list`` replaced by 'Other'.

    Order and length of the input are preserved, so a movie with several
    rare genres yields several 'Other' entries.
    """
    return [label if label in genre_list else 'Other' for label in genres]


movie_df['genre'] = movie_df['genre'].apply(_bucket_rare_genres)

In [324]:
# movie_df.head()

In [372]:
# Turn each movie's genre list into a set, which collapses repeated
# 'Other' entries into one label per movie.
movie_df['genre'] = movie_df['genre'].apply(set)
movie_df[['genre']]

Unnamed: 0,genre
1,"{Crime, Drama}"
2,"{Fantasy, Family, Adventure}"
3,"{Thriller, Action}"
4,"{Drama, Adventure, Action}"
5,"{Fantasy, Adventure, Action}"
...,...
1968,"{Mystery, Crime, Drama}"
1973,"{Drama, Romance}"
1988,"{Drama, Romance}"
1989,{Comedy}


## Train test split

In [410]:
# Train test split
from sklearn.model_selection import train_test_split

# Candidate feature columns; the target and the other gross columns are excluded.
feature_columns = ['movie_title', 'MPAA', 'genre', 'release_date', 'budget', 'runtime',
                   'director', 'writer', 'star', 'distributor', 'language', 'country']
X = movie_df[feature_columns]

# Target: opening weekend gross in the USA.
y = movie_df['opening_weekend_usa']

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and every output derived from it, is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Preprocessing


### A. Categorical variable preprocessing

### a. OneHotEncoder: for column with single object

#### Transform train set

In [374]:
# Categorical variable: MPAA
# Select the single-label categorical column(s) of the training set to one-hot encode.
cat_variables = ['MPAA']
X_train_cat = X_train[cat_variables]

In [375]:
# Fit a one-hot encoder on the selected training columns and encode them.
# drop='first' omits the first category of each feature; sparse=False asks
# for a dense array instead of a sparse matrix.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` -- confirm against the installed version.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False, drop='first')
cats = ohe.fit_transform(X_train_cat)

In [376]:
# Create the categorical dataframe
# Column names come from the fitted encoder; the index is copied from the
# training subset so the encoded rows stay aligned with X_train.
# NOTE(review): get_feature_names was replaced by get_feature_names_out in
# newer scikit-learn releases -- confirm against the installed version.
columns = ohe.get_feature_names(cat_variables)
X_train_cat_df = pd.DataFrame(cats, columns = columns, index = X_train_cat.index)
X_train_cat_df.head()

Unnamed: 0,MPAA_PG,MPAA_PG-13,MPAA_R
1802,0.0,0.0,1.0
226,0.0,0.0,1.0
61,0.0,1.0,0.0
1140,0.0,1.0,0.0
1245,0.0,1.0,0.0


In [411]:
# Test MPAA, distributor
# Rebinds cat_variables and X_train_cat (previously MPAA only) to cover both
# single-label categorical columns.
cat_variables = ['MPAA','distributor']
X_train_cat = X_train[cat_variables]

In [412]:
# Refit the one-hot encoder on the current X_train_cat and encode it.
# drop='first' omits the first category of each feature; sparse=False asks
# for a dense array instead of a sparse matrix.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` -- confirm against the installed version.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False, drop='first')
cats = ohe.fit_transform(X_train_cat)

In [413]:
# Create the categorical dataframe
# Column names come from the fitted encoder; the index is copied from the
# training subset so the encoded rows stay aligned with X_train.
# NOTE(review): get_feature_names was replaced by get_feature_names_out in
# newer scikit-learn releases -- confirm against the installed version.
columns = ohe.get_feature_names(cat_variables)
X_train_cat_df = pd.DataFrame(cats, columns = columns, index = X_train_cat.index)
X_train_cat_df.head()

Unnamed: 0,MPAA_PG,MPAA_PG-13,MPAA_R,distributor_Other,distributor_ParamountPictures,distributor_TwentiethCenturyFox,distributor_UniversalPictures,distributor_WarnerBros.
1968,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
229,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
653,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1489,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


#### Repeat transformations for testing set

In [377]:
# Repeat transformation for testing set

### b. MultiLabelBinarizer: for column with list object

#### Transform train set

In [369]:
# Import here so this cell works on a fresh kernel run top to bottom; the
# only other import of MultiLabelBinarizer sits in a later cell.
from sklearn.preprocessing import MultiLabelBinarizer

# Binarizer used for columns whose cells hold a collection of labels.
mlb = MultiLabelBinarizer()

In [387]:
# Test on genre column first
multilabel_variables = ['genre']
X_train_cats = X_train[multilabel_variables]
# One indicator column per genre label. The index must come from
# X_train_cats: X_train_cats_df is the frame being created here, so it does
# not exist yet on a fresh kernel.
X_train_cats_df = pd.DataFrame(mlb.fit_transform(X_train_cats['genre']),
                               columns=mlb.classes_, index=X_train_cats.index)
# 'Other' is the catch-all bucket for the less frequent genres; drop its indicator column.
X_train_cats_df = X_train_cats_df.drop(columns='Other')

In [388]:
# Display the genre indicator columns built for the training set.
X_train_cats_df

Unnamed: 0,Action,Adventure,Comedy,Crime,Drama,Family,Fantasy,Mystery,Romance,Thriller
73,0,0,0,1,0,0,0,0,0,1
128,0,0,1,1,1,0,0,0,0,0
64,0,0,0,0,1,0,0,0,1,0
1044,0,0,0,0,1,0,0,0,0,1
598,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
50,1,0,0,0,1,0,0,1,0,0
1307,1,1,0,0,1,0,0,0,0,0
1471,0,0,0,0,1,0,0,0,0,0
1289,0,0,0,0,1,0,0,0,0,1


### OneHotEncoder with list item (e.g. genre)

In [158]:
# Toy example: a single column 'A' whose cells are lists of labels.
d = {'A': [[5, 7], [3, 4, 5], [2], [1, 2, 3, 4]]}
df = pd.DataFrame(d)
df

Unnamed: 0,A
0,"[5, 7]"
1,"[3, 4, 5]"
2,[2]
3,"[1, 2, 3, 4]"


In [91]:
# Binarize the list-valued column of the toy frame: one 0/1 column per
# distinct label, rows aligned with df.
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(df['A'])
df_ohe = pd.DataFrame(label_matrix, columns=mlb.classes_, index=df.index)
df_ohe

Unnamed: 0,1,2,3,4,5,7
0,0,0,0,0,1,1
1,0,0,1,1,1,0
2,0,1,0,0,0,0
3,1,1,1,1,0,0


In [195]:
# Test on genre
# Rebinds X_train_cat (used earlier for the MPAA/distributor columns) to the
# genre column of the training set.
cat_variable = ['genre']
X_train_cat = X_train[cat_variable]

In [199]:
# Display the genre column of the training set.
X_train_cat

Unnamed: 0,genre
230,"[Drama, History]"
22,"[Crime, Drama, Mystery]"
1226,"[Crime, Drama, Thriller]"
1326,[Drama]
501,"[Drama, Fantasy, Mystery]"
...,...
1361,"[Action, Biography, Western]"
402,"[Animation, Drama, Family]"
436,"[Action, Adventure, Fantasy]"
901,"[Crime, Drama, Mystery]"


In [269]:
# Independent three-row sample of the training genre column for quick experiments.
X_train_cat_test = X_train_cat.iloc[:3].copy()
X_train_cat_test

Unnamed: 0,genre
230,"[Drama, History]"
22,"[Crime, Drama, Mystery]"
1226,"[Crime, Drama, Thriller]"


In [243]:
# Independent copy of X_train_cat to experiment on.
X_train_cat_set = X_train_cat.copy()

In [144]:
def list_set(my_list):
    """Return the distinct elements of ``my_list`` as a set."""
    return {*my_list}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat['genre_clean']= X_train_cat_set.genre.apply(lambda x:\


In [256]:
# Display the working copy of the training genre data.
X_train_cat_set

Unnamed: 0,genre,genre_clean
230,"[Drama, History]","[Drama, Other]"
22,"[Crime, Drama, Mystery]","[Crime, Drama, Mystery]"
1226,"[Crime, Drama, Thriller]","[Crime, Drama, Thriller]"
1326,[Drama],[Drama]
501,"[Drama, Fantasy, Mystery]","[Drama, Fantasy, Mystery]"
...,...,...
1361,"[Action, Biography, Western]","[Action, Other, Other]"
402,"[Animation, Drama, Family]","[Other, Drama, Family]"
436,"[Action, Adventure, Fantasy]","[Action, Adventure, Fantasy]"
901,"[Crime, Drama, Mystery]","[Crime, Drama, Mystery]"


Unnamed: 0,genre_clean
230,"{Other, Drama}"
22,"{Mystery, Crime, Drama}"
1226,"{Crime, Thriller, Drama}"
1326,{Drama}
501,"{Mystery, Drama, Fantasy}"
...,...
1361,"{Other, Action}"
402,"{Drama, Other, Family}"
436,"{Fantasy, Adventure, Action}"
901,"{Mystery, Crime, Drama}"


In [266]:
# Binarize the genre_clean column (one 0/1 column per label, rows aligned
# with X_train_cat_set), then discard the 'Other' indicator column.
genre_indicators = mlb.fit_transform(X_train_cat_set['genre_clean'])
X_train_cat_df = pd.DataFrame(genre_indicators, columns=mlb.classes_,
                              index=X_train_cat_set.index)
X_train_cat_df = X_train_cat_df.drop(columns='Other')

In [267]:
# Display the genre indicator frame.
X_train_cat_df

Unnamed: 0,Action,Adventure,Comedy,Crime,Drama,Family,Fantasy,Mystery,Romance,Thriller
230,0,0,0,0,1,0,0,0,0,0
22,0,0,0,1,1,0,0,1,0,0
1226,0,0,0,1,1,0,0,0,0,1
1326,0,0,0,0,1,0,0,0,0,0
501,0,0,0,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1361,1,0,0,0,0,0,0,0,0,0
402,0,0,0,0,1,1,0,0,0,0
436,1,1,0,0,0,0,1,0,0,0
901,0,0,0,1,1,0,0,1,0,0


In [262]:
# Number of labels the binarizer learned, followed by the labels themselves.
len(mlb.classes_),mlb.classes_

(11,
 array(['Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Family',
        'Fantasy', 'Mystery', 'Other', 'Romance', 'Thriller'], dtype=object))

In [263]:
# Column totals of the indicator frame (rows carrying each label), largest first.
X_train_cat_df.sum(axis=0).sort_values(ascending=False)

Drama        491
Other        224
Adventure    191
Comedy       188
Action       175
Crime        155
Romance      152
Thriller     137
Mystery       98
Family        98
Fantasy       92
dtype: int64