# Movie Analysis

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from os import listdir
from os.path import isfile, join
import pandas as pd
import json

import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score


%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## Data Collection
Data about movies is in no short supply on the web. After deciding on which data would be most relevant, we found two data sets that would complement each other 

Kaggle - https://www.kaggle.com/tmdb/tmdb-movie-metadata

Load the data from the raw files into Pandas dataframes. Some preprocessing had to be done on these files since they had some non-utf-8 characters.

In [14]:
file_dir = "Data/RawData/"

data_files = [f for f in listdir(file_dir) if isfile(join(file_dir, f))]
data_list = []

for data_file in data_files:
    if "Store" not in data_file:
        data_list.append(pd.read_csv(file_dir + data_file, engine='python'))

merged = data_list[0].join(data_list[1], lsuffix='title', rsuffix='title')

Countries that a movie was produced is some interesting data. Flatten it down and one-hot encode it.

In [15]:
rows = []
for i, row in enumerate(merged['production_countries']):
    if len(json.loads(row)) > 0:
        rows.append("|".join([country['name'] for country in json.loads(row)]))
    else:
        rows.append("")
merged.drop(['production_countries'], axis=1)
se = pd.Series(rows)
merged['production_countries'] = se.values
cleaned_pc = merged.production_countries.str.split('|', expand=True).stack()
production_countries = pd.get_dummies(cleaned_pc, prefix='cp').groupby(level=0).sum()

One hot encode genres, color, content rating, and language

In [16]:
cleaned_g = merged.genres.str.split('|', expand=True).stack()
genres = pd.get_dummies(cleaned_g, prefix='g').groupby(level=0).sum()

color = pd.get_dummies(merged['color'])
content_rating = pd.get_dummies(merged['content_rating'])
language = pd.get_dummies(merged['language'])

Merge in all of the one-hot encoded data into the merged dataframe

In [17]:
# Add the one-hot encoded dataframes to the final set
merged = pd.concat([merged, genres], axis=1, sort=True)
merged = pd.concat([merged, color], axis=1, sort=True)
merged = pd.concat([merged, content_rating], axis=1, sort=True)
merged = pd.concat([merged, language], axis=1, sort=True)
merged = pd.concat([merged, production_countries], axis=1, sort=True)

In [18]:
merged = merged.drop(['genres', 'movie_title', 'gross', 'production_countries', 'num_user_for_reviews',
                      'director_name', 'actor_2_name', 'actor_1_facebook_likes',
                      'actor_1_name', 'actor_3_name', 'color', 'content_rating', 'language'], axis=1)

Drop other columns

In [19]:
# Drop columns that we will not be using yet, but might later (keywords)
merged = merged.drop(['plot_keywords', 'keywords'], axis=1)

# I think production companies might be a bit much. lets remove it for now.
merged = merged.drop(['production_companies'], axis=1)

### Fix the release date (https://stackoverflow.com/questions/46428870/how-to-handle-date-variable-in-machine-learning-data-pre-processing)
### Dropping for now, but this is definitely something to do feature engineering on!


In [20]:
merged = merged.drop(['release_date'], axis=1)

In [21]:
# set to average any 0 or nan for most of the continous columns
# 0 or nan: revenue, duration
zero_or_nan_average = ['revenue', 'duration']
for col in zero_or_nan_average:
    mean = merged[col].mean()
    merged[col].fillna((mean), inplace=True)
    merged = merged.replace({col: {0: mean}})
    
just_nan_average = ['vote_average', 'vote_count', 'num_critic_for_reviews', 'director_facebook_likes', 'actor_3_facebook_likes', 'num_voted_users',
                    'cast_total_facebook_likes', 'facenumber_in_poster', 'actor_2_facebook_likes', 'movie_facebook_likes']
for col in just_nan_average:
    merged[col].fillna((merged[col].mean()), inplace=True)

Wrap up the data preprocessing!

In [22]:
merged = merged.set_index('title')
merged.head()

Unnamed: 0_level_0,revenue,vote_average,vote_count,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,...,cp_Sweden,cp_Switzerland,cp_Taiwan,cp_Thailand,cp_Tunisia,cp_Turkey,cp_Ukraine,cp_United Arab Emirates,cp_United Kingdom,cp_United States of America
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,2787965000.0,7.2,11800,723.0,178.0,0.0,855.0,886204,4834,0.0,...,0,0,0,0,0,0,0,0,1,1
Pirates of the Caribbean: At World's End,961000000.0,6.9,4500,302.0,169.0,563.0,1000.0,471220,48350,0.0,...,0,0,0,0,0,0,0,0,0,1
Spectre,880674600.0,6.3,4466,602.0,148.0,0.0,161.0,275868,11700,1.0,...,0,0,0,0,0,0,0,0,1,1
The Dark Knight Rises,1084939000.0,7.6,9106,813.0,164.0,22000.0,23000.0,1144337,106759,0.0,...,0,0,0,0,0,0,0,0,0,1
John Carter,284139100.0,6.1,2124,462.0,132.0,475.0,530.0,212204,1873,1.0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
msk = np.random.rand(len(merged)) < 0.8
train = merged[msk]
test = merged[~msk]

#creating matrices for sklearn:
X_train = merged[:train.shape[0]]
X_test = merged[train.shape[0]:]
# cross_ten = np.array_split(merged, 10)
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]
y = train.revenue

### Lasso Regression

In [26]:
model_lasso = LassoCV(alphas = alphas).fit(X_train, y)
rmse_cv(model_lasso).mean()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
coef = pd.Series(model_lasso.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

In [None]:
lasso_preds = np.expm1(model_lasso.predict(X_test))
solution = pd.DataFrame({"id":test.Id, "SalePrice":lasso_preds})
solution.to_csv("lasso_sol.csv", index = False)