# Movie Analysis

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from os import listdir
from os.path import isfile, join
import pandas as pd
import json

import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score


%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## Data Collection
Data about movies is in no short supply on the web. After deciding on which data would be most relevant, we found two data sets that would complement each other 

Kaggle - https://www.kaggle.com/tmdb/tmdb-movie-metadata

Load the data from the raw files into Pandas dataframes. Some preprocessing had to be done on these files since they had some non-utf-8 characters.

In [2]:
file_dir = "Data/RawData/"

data_files = [f for f in listdir(file_dir) if isfile(join(file_dir, f))]
data_list = []

for data_file in data_files:
    if "Store" not in data_file:
        data_list.append(pd.read_csv(file_dir + data_file, engine='python'))

merged = data_list[0].join(data_list[1], lsuffix='title', rsuffix='title')

Countries that a movie was produced is some interesting data. Flatten it down and one-hot encode it.

In [3]:
rows = []
for i, row in enumerate(merged['production_countries']):
    if len(json.loads(row)) > 0:
        rows.append("|".join([country['name'] for country in json.loads(row)]))
    else:
        rows.append("")
merged.drop(['production_countries'], axis=1)
se = pd.Series(rows)
merged['production_countries'] = se.values
cleaned_pc = merged.production_countries.str.split('|', expand=True).stack()
production_countries = pd.get_dummies(cleaned_pc, prefix='cp').groupby(level=0).sum()

One hot encode genres, color, content rating, and language

In [4]:
cleaned_g = merged.genres.str.split('|', expand=True).stack()
genres = pd.get_dummies(cleaned_g, prefix='g').groupby(level=0).sum()

color = pd.get_dummies(merged['color'])
content_rating = pd.get_dummies(merged['content_rating'])
language = pd.get_dummies(merged['language'])

Merge in all of the one-hot encoded data into the merged dataframe

In [5]:
# Add the one-hot encoded dataframes to the final set
merged = pd.concat([merged, genres], axis=1, sort=True)
merged = pd.concat([merged, color], axis=1, sort=True)
merged = pd.concat([merged, content_rating], axis=1, sort=True)
merged = pd.concat([merged, language], axis=1, sort=True)
merged = pd.concat([merged, production_countries], axis=1, sort=True)

In [6]:
merged = merged.drop(['genres', 'movie_title', 'gross', 'production_countries', 'num_user_for_reviews',
                      'director_name', 'actor_2_name', 'actor_1_facebook_likes',
                      'actor_1_name', 'actor_3_name', 'color', 'content_rating', 'language'], axis=1)

Drop other columns

In [7]:
# Drop columns that we will not be using yet, but might later (keywords)
merged = merged.drop(['plot_keywords', 'keywords'], axis=1)

# I think production companies might be a bit much. lets remove it for now.
merged = merged.drop(['production_companies'], axis=1)

### Fix the release date (https://stackoverflow.com/questions/46428870/how-to-handle-date-variable-in-machine-learning-data-pre-processing)
### Dropping for now, but this is definitely something to do feature engineering on!

### Thought: I wonder if we can get data on opening week or month revenue


In [8]:
merged = merged.drop(['release_date'], axis=1)

In [9]:
# set to average any 0 or nan for most of the continous columns
# 0 or nan: revenue, duration
zero_or_nan_average = ['revenue', 'duration']
for col in zero_or_nan_average:
    mean = merged[col].mean()
    merged[col].fillna((mean), inplace=True)
    merged = merged.replace({col: {0: mean}})
    
just_nan_average = ['vote_average', 'vote_count', 'num_critic_for_reviews', 'director_facebook_likes', 'actor_3_facebook_likes', 'num_voted_users',
                    'cast_total_facebook_likes', 'facenumber_in_poster', 'actor_2_facebook_likes', 'movie_facebook_likes']
for col in just_nan_average:
    merged[col].fillna((merged[col].mean()), inplace=True)
    
# Remove anything that wasn't fixed
merged = merged.dropna()

Wrap up the data preprocessing!

In [52]:


msk = np.random.rand(len(merged)) < 0.8
train = merged[msk]
test = merged[~msk]

export_csv = train.to_csv(r'train.csv', index = "title", header=True)
export_csv = test.to_csv(r'test.csv', index = "title", header=True)
export_csv = test.to_csv()

merged = merged.set_index('title')

merged.head()

KeyError: 'title'

In [27]:




# creating matrices for sklearn:
X_train = merged[:train.shape[0]]
X_test = merged[train.shape[0]:]
# cross_ten = np.array_split(merged, 10)
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]
y = train.revenue

X_test.head()

Unnamed: 0_level_0,revenue,vote_average,vote_count,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,...,cp_Sweden,cp_Switzerland,cp_Taiwan,cp_Thailand,cp_Tunisia,cp_Turkey,cp_Ukraine,cp_United Arab Emirates,cp_United Kingdom,cp_United States of America
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
They Live,13008928.0,7.1,550,44.0,90.0,152.0,826.0,15345,23714,0.0,...,0,0,0,0,0,0,0,0,0,1
The Great Escape,11744471.0,7.8,717,14.0,141.0,54.0,70.0,2017,1332,2.0,...,0,0,0,0,0,0,0,0,0,1
The Last Exorcism Part II,15179302.0,4.4,203,45.0,99.0,1000.0,409.0,2326,2269,1.0,...,0,0,0,0,0,0,0,0,0,1
Boyhood,44349000.0,7.5,1971,42.0,79.0,57.0,99.0,3519,1284,1.0,...,0,0,0,0,0,0,0,0,0,1
The Wash,10229331.0,5.3,26,29.0,82.0,99.0,44.0,5164,315,0.0,...,0,0,0,0,0,0,0,0,0,1


In [28]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

### Lasso Regression

In [46]:
model_lasso = LassoCV(alphas = alphas, tol=0.5, cv=5, max_iter=10000000, verbose=True).fit(X_train, y)
print(model_lasso)
rmse_cv(model_lasso).mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
.....[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
.....[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
.....[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
....

LassoCV(alphas=[10], copy_X=True, cv=5, eps=0.001, fit_intercept=True,
    max_iter=10000000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.5, verbose=True)


.[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
.....[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
.....[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


132233832.15942845

In [21]:
coef = pd.Series(model_lasso.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

Lasso picked 163 variables and eliminated the other 30 variables


Unnamed: 0_level_0,revenue,vote_average,vote_count,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,...,cp_Sweden,cp_Switzerland,cp_Taiwan,cp_Thailand,cp_Tunisia,cp_Turkey,cp_Ukraine,cp_United Arab Emirates,cp_United Kingdom,cp_United States of America
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Spectre,880674600.0,6.3,4466,602.0,148.0,0.0,161.0,275868,11700,1.0,...,0,0,0,0,0,0,0,0,1,1
Avengers: Age of Ultron,1405404000.0,7.3,6767,635.0,141.0,0.0,19000.0,462669,92000,4.0,...,0,0,0,0,0,0,0,0,0,1
The Lone Ranger,89289910.0,5.9,2311,450.0,150.0,563.0,1000.0,181792,45757,1.0,...,0,0,0,0,0,0,0,0,0,1
Pirates of the Caribbean: On Stranger Tides,1045714000.0,6.4,4948,448.0,136.0,252.0,1000.0,370704,54083,4.0,...,0,0,0,0,0,0,0,0,0,1
Titanic,1845034000.0,7.5,7562,315.0,194.0,0.0,794.0,793059,45223,0.0,...,0,0,0,0,0,0,0,0,0,1


In [49]:
lasso_preds = np.expm1(model_lasso.predict(X_test))
solution = pd.DataFrame({"movie":test.index.tolist(), "Revenue":lasso_preds})
solution.to_csv("lasso_sol.csv", index = False)

  """Entry point for launching an IPython kernel.
