In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


In [2]:
df_out = pd.read_csv('cleaned_data_dummies.csv', index_col = 0)

In [3]:
def split_train_test(data, test_ratio):
    np.random.seed(0)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_df, test_df = split_train_test(df_out, 0.2)
train_df = train_df.drop(columns= 'Title')
test_df = test_df.drop(columns= 'Title')

# oreiller machine learning book

## Model Baseline performance: Dropping all rows with null values and no outlier treatment

In [138]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PowerTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer , KNNImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MultiLabelBinarizer


In [6]:
train_df_nona = train_df.dropna() 
train_df_nona = train_df_nona.reset_index().drop(columns='index')


In [7]:
x_na = train_df_nona.drop(columns=['Box office','Release date','Day','Language','Country'])
y_na = train_df_nona['Box office']

columns_trans = make_column_transformer(
    (OneHotEncoder(),['Month','Day of week']),
    remainder='passthrough')


In [7]:
lr = LinearRegression()
pipe = make_pipeline(columns_trans, lr)
cross_val_score(pipe, x_na, y_na, cv=3)

array([0.55691298, 0.53895606, 0.56678776])

In [8]:
rf = RandomForestRegressor()
pipe = make_pipeline(columns_trans, rf)
cross_val_score(pipe, x_na, y_na, cv=3)

array([0.59883948, 0.61202462, 0.56323045])

In [9]:
xg = XGBRegressor()
pipe = make_pipeline(columns_trans, xg)
cross_val_score(pipe, x_na, y_na, cv=3)

array([0.57456078, 0.54722915, 0.50012725])

In [11]:
lasso = Lasso()
pipe = make_pipeline(columns_trans, lasso)
cross_val_score(pipe, x_na, y_na, cv=3)

array([ 0.32148232, -0.169304  ,  0.07330619])

In [12]:
ridge = Ridge()
pipe = make_pipeline(columns_trans, ridge)
cross_val_score(pipe, x_na, y_na, cv=3)



array([0.5406905 , 0.57039602, 0.55856209])

## Model Baseline performance: Dropping all rows with null values and scaling and outlier treatment


In [22]:
power_transform = PowerTransformer().fit_transform(x_na[['Budget']])
power_transform = pd.DataFrame(power_transform, columns=['Budget'])
x_na = pd.concat([x_na.drop(columns=['Budget']), power_transform], axis =1)

minmax = MinMaxScaler().fit_transform(x_na[['Year','Running time','Budget']])
minmax = pd.DataFrame(minmax, columns=['Running time','Budget','Year'])
x_na = pd.concat([x_na.drop(columns=['Running time','Budget','Year']),minmax], axis =1)

columns_trans = make_column_transformer(
    (OneHotEncoder(),['Month','Day of week']),
    remainder='passthrough')

In [24]:
lr = LinearRegression()
pipe = make_pipeline(columns_trans, lr)
cross_val_score(pipe, x_na, y_na, cv=3)

array([0.46243757, 0.46845347, 0.48657235])

In [25]:
rf = RandomForestRegressor()
pipe = make_pipeline(columns_trans, rf)
cross_val_score(pipe, x_na, y_na, cv=3)

array([0.58866294, 0.5941524 , 0.56200962])

In [26]:
xg = XGBRegressor()
pipe = make_pipeline(columns_trans, xg)
cross_val_score(pipe, x_na, y_na, cv=3)

array([0.61006934, 0.52888637, 0.49706464])

In [27]:
ridge = Ridge()
pipe = make_pipeline(columns_trans, ridge)
cross_val_score(pipe, x_na, y_na, cv=3)

array([0.4880342 , 0.50035674, 0.54144682])

## Model Baseline with imputation, scaling and outlier treatment


In [29]:
train = train_df.dropna(subset=['Box office'])
train['Language'] = train['Language'].fillna('English')
train = train.reset_index().drop(columns='index')

x = train.drop(columns=['Box office','Release date','Day','Language','Country'])

y = train['Box office']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Language'] = train['Language'].fillna('English')


In [30]:
knn_imputation = KNNImputer(n_neighbors=9).fit_transform(x[['Budget','Running time','Year','Month','Day of week','Based on']])
knn_imputation = pd.DataFrame(knn_imputation, columns=['Budget','Running time','Year','Month','Day of week','Based on'])
x = pd.concat([x.drop(columns=['Budget','Running time','Year','Month','Day of week','Based on']),knn_imputation], axis =1)

power_transform = PowerTransformer().fit_transform(x[['Budget']])
power_transform = pd.DataFrame(power_transform, columns=['Budget'])
x = pd.concat([x.drop(columns=['Budget']), power_transform], axis =1)

minmax = MinMaxScaler().fit_transform(x[['Running time','Budget','Year']])
minmax = pd.DataFrame(minmax, columns=['Running time','Budget','Year'])
x = pd.concat([x.drop(columns=['Running time','Budget','Year']),minmax], axis =1)

In [31]:
columns_trans = make_column_transformer(
    (OneHotEncoder(),['Month','Day of week']),
    remainder='passthrough')

In [33]:
lr = LinearRegression()
pipe = make_pipeline(columns_trans, lr)
cross_val_score(pipe, x, y, cv=3)

array([0.46909868, 0.46298569, 0.51456939])

In [34]:
xg = XGBRegressor()
pipe = make_pipeline(columns_trans, xg)
cross_val_score(pipe, x, y, cv=3)

array([0.61809626, 0.55052878, 0.60698998])

In [35]:
## Best performing
rf = RandomForestRegressor()
pipe = make_pipeline(columns_trans, rf)
cross_val_score(pipe, x, y, cv=3)

array([0.57852987, 0.58632336, 0.63773637])

In [37]:
ridge = Ridge()
pipe = make_pipeline(columns_trans, ridge)
cross_val_score(pipe, x_na, y_na, cv=3)

array([0.4880342 , 0.50035674, 0.54144682])

## Model Baseline with imputation and no scaling and outlier treatment


In [38]:
x = train.drop(columns=['Box office','Release date','Day','Language','Country'])
y = train['Box office']

In [40]:
knn_imputation = KNNImputer(n_neighbors=9).fit_transform(x[['Budget','Running time','Year','Month','Day of week','Based on']])
knn_imputation = pd.DataFrame(knn_imputation, columns=['Budget','Running time','Year','Month','Day of week','Based on'])
x = pd.concat([x.drop(columns=['Budget','Running time','Year','Month','Day of week','Based on']),knn_imputation], axis =1)

columns_trans = make_column_transformer(
    (OneHotEncoder(),['Month','Day of week']),
    remainder='passthrough')

In [43]:
lr = LinearRegression()
pipe = make_pipeline(columns_trans, lr)
cross_val_score(pipe, x, y, cv=3)

array([0.5302604 , 0.52046759, 0.56292754])

In [44]:
xg = XGBRegressor()
pipe = make_pipeline(columns_trans, xg)
cross_val_score(pipe, x, y, cv=3)

array([0.61768371, 0.55155804, 0.60736764])

In [45]:
rf = RandomForestRegressor()
pipe = make_pipeline(columns_trans, rf)
cross_val_score(pipe, x, y, cv=3)

array([0.57695237, 0.57064011, 0.62712534])

In [46]:
ridge = Ridge()
pipe = make_pipeline(columns_trans, ridge)
cross_val_score(pipe, x, y, cv=3)

array([0.4880342 , 0.50035674, 0.54144682])

In [None]:
### XGB 
### knn imput  n = 9 array([0.61771076, 0.55050346, 0.6067077 ])
### knn impute n = 7 array([0.61537241, 0.55510262, 0.58953028])
### knn impute n = 5 array[0.596       0.5583        0.57386]
### simple mean     array([0.62578365, 0.49941513, 0.60900925])

# Final Model
---
- I did a Gridsearch with a handful of different hyper parameters and after a day and a half, my computer was still not able to run all the different folds and hyper paramter combinations. The size of the dataframe (4510, 32711) and the lack of processing power and RAM made the testing process take a long time 
- Having said that, I am selecting the random forest regrossor and xgb because they performed the best in the cross validation phase. I am also setting the max amount of trees to 6 to speed up processing time and to prevent over fitting

# Final test 

In [20]:
test = test_df.dropna(subset=['Box office'])
test['Language'] = test['Language'].fillna('English')
test = test.reset_index().drop(columns='index')

x_test = test.drop(columns=['Box office','Release date','Day','Language','Country'])
y_test = test['Box office']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Language'] = test['Language'].fillna('English')


In [7]:
knn_imputation = KNNImputer(n_neighbors=9).fit_transform(x_test[['Budget','Running time','Year','Month','Day of week','Based on']])
knn_imputation = pd.DataFrame(knn_imputation, columns=['Budget','Running time','Year','Month','Day of week','Based on'])
x_test = pd.concat([x_test.drop(columns=['Budget','Running time','Year','Month','Day of week','Based on']),knn_imputation], axis =1)

power_transform = PowerTransformer().fit_transform(x_test[['Budget']])
power_transform = pd.DataFrame(power_transform, columns=['Budget'])
x_test = pd.concat([x_test.drop(columns=['Budget']), power_transform], axis =1)

minmax = MinMaxScaler().fit_transform(x_test[['Running time','Budget','Year']])
minmax = pd.DataFrame(minmax, columns=['Running time','Budget','Year'])
x_test = pd.concat([x_test.drop(columns=['Running time','Budget','Year']),minmax], axis =1)

In [26]:
columns_trans = make_column_transformer(
    (OneHotEncoder(),['Month','Day of week']),
    remainder='passthrough')

x_test_scaled = columns_trans.fit_transform(x_test)

In [33]:
rf = RandomForestRegressor(n_jobs=-1, max_depth=6)
rf.fit(x_scaled, y)
score = rf.score(x_test_scaled, y_test)

In [34]:
score

0.5603035064081044

In [35]:
xgb = XGBRegressor(n_jobs=-1, max_depth=6)
xgb.fit(x_scaled, y)
score = xgb.score(x_test_scaled, y_test)

In [36]:
score

0.49846186942774295

# Saving our Model
--- 
I will be training the random forest regressor on all of the data and then saving the model to use on the streamlit web app

In [5]:
final = df_out.dropna(subset=['Box office'])
final['Language'] = final['Language'].fillna('English')
final = final.reset_index().drop(columns='index')

x = final.drop(columns=['Box office','Release date','Day','Language','Country','Title'])
x_title = final.drop(columns=['Release date','Day','Language','Country',])
y = final['Box office']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['Language'] = final['Language'].fillna('English')


In [6]:
knn_imputation = KNNImputer(n_neighbors=9).fit_transform(x[['Budget','Running time','Year','Month','Day of week','Based on']])
knn_imputation = pd.DataFrame(knn_imputation, columns=['Budget','Running time','Year','Month','Day of week','Based on'])
x = pd.concat([x.drop(columns=['Budget','Running time','Year','Month','Day of week','Based on']),knn_imputation], axis =1)

power_transform = PowerTransformer().fit_transform(x[['Budget']])
power_transform = pd.DataFrame(power_transform, columns=['Budget'])
x = pd.concat([x.drop(columns=['Budget']), power_transform], axis =1)

minmax = MinMaxScaler().fit_transform(x[['Running time','Budget','Year']])
minmax = pd.DataFrame(minmax, columns=['Running time','Budget','Year'])
x = pd.concat([x.drop(columns=['Running time','Budget','Year']),minmax], axis =1)


In [7]:
columns_trans = make_column_transformer(
    (OneHotEncoder(),['Month','Day of week']),
    remainder='passthrough')

In [8]:
# x_scaled_df = pd.get_dummies(x_title,columns=['Month','Day of week'])

In [9]:
x_scaled = columns_trans.fit_transform(x)

In [10]:
rf = RandomForestRegressor(n_jobs=-1, max_depth=6)
rf.fit(x_scaled,y)

RandomForestRegressor(max_depth=6, n_jobs=-1)

In [21]:
import pickle 
with open('final_film_model','wb') as f:
    pickle.dump(rf, f)

## Dropping all Null values Model

In [50]:
final_na = df_out.dropna()
final_na = final_na.reset_index().drop(columns='index')

x_na = final_na.drop(columns=['Box office','Release date','Day','Language','Country','Title'])
y_na = final_na['Box office']

In [51]:
x_na_dum = pd.get_dummies(data = x_na, columns = ['Month','Day of week'])

In [52]:
rf = RandomForestRegressor(n_jobs=-1, max_depth=6)
rf.fit(x_na_dum,y_na)

RandomForestRegressor(max_depth=6, n_jobs=-1)

In [53]:
with open('no_na_model','wb') as file:
    pickle.dump(rf,file)

### Saving csv file for web app use 

In [32]:
top_100 = final[['Title','Box office']].sort_values(by='Box office',ascending=False)[:100]['Title']

In [72]:
top_100.to_csv('top_films_index.csv',index=False)

In [57]:
pd.DataFrame(x_scaled[top_100.index,:], index=top_100.index).to_csv('top_films_scaled.csv')  