In [6]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PowerTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer , KNNImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MultiLabelBinarizer


# Feature Selection
--- 
- Testing model cross val score with out any of the list object columns 
- the list object columns inlcude [Genres, directed by, starring, produced by, distributed by, music by, edited by, cinemetoography by, written by, screenplay by, productin companies]
- I am also  removing is weekend becasue the day of week column covers that.
- Imputing all the missing values and scalling the budget column
---
### Variane Analysis 
- Remove the list object columns with the least amount of variance or deviation

In [2]:
df_out = pd.read_csv('cleaned_data_dummies.csv', index_col = 0)

In [3]:
def split_train_test(data, test_ratio):
    np.random.seed(0)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_df, test_df = split_train_test(df_out, 0.2)
train_df = train_df.drop(columns= 'Title')
test_df = test_df.drop(columns= 'Title')

In [4]:
train = train_df.dropna(subset=['Box office'])
train = train.reset_index().drop(columns='index')

x = train[['Running time','Based on','rsp','Budget','Year','Month','Day of week']]
y = train['Box office']


In [7]:
knn_imputation = KNNImputer(n_neighbors=9).fit_transform(x)
knn_imputation = pd.DataFrame(knn_imputation, columns=x.columns)
x = pd.concat([x.drop(columns=x.columns),knn_imputation], axis =1)

power_transform = PowerTransformer().fit_transform(x[['Budget']])
power_transform = pd.DataFrame(power_transform, columns=['Budget'])
x = pd.concat([x.drop(columns=['Budget']), power_transform], axis =1)

minmax = MinMaxScaler().fit_transform(x[['Running time','Budget','Year']])
minmax = pd.DataFrame(minmax, columns=['Running time','Budget','Year'])
x = pd.concat([x.drop(columns=['Running time','Budget','Year']),minmax], axis =1)

In [8]:
rf = RandomForestRegressor(n_jobs=-1, random_state=1)
cv = cross_val_score(rf,x,y, cv=3)
print(cv)
cv.mean()

[0.56462271 0.45569228 0.556651  ]


0.5256553313312807

In [9]:
# rf = RandomForestClassifier(random_state = 1)
# param_grid =  {'n_estimators': [100,500,1000], 
#                                   'bootstrap': [True,False],
#                                   'max_depth': [3,5,10,20,50,75,100,None],
#                                   'max_features': ['auto','sqrt'],
#                                   'min_samples_leaf': [1,2,4,10],
#                                   'min_samples_split': [2,5,10]}
                                  
# rnd_grid = RandomizedSearchCV(rf, param_distributions = param_grid, n_iter = 100, cv = 5, verbose = True, n_jobs = -1)
# clf_rand = clf_rf_rnd.fit(X_train_scaled,y_train)

## Variance Analysis 

In [10]:
df = pd.read_csv('cleaned_data.csv')

In [15]:
roles = df.select_dtypes('object').columns.drop(['Language','Country','Title','Release date'])
roles 

Index(['Genres', 'Directed by', 'Starring', 'Produced by', 'Distributed by',
       'Music by', 'Edited by', 'Cinematography', 'Written by',
       'Screenplay by', 'Production companies'],
      dtype='object')

In [16]:
df.select_dtypes('object').columns.drop(['Language','Country','Title','Release date'])
new_df = df.copy()
for role in roles:
    new_df[role] = new_df[role].str.replace('[','').str.replace(']','').str.replace("'",'').str.split(',')

In [17]:
def list_counts(col,df):
    names = {}
    for i in df[col]:
        if type(i) == list:
            for j in i:
                if j != '':
                    if j.strip() not in names:
                        names[j.strip()] = 1
                    else:
                        names[j.strip()] +=1
                else: pass
    count_df = pd.DataFrame.from_dict(names, orient='index', columns = ['Count'])
#     count_df = count_df.sort_values(by='Count', ascending=False)
    return count_df

In [18]:
role_means= {}
for role in roles:
    items = list_counts(role,new_df).index
    means = []
    skip = False
    for i in items:
        try:
            means.append(df['Box office'][df[role].str.contains(f"'{i}'", na=False)].mean())
        except:
            skip = True
            continue
    role_means[role] = means

  return func(self, *args, **kwargs)


In [19]:
role_stds = pd.DataFrame.from_dict(role_means, orient='index').std(axis=1).sort_values(ascending=False)
role_stds

Screenplay by           1.748159e+08
Starring                1.474001e+08
Directed by             1.348909e+08
Edited by               1.348061e+08
Written by              1.247570e+08
Production companies    1.178353e+08
Produced by             1.171524e+08
Cinematography          1.020089e+08
Music by                8.762419e+07
Genres                  7.916602e+07
Distributed by          6.802693e+07
dtype: float64

In [21]:
drop_cols = role_stds.index[-3:].tolist()
drop_cols

['Music by', 'Genres', 'Distributed by']

In [22]:
cols = df_out.columns

In [56]:
music_by = cols[cols.str.contains(drop_cols[0])].tolist()
genres = cols[cols.str.contains(drop_cols[1])] .tolist()
distributed = cols[cols.str.contains(drop_cols[2])].tolist() 

In [58]:
df_cut = df_out[cols.drop(music_by+genres+distributed)]
train_df, test_df = split_train_test(df_cut, 0.2)
train_df = train_df.drop(columns= 'Title')
test_df = test_df.drop(columns= 'Title')

In [59]:
train = train_df.dropna(subset=['Box office'])
train = train.reset_index().drop(columns='index')

x = train.drop(columns=['Box office','Release date','Day','Language','Country','is_weekend'])
y = train['Box office']

In [60]:
knn_imputation = KNNImputer(n_neighbors=9).fit_transform(x[['Budget','Running time','Year','Month','Day of week','Based on']])
knn_imputation = pd.DataFrame(knn_imputation, columns=['Budget','Running time','Year','Month','Day of week','Based on'])
x = pd.concat([x.drop(columns=['Budget','Running time','Year','Month','Day of week','Based on']),knn_imputation], axis =1)

power_transform = PowerTransformer().fit_transform(x[['Budget']])
power_transform = pd.DataFrame(power_transform, columns=['Budget'])
x = pd.concat([x.drop(columns=['Budget']), power_transform], axis =1)

minmax = MinMaxScaler().fit_transform(x[['Running time','Budget','Year']])
minmax = pd.DataFrame(minmax, columns=['Running time','Budget','Year'])
x = pd.concat([x.drop(columns=['Running time','Budget','Year']),minmax], axis =1)

columns_trans = make_column_transformer(
    (OneHotEncoder(),['Month','Day of week']),
    remainder='passthrough')

In [62]:
rf = RandomForestRegressor(n_jobs=-1, max_depth=6)
pipe = make_pipeline(columns_trans, rf)
cv = cross_val_score(pipe, x, y, cv=3)
print(cv)
cv.mean()

[0.54418666 0.52685427 0.58399685]


0.551679261360236

The model peformoed than when I used all the list object columns so it will not be used