# DS 310 Project 2
### Authors: Sean Rendar & Jess Strait

In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [11]:
# Import useful packages
import pandas as pd
import numpy as np
import re
import datetime
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_log_error

In [3]:
# Read in training and testing data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [415]:
print(list(train.columns))
print(train.dtypes)

['id', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue']
id                         int64
belongs_to_collection     object
budget                     int64
genres                    object
homepage                  object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
Keywords                  object
cast                 

In [4]:
# Combine train and test for feature engineering
# 0 = train, 1 = test
train['tt'] = 0
test['tt'] = 1
data = train.append(test)
print(data.head())

   id                              belongs_to_collection    budget  \
0   1  [{'id': 313576, 'name': 'Hot Tub Time Machine ...  14000000   
1   2  [{'id': 107674, 'name': 'The Princess Diaries ...  40000000   
2   3                                                NaN   3300000   
3   4                                                NaN   1200000   
4   5                                                NaN         0   

                                              genres  \
0                     [{'id': 35, 'name': 'Comedy'}]   
1  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
2                      [{'id': 18, 'name': 'Drama'}]   
3  [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...   
4  [{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...   

                            homepage    imdb_id original_language  \
0                                NaN  tt2637294                en   
1                                NaN  tt0368933                en   
2  http://sonyclassics.com/whiplash

In [5]:
# Create some binary variables for variables of interest to be made numeric
# Was the original language English?
data['original_language'] = data.original_language.fillna("")
data['orig_eng'] = data.original_language.apply(lambda x: 1 if x == 'en' else 0)
data['orig_chin'] = data.original_language.apply(lambda x: 1 if x == 'zh' else 0)
data['orig_french'] = data.original_language.apply(lambda x: 1 if x == 'fr' else 0)
data['orig_it'] = data.original_language.apply(lambda x: 1 if x == 'it' else 0)
data['orig_hindi'] = data.original_language.apply(lambda x: 1 if x == 'hi' else 0)

print(sum(data.orig_eng))

6351


In [6]:
# Was the movie produced in the US?
data['production_countries'] = data.production_countries.fillna("")
data['production_countries'] = data.production_countries.apply(lambda x: re.findall("'name': \'(.+?)\'", x))
data['usa_prod'] = data.production_countries.apply(lambda x: 1 if 'United States of America' in x else 0)
print(sum(data.usa_prod))

5617


In [7]:
# What was the movie genre?
data['genres'] = data['genres'].fillna('')
data['genres'] = data['genres'].apply(lambda x:re.findall("'name': \'(.+?)\'", x))
genre_dummies = data['genres'].str.join('|').str.get_dummies()
genre_dummies['id'] = data['id']
data = data.merge(genre_dummies, on='id')
print(data.columns)
print(sum(data['Crime']))

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue',
       'tt', 'orig_eng', 'orig_chin', 'orig_french', 'orig_it', 'orig_hindi',
       'usa_prod', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
       'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie',
       'Thriller', 'War', 'Western'],
      dtype='object')
1084


In [8]:
# Is the movie part of a collection?
data['belongs_to_collection'] = data['belongs_to_collection'].fillna('')
data['film_collect'] = data.belongs_to_collection.apply(lambda x: 0 if x=='' else 1)
print(sum(data.film_collect))

1481


In [9]:
# Can we text mine for some popular collections?
data['franchise'] = data.belongs_to_collection.apply(lambda x: 1 if re.search("(?:Star Wars|Rocky|Harry Potter|Lord of the Rings|Star Trek|Hobbit|Twilight|Pirates of the Caribbean|Hunger Games|Jurassic Park)", x) else 0)
data['superhero'] = data.belongs_to_collection.apply(lambda x: 1 if re.search("(?:Avengers|X-Men|Batman|Spider-Man|Iron Man|Transformers)", x) else 0)
data['action_fran'] = data.belongs_to_collection.apply(lambda x: 1 if re.search("(?:James Bond|Rambo|Fast and Furious|Mission: Impossible|Bourne|Indiana Jones)", x) else 0)
print(sum(data.franchise))

62


In [10]:
# Was English part of the spoken language?
data['spoken_languages'] = data.spoken_languages.fillna("")
data['speak_eng'] = data.spoken_languages.apply(lambda x: 1 if 'en' in x else 0)
print(sum(data.speak_eng))

6465


In [12]:
# Can we text mine for some famous actors?
data['cast'] = data.cast.fillna("")
data['famous_actor'] = data.cast.apply(lambda x: 1 if re.search("(?:Dwayne Johnson|Tom Hanks|Brad Pitt|Morgan Freeman|Clint Eastwood|Matt Damon)", x) else 0)
# Create special feature for action stars, as we know action films tend to gross higher
data['famous_action_act'] = data.cast.apply(lambda x: 1 if re.search("(?:Sylvester Stallone|Harrison Ford|Robert Downey|Hugh Jackman|Bruce Willis|Tom Cruise)", x) else 0)
# Create special features for actors more frequently occurring in high-grossing movies
data['samuel_l'] = data.cast.apply(lambda x: 1 if "Samuel L. Jackson" in x else 0)
data['will_s'] = data.cast.apply(lambda x: 1 if "Will Smith" in x else 0)
data['leo'] = data.cast.apply(lambda x: 1 if "Leonardo DiCaprio" in x else 0)
data['depp'] = data.cast.apply(lambda x: 1 if "Johnny Depp" in x else 0)
data['freeman'] = data.cast.apply(lambda x: 1 if "Morgan Freeman" in x else 0)
data['deniro'] = data.cast.apply(lambda x: 1 if "Robert De Niro" in x else 0)
print(sum(data.famous_actor))

264


In [13]:
# Can we text mine for famous actresses?
data['famous_actress'] = data.cast.apply(lambda x: 1 if re.search("(?:Meryl Streep|Jennifer Lawrence|Anne Hathaway|Emma Watson|Sandra Bullock|Halle Berry|Scarlett Johansson|Julia Roberts|Jennifer Aniston|Nicole Kidman)", x) else 0)
print(sum(data.famous_actress))

285


In [14]:
# Can we text mine for some famous directors?
data['crew'] = data.crew.fillna("")
data['famous_director'] = data.crew.apply(lambda x: 1 if re.search("(?:Steven Spielberg|Martin Scorsese|Quentin Tarantino|Stanley Kubrick|Tim Burton|Christopher Nolan|James Cameron|David Fincher|Robert Zemeckis)", x) else 0)
print(sum(data.famous_director))

213


In [15]:
# Can we text mine for some popular production companies?
data['production_companies'] = data.production_companies.fillna("")
data['famous_prod'] = data.production_companies.apply(lambda x: 1 if re.search("(?:Disney|Warner Bros|Paramount|MGM|Twentieth Century Fox|Universal Pictures)", x) else 0)
print(sum(data.famous_prod))

2122


In [16]:
# Does the movie have a tagline?
data['tagline'] = data['tagline'].fillna("")
data['tag_present'] = data.belongs_to_collection.apply(lambda x: 0 if x=='' else 1)
print(sum(data.tag_present))

1481


In [17]:
# Does the movie have a production company?
data['production_companies'] = data['production_companies'].fillna("")
data['prod_present'] = data.production_companies.apply(lambda x: 0 if x=='' else 1)
print(sum(data.prod_present))

6984


In [18]:
# Does the movie have a homepage?
data['homepage'] = data['homepage'].fillna("")
data['home_present'] = data.homepage.apply(lambda x: 0 if x=='' else 1)
print(sum(data.home_present))

2366


In [110]:
data.budget.describe()

count    7.398000e+03
mean     2.264167e+07
std      3.695113e+07
min      0.000000e+00
25%      0.000000e+00
50%      8.000000e+06
75%      2.875000e+07
max      3.800000e+08
Name: budget, dtype: float64

In [19]:
# What was the movie budget?
data['budget'] = data['budget'].fillna(data.budget.mean())

# Was the movie budget greater than the average $65 million
data['big_budget'] = data.budget.apply(lambda x:1 if x>65000000 else 0)
print(sum(data.big_budget))

# Was the movie budget extra small?
data['small_budget'] = data.budget.apply(lambda x:1 if x<2000000 else 0)
print(sum(data.small_budget))

720
2643


In [20]:
# How long was the runtime?
data['runtime'] = data['runtime'].fillna(114)
data['runtime'] = pd.to_numeric(data['runtime'])
data['longer_than_two_fifteen'] = data.runtime.apply(lambda x: 1 if x>135 else 0)
print(sum(data.longer_than_two_fifteen))

664


In [21]:
# Mine the release date
data['release_date'] = data['release_date'].fillna('')
data['month'] = pd.DatetimeIndex(data['release_date']).month
data['year'] = pd.DatetimeIndex(data['release_date']).year
data['weekday'] = pd.DatetimeIndex(data['release_date']).dayofweek

# Create summer dummies
data['summer'] = data.month.apply(lambda x: 1 if 5<x<9 else 0)

# Create dummies for some decades
data['before_1980'] = data.year.apply(lambda x: 1 if x<1980 else 0)
data['before_y2k'] = data.year.apply(lambda x: 1 if 1979<x<2000 else 0)
data['before_2010'] = data.year.apply(lambda x: 1 if 1999<x<2010 else 0)
data['since_2010'] = data.year.apply(lambda x: 1 if 2011<x else 0)

In [22]:
# Mining for keywords that could influence revenue
data['Keywords'] = data['Keywords'].fillna('')
data['woman_director'] = data.Keywords.apply(lambda x: 1 if "woman director" in x else 0)
data['independent_film'] = data.Keywords.apply(lambda x: 1 if "independent film" in x else 0)
data['base_novel'] = data.Keywords.apply(lambda x: 1 if "based on novel" in x else 0)
data['sequel'] = data.Keywords.apply(lambda x: 1 if "sequel" in x else 0)
data['credits'] = data.Keywords.apply(lambda x: 1 if "duringcreditsstinger" in x else 0)

print(data.woman_director.sum())

457


In [23]:
# See if title has changed from the original title
data['original_title'] = data.original_title.fillna('')
data['title'] = data.title.fillna('')
data['title_change']= np.where(data['title'] == data['original_title'], 0, 1)
print(data['title_change'].sum())

847


In [24]:
# Clean up remaining NAs
data['popularity'] = data.popularity.fillna("")
data['revenue'] = data.revenue.fillna("")
data['popularity'] = data['popularity'].astype('int64')

In [25]:
# Limit data frame to variables of interest
num_data = data[['id', 'budget', 'popularity','orig_eng', 'usa_prod', 'tt', 'franchise', 'Action', 'Adventure', 'Animation',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'tag_present', 'home_present', 'famous_action_act', 'will_s', 'samuel_l', 'leo', 'depp', 'freeman',
       'Foreign', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'prod_present', 'superhero', 'action_fran',
       'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western', 'film_collect', 'speak_eng', 'famous_actor', 'famous_actress','famous_director', 'famous_prod', 'revenue',
                'big_budget', 'longer_than_two_fifteen', 'month', 'year', 'summer', 'weekday', 'before_y2k', 'before_2010', 'since_2010', 'runtime',
                'woman_director', 'independent_film', 'base_novel', 'sequel', 'credits', 'title_change', 'deniro']]

In [26]:
# Split back into training and testing
train = num_data.loc[num_data['tt'] == 0]
test = num_data.loc[num_data['tt'] == 1]
# Clean data
train = train.drop('tt', 1)
test = test.drop('tt', 1)
train['revenue'] = train['revenue'].astype('int64')
train.applymap(np.isreal)

Unnamed: 0,id,budget,popularity,orig_eng,usa_prod,franchise,Action,Adventure,Animation,Comedy,...,before_2010,since_2010,runtime,woman_director,independent_film,base_novel,sequel,credits,title_change,deniro
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2996,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2997,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2998,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [27]:
# Prepare data for modeling 
x_train = train.loc[:,train.columns!='revenue']
x_train = x_train.drop('id', 1)
y_train = train['revenue']
x_test = test.loc[:,test.columns!='revenue']
x_test = x_test.drop('id', 1)

In [28]:
# Normalize the data
x_train = x_train.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
x_test = x_test.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

In [29]:
x_train.isna().sum()

budget                     0
popularity                 0
orig_eng                   0
usa_prod                   0
franchise                  0
Action                     0
Adventure                  0
Animation                  0
Comedy                     0
Crime                      0
Documentary                0
Drama                      0
Family                     0
Fantasy                    0
tag_present                0
home_present               0
famous_action_act          0
will_s                     0
samuel_l                   0
leo                        0
depp                       0
freeman                    0
Foreign                    0
History                    0
Horror                     0
Music                      0
Mystery                    0
Romance                    0
prod_present               0
superhero                  0
action_fran                0
Science Fiction            0
TV Movie                   0
Thriller                   0
War           

In [30]:
# Fill possible NAs in test set
x_test['TV Movie'] = x_test['TV Movie'].fillna(0)
x_test['month'] = x_test['month'].fillna(0)
x_test['year'] = x_test['year'].fillna(0)
x_test['weekday'] = x_test['weekday'].fillna(0)
x_test.isna().sum()

budget                     0
popularity                 0
orig_eng                   0
usa_prod                   0
franchise                  0
Action                     0
Adventure                  0
Animation                  0
Comedy                     0
Crime                      0
Documentary                0
Drama                      0
Family                     0
Fantasy                    0
tag_present                0
home_present               0
famous_action_act          0
will_s                     0
samuel_l                   0
leo                        0
depp                       0
freeman                    0
Foreign                    0
History                    0
Horror                     0
Music                      0
Mystery                    0
Romance                    0
prod_present               0
superhero                  0
action_fran                0
Science Fiction            0
TV Movie                   0
Thriller                   0
War           

In [31]:
# Start with decision tree classifier
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [32]:
# Make predictions with decision tree classifier
y_pred = model.predict(x_train)

In [33]:
# Baseline 1 RMSLE: 2.5
test_pred = model.predict(x_test)
print(test_pred)

[50673078  5000000 17530973 ...   777423    74918   182857]


In [None]:
# First submission RMSLE: 2.94175

In [34]:
# Create parameter grid for RandomForest cross-validation
param = { 'max_depth': [5, 10, 15, 20, 25, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [10, 25, 50, 100]}

In [87]:
# Random forest model identified with RandomizedSearchCV
forest = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = forest, param_distributions = param, n_iter = 4, cv = 4, verbose=2, random_state=42, n_jobs = -1, return_train_score = True)

rf_random.fit(x_train, y_train)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:    3.9s finished


RandomizedSearchCV(cv=4, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [88]:
# View results in order of performance
random_df = pd.DataFrame(rf_random.cv_results_).sort_values('mean_test_score', ascending = False)
random_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,params,...,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
2,0.240231,0.003902,0.012484,0.001115,50,5,2,sqrt,20,"{'n_estimators': 50, 'min_samples_split': 5, '...",...,0.704604,0.702167,0.035749,1,0.866996,0.861138,0.863663,0.864847,0.864161,0.002115
0,0.576629,0.008529,0.010501,0.000495,50,5,4,auto,10,"{'n_estimators': 50, 'min_samples_split': 5, '...",...,0.694651,0.701833,0.020514,2,0.853191,0.863521,0.855475,0.862032,0.858555,0.004331
3,0.562905,0.007968,0.010007,0.000714,50,10,4,auto,10,"{'n_estimators': 50, 'min_samples_split': 10, ...",...,0.692326,0.694973,0.025375,3,0.84261,0.847819,0.835143,0.853307,0.84472,0.006699
1,0.051858,0.003696,0.00462,0.000807,10,2,4,sqrt,25,"{'n_estimators': 10, 'min_samples_split': 2, '...",...,0.671702,0.673322,0.025693,4,0.792505,0.768648,0.773263,0.790013,0.781107,0.01032


In [89]:
# Fit best estimator to data
rf_model = rf_random.best_estimator_
rf_model.fit(x_train,y_train)
y_pred = rf_model.predict(x_train)

In [89]:
# Make predictions with random forest
test_pred = rf_model.predict(x_test)
print(test_pred)

[58812477.66885509  6566516.23483217 55534352.73071918 ...
 74251068.32273747 81059611.44950631 15466008.73756455]


In [None]:
# Second submission RMSLE: 2.74767

In [90]:
# Boosted decision tree
ada = AdaBoostRegressor(n_estimators=100, base_estimator=DecisionTreeRegressor(max_depth=10), learning_rate=1e-3, random_state=42)

ada.fit(x_train,y_train)

y_pred = ada.predict(x_train)

In [91]:
# Make predictions with AdaBoosted regressor
test_pred = ada.predict(x_test)
print(test_pred)

[ 7922983.2         4208492.96226415 19919542.66666667 ...
 49309719.53488372 26260552.25       10904523.60344828]


In [None]:
# Third submission RMSLE: 2.62765

In [35]:
# Create parameter grid for DecisionTreeRegressor
param = { 'max_depth': [5, 10, 15, 20, 25, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 3, 4, 5, 6],
 'min_samples_split': [2, 5, 10],
 'min_weight_fraction_leaf': [0.0, 0.1, 0.25, 0.5, 0.75]}

# Instantiate stratified k-fold
cv = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

In [36]:
# Decision tree regressor identified with RandomizedSearchCV
tree = DecisionTreeRegressor()
tree_random = RandomizedSearchCV(estimator = tree, param_distributions = param, n_iter = 4, cv = cv, verbose=2, random_state=42, n_jobs = -1, return_train_score = True)

tree_random.fit(x_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.9s finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                   error_score=nan,
                   estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort='deprecated',
                                                   ra...
                                

In [37]:
# Explore model options
tree_df = pd.DataFrame(tree_random.cv_results_).sort_values('mean_test_score', ascending = False)
tree_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_weight_fraction_leaf,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
2,0.041405,0.004681,0.002914,0.001225,0.0,5,4,auto,20.0,"{'min_weight_fraction_leaf': 0.0, 'min_samples...",...,0.512178,0.134074,1,0.87317,0.882681,0.850041,0.885674,0.872761,0.872865,0.012502
0,0.022818,0.013069,0.002594,0.001353,0.1,2,4,auto,25.0,"{'min_weight_fraction_leaf': 0.1, 'min_samples...",...,0.392091,0.029035,2,0.398781,0.414742,0.413401,0.406383,0.412944,0.40925,0.005981
1,0.008385,0.00185,0.0,0.0,0.75,2,2,sqrt,,"{'min_weight_fraction_leaf': 0.75, 'min_sample...",...,,,3,,,,,,,
3,0.006596,0.001376,0.0,0.0,0.75,2,3,log2,25.0,"{'min_weight_fraction_leaf': 0.75, 'min_sample...",...,,,4,,,,,,,


In [38]:
# Save model for use with boosted regressor
tree_model = tree_random.best_estimator_

In [39]:
# Create parameter grid for AdaBoostRegressor cross validation
param = {'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 1],
         'base_estimator': [tree_model],
         'n_estimators': [5, 10, 15, 20, 25, 50, 75, 100]}

In [40]:
# AdaBoostRegressor with RandomizedSearchCV
ada = AdaBoostRegressor()

ada_random = RandomizedSearchCV(estimator = ada, param_distributions = param, n_iter = 4, cv = cv, verbose=2, random_state=42, n_jobs = -1, return_train_score = True)

ada_random.fit(x_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    5.4s finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
                   error_score=nan,
                   estimator=AdaBoostRegressor(base_estimator=None,
                                               learning_rate=1.0, loss='linear',
                                               n_estimators=50,
                                               random_state=None),
                   iid='deprecated', n_iter=4, n_jobs=-1,
                   param_distributions={'base_estimator': [DecisionTreeRegressor(ccp_alpha=0.0,
                                                                                 criterion='mse',
                                                                                 max_depth=20,
                                                                                 m...
                                                                                 min_impurity_decrease=0.0,
                                                              

In [41]:
# View results in order of performance
ada_df = pd.DataFrame(ada_random.cv_results_).sort_values('mean_test_score', ascending = False)
ada_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_learning_rate,param_base_estimator,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
3,1.439661,0.15371,0.015388,0.002683,50,0.01,"DecisionTreeRegressor(ccp_alpha=0.0, criterion...","{'n_estimators': 50, 'learning_rate': 0.01, 'b...",0.751805,0.594714,...,0.697225,0.065673,1,0.941779,0.945766,0.937978,0.944119,0.942193,0.942367,0.002618
1,1.635464,0.03956,0.020188,0.003544,50,0.001,"DecisionTreeRegressor(ccp_alpha=0.0, criterion...","{'n_estimators': 50, 'learning_rate': 0.001, '...",0.75587,0.58592,...,0.69153,0.06488,2,0.924742,0.939566,0.925808,0.93067,0.9248,0.929117,0.005663
2,0.328052,0.033055,0.005602,0.001344,10,0.3,"DecisionTreeRegressor(ccp_alpha=0.0, criterion...","{'n_estimators': 10, 'learning_rate': 0.3, 'ba...",0.723569,0.490548,...,0.654603,0.089443,3,0.961746,0.967265,0.958569,0.966002,0.966616,0.964039,0.003349
0,0.165956,0.010435,0.004587,0.001352,5,0.001,"DecisionTreeRegressor(ccp_alpha=0.0, criterion...","{'n_estimators': 5, 'learning_rate': 0.001, 'b...",0.669025,0.438716,...,0.598032,0.091372,4,0.889534,0.896253,0.900682,0.892827,0.872321,0.890323,0.009731


In [42]:
# Fit best boosted model
ada_model = ada_random.best_estimator_
ada_model.fit(x_train,y_train)

y_pred = ada_model.predict(x_train)

In [43]:
# RMSLE performance on training data with boosted model
np.sqrt(mean_squared_log_error(y_train, y_pred))

1.8257385521892944

In [131]:
# Make predictions with boosted model
test_pred = ada_model.predict(x_test)
print(test_pred)

[17180747.4         2959067.66666667 10042097.75       ...
 27867209.25        7574496.           925690.2       ]


In [None]:
# Fourth submission RMSLE: 2.49591
# Baseline 1 RMSLE met.

In [370]:
# Try logistic regression
log = LogisticRegression()
log.fit(x_train,y_train)
test_pred = log.predict(x_test)

In [97]:
# Try K-Neighbors Regression
model_knn = KNeighborsRegressor(n_neighbors=6)
model_knn.fit(x_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                    weights='uniform')

In [98]:
test_pred = model_knn.predict(x_test)

In [None]:
# Try XGBoost regression
xg_reg = xg_reg = xgb.XGBRegressor(colsample_bytree = 0.3, learning_rate = 0.001,
                max_depth = 3, alpha = 10, n_estimators = 15000)
xg_reg.fit(x_train, y_train)

In [613]:
test_pred = xg_reg.predict(x_test)

In [132]:
# This chunk of code is run for every modeling attempt to create a valid Kaggle submission for RMLSE testing
test["revenue"] = test_pred
result = test[["id","revenue"]]
result.to_csv("adatree_clean.csv", index=False)
result.head()

Unnamed: 0,id,revenue
3000,3001,17180750.0
3001,3002,2959068.0
3002,3003,10042100.0
3003,3004,10371890.0
3004,3005,4679764.0


In [81]:
# Following the development of the AdaBoost decision tree, some additional feature engineering was done to reach the second baseline of 2.0
# Creating the franchise variable - RMSLE: 2.48561
# Modifying the production variable - RMSLE: 2.46957
# Creating the tagline dummy - RMSLE: 2.53704
# Creating the production dummy - RMSLE: 2.49776
# Creating the homepage dummy - RMSLE: 2.54564
# Add additional CV values for AdaBoost tuning - RMSLE: 2.48674
# Add additional CV values for DecisionTreeRegressor tuning - RMSLE: 2.50032
# Create budget and keyword variable with K-Neighbors regression - RMSLE: 2.51089
# Create variable for runtime and try normalization - RMSLE: 2.35785
# Creating date variables - RMSLE: 2.31145
# Try with cross-validated logistic regression - RMSLE: 3.97993
# More runtime feature engineering - RMSLE: 2.30573
# Removing ID variable - RMSLE: 2.28516
# Removing keyword dummy and creating separate actor variables - RMSLE: 2.27979
# Title change variable - RMSLE: 2.29110
# Modifying budget information - RMSLE: 2.28542
# Keyword mining - RMSLE: 2.27859