In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import decomposition
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Shared estimator instances reused by later cells.
treereg = DecisionTreeRegressor(random_state=1)
rfreg = RandomForestRegressor()
linreg = LinearRegression()

## Data Preparation

In [2]:
# Load the raw campaign table; the first CSV column becomes the index.
campaigns = pd.read_csv('campaigns.csv', index_col=0)

# Parse the date and numeric columns needed for the derived features below.
campaigns['campaign_start'] = pd.to_datetime(campaigns['campaign_start'])
campaigns['campaign_end'] = pd.to_datetime(campaigns['campaign_end'])
campaigns['roas'] = pd.to_numeric(campaigns['roas'])

# Calendar features: start/end month and campaign duration in whole days.
campaigns['startmonth'] = campaigns['campaign_start'].dt.month
campaigns['endmonth'] = campaigns['campaign_end'].dt.month
campaigns['campaignlength'] = (campaigns['campaign_end'] - campaigns['campaign_start']).dt.days

# Keep only campaigns lasting at least one day, so the division below never
# divides by zero. `.copy()` detaches the filtered rows from the original
# frame, making the column assignments that follow plain writes rather than
# writes to a slice (SettingWithCopyWarning).
campaigns = campaigns[campaigns['campaignlength'] >= 1].copy()
campaigns['dailyspend'] = campaigns['spend'] / campaigns['campaignlength']

# Median-impute the sparse product attributes. The result is assigned back
# explicitly: `campaigns.col.fillna(..., inplace=True)` mutates a temporary
# column object and is not guaranteed to update the frame (it is a no-op
# under pandas Copy-on-Write), which would let the dropna below silently
# discard those rows instead.
campaigns['rating'] = campaigns['rating'].fillna(campaigns['rating'].median())
campaigns['reviews'] = campaigns['reviews'].fillna(campaigns['reviews'].median())
campaigns['price'] = campaigns['price'].fillna(campaigns['price'].median())

# Drop any row that still has a missing value in any column, then remove the
# raw date columns now that the derived features exist.
campaigns = campaigns.dropna(axis=0, how='any')
campaigns = campaigns.drop(['campaign_start', 'campaign_end'], axis=1)

In [3]:
# One-hot encode the two categorical columns.
advertiser_dummies = pd.get_dummies(campaigns['advertiser_name'], prefix='advertiser')
cat_dummies = pd.get_dummies(campaigns['category'], prefix='category')

# Append the indicator columns to the right of the original columns:
# category indicators first, advertiser indicators last.
campaignswithdummies = pd.concat(
    [campaigns, cat_dummies, advertiser_dummies],
    axis=1,
)

In [4]:
# Base (non-dummy) predictors and the regression target.
feature_cols = [
    'retargeting',
    'price',
    'rating',
    'reviews',
    'startmonth',
    'endmonth',
    'campaignlength',
    'dailyspend',
]
X = campaigns.loc[:, feature_cols]
y = campaigns['roas']

In [5]:
# Dummy-expanded design matrix: everything except identifiers, the raw
# categorical columns, and the columns excluded as non-features
# (spend, sales) or used as the target (roas).
X_dummies = campaignswithdummies.drop(
    columns=[
        'advertiser_name',
        'ad_campaign_id',
        'spend',
        'sales',
        'roas',
        'category',
    ]
)
y_dummies = campaignswithdummies['roas']

### Finding n_estimators, max_features from Random Forest

#### without dummies

In [None]:
# Candidate forest sizes: 10, 20, ..., 300.
estimator_range = range(10, 310, 10)

# Mean cross-validated RMSE, one entry per candidate in estimator_range.
RMSE_scores = []

# 5-fold cross-validation for every candidate size (WARNING: SLOW!)
for estimator in estimator_range:
    rfreg = RandomForestRegressor(n_estimators=estimator, random_state=1)
    # The scorer returns negated MSE, so flip the sign before taking the root.
    MSE_scores = cross_val_score(rfreg, X, y, scoring='neg_mean_squared_error', cv=5)
    RMSE_scores.append(np.sqrt(-MSE_scores).mean())

# RMSE (y-axis) as a function of n_estimators (x-axis).
plt.plot(estimator_range, RMSE_scores)
plt.xlabel('n_estimators')
plt.ylabel('RMSE (lower is better)')

# (lowest RMSE, n_estimators that achieved it)
min(zip(RMSE_scores, estimator_range))

In [None]:
# Candidate values for max_features: every count from 1 up to and including
# the number of columns in X, so the "consider all features at each split"
# setting is part of the search as well.
feature_range = range(1, X.shape[1] + 1)

# list to store the average RMSE for each value of max_features
RMSE_scores = []

# use 10-fold cross-validation with each value of max_features (WARNING: SLOW!)
for feature in feature_range:
    rfreg = RandomForestRegressor(n_estimators=50, max_features=feature, random_state=1)
    # The scorer returns negated MSE, so flip the sign before taking the root.
    MSE_scores = cross_val_score(rfreg, X, y, cv=10, scoring='neg_mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

# plot max_features (x-axis) versus RMSE (y-axis)
plt.plot(feature_range, RMSE_scores)
plt.xlabel('max_features')
plt.ylabel('RMSE (lower is better)')

# show the best RMSE and the corresponding max_features
sorted(zip(RMSE_scores, feature_range))[0]

#### With dummies

In [None]:
# Candidate forest sizes for the dummy-expanded feature set: 10, 20, ..., 300.
estimator_range_dum = range(10, 310, 10)

# list to store the average RMSE for each value of n_estimators
RMSE_scores_dum = []

# use 5-fold cross-validation with each value of n_estimators (WARNING: SLOW!)
for estimator in estimator_range_dum:
    rfreg = RandomForestRegressor(n_estimators=estimator, random_state=1)
    MSE_scores_dum = cross_val_score(rfreg, X_dummies, y_dummies, cv=5, scoring='neg_mean_squared_error')
    # Score THIS fit: the RMSE must be computed from MSE_scores_dum, not from
    # an MSE_scores array left in the kernel by a different search.
    RMSE_scores_dum.append(np.mean(np.sqrt(-MSE_scores_dum)))

# plot n_estimators (x-axis) versus RMSE (y-axis)
plt.plot(estimator_range_dum, RMSE_scores_dum)
plt.xlabel('n_estimators')
plt.ylabel('RMSE (lower is better)')

# show the best RMSE and the corresponding n_estimators
sorted(zip(RMSE_scores_dum, estimator_range_dum))[0]

In [None]:
# list of values to try for max_features
# NOTE(review): the upper bound 38 is hard-coded; confirm it is the intended
# search width for the dummy-expanded feature set.
feature_range_dum = range(1, 38)

# list to store the average RMSE for each value of max_features
RMSE_scores_dum = []

# use 10-fold cross-validation with each value of max_features (WARNING: SLOW!)
for feature in feature_range_dum:
    rfreg = RandomForestRegressor(n_estimators=10, max_features=feature, random_state=1)
    MSE_scores_dum = cross_val_score(rfreg, X_dummies, y_dummies, cv=10, scoring='neg_mean_squared_error')
    # Score THIS fit: the RMSE must be computed from MSE_scores_dum, not from
    # an MSE_scores array left in the kernel by a different search.
    RMSE_scores_dum.append(np.mean(np.sqrt(-MSE_scores_dum)))

# plot max_features (x-axis) versus RMSE (y-axis)
plt.plot(feature_range_dum, RMSE_scores_dum)
plt.xlabel('max_features')
plt.ylabel('RMSE (lower is better)')

# show the best RMSE and the corresponding max_features
sorted(zip(RMSE_scores_dum, feature_range_dum))[0]

In [None]:
# Candidate values for n_neighbors: 320, 321, ..., 344.
neighbor_range = range(320, 345, 1)

# Mean cross-validated RMSE, one entry per candidate in neighbor_range.
RMSE_scores = []

# 5-fold cross-validation for every candidate n_neighbors (WARNING: SLOW!)
for neighbor in neighbor_range:
    knn = KNeighborsRegressor(n_neighbors=neighbor)
    # The scorer returns negated MSE, so flip the sign before taking the root.
    MSE_scores = cross_val_score(knn, X, y, scoring='neg_mean_squared_error', cv=5)
    RMSE_scores.append(np.sqrt(-MSE_scores).mean())

# RMSE (y-axis) as a function of n_neighbors (x-axis).
plt.plot(neighbor_range, RMSE_scores)
plt.xlabel('neighbor_range')
plt.ylabel('RMSE (lower is better)')

# (lowest RMSE, n_neighbors that achieved it)
min(zip(RMSE_scores, neighbor_range))

## Important Features

In [7]:
# Feature importances on the base (non-dummy) features.
# Forest settings: n_estimators = 50, max_features = 1
rfreg = RandomForestRegressor(oob_score=True, max_features=1, n_estimators=50)
rfreg.fit(X, y)

# One row per feature, most important first.
pd.DataFrame(
    {
        'feature': feature_cols,
        'importance': rfreg.feature_importances_,
    }
).sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
3,reviews,0.225084
1,price,0.209003
7,dailyspend,0.172604
6,campaignlength,0.154545
2,rating,0.122989
4,startmonth,0.058322
5,endmonth,0.038461
0,retargeting,0.018992


In [8]:
# Feature importances on the dummy-expanded feature set.
# Forest settings: n_estimators = 10, max_features = 1
rfreg = RandomForestRegressor(oob_score=True, max_features=1, n_estimators=10)
rfreg.fit(X_dummies, y_dummies)

# Top 20 features, most important first.
pd.DataFrame(
    {
        'feature': X_dummies.columns,
        'importance': rfreg.feature_importances_,
    }
).sort_values(by='importance', ascending=False).head(20)

  warn("Some inputs do not have OOB scores. "


Unnamed: 0,feature,importance
2,reviews,0.159952
647,advertiser_P&G Scale,0.125246
1,price,0.112775
3,rating,0.102271
7,dailyspend,0.091228
6,campaignlength,0.066346
4,startmonth,0.047557
12,category_Baby Products,0.046091
608,advertiser_Nintendo - US,0.033416
5,endmonth,0.023381


## Model RMSE Compared

### Linear regression

In [9]:
# Linear regression on a three-column subset of the base features,
# scored by mean 10-fold cross-validated RMSE.
X_important = X.loc[:, ['dailyspend', 'rating', 'endmonth']]
scores = cross_val_score(linreg, X_important, y, scoring='neg_mean_squared_error', cv=10)
np.sqrt(-scores).mean()

8.2410628082427735

In [10]:
# Linear regression on a five-column subset taken from the dummy-expanded
# frame, scored by mean 10-fold cross-validated RMSE.
X_important_dum = X_dummies.loc[:, ['dailyspend', 'rating', 'reviews', 'campaignlength', 'price']]
scores = cross_val_score(linreg, X_important_dum, y_dummies, scoring='neg_mean_squared_error', cv=10)
np.sqrt(-scores).mean()

8.4071000557490354

### Random forest

In [11]:
# Random forest on every dummy-expanded feature,
# scored by mean 10-fold cross-validated RMSE.
rfreg = RandomForestRegressor(oob_score=True, max_features=1, n_estimators=10)
scores = cross_val_score(rfreg, X_dummies, y_dummies, scoring='neg_mean_squared_error', cv=10)
np.sqrt(-scores).mean()

8.0319549336602822

In [12]:
# Random forest on a six-column subset of the base features,
# scored by mean 10-fold cross-validated RMSE.
# Note: this rebinds X_important, so later cells see this six-column frame.
X_important = X.loc[:, ['reviews', 'dailyspend', 'rating', 'price', 'campaignlength', 'endmonth']]
rfreg = RandomForestRegressor(oob_score=True, max_features=1, n_estimators=50)
scores = cross_val_score(rfreg, X_important, y, scoring='neg_mean_squared_error', cv=10)
np.sqrt(-scores).mean()

8.0273066834033955

### KNN

In [13]:
# KNN on the base features (author's note: the non-dummy features scored
# better than the dummy-expanded ones), mean 10-fold cross-validated RMSE.
knn = KNeighborsRegressor(n_neighbors=335)
scores = cross_val_score(knn, X, y, scoring='neg_mean_squared_error', cv=10)
np.sqrt(-scores).mean()

8.1459852516879945

## Ensembling

**How does bagging work (for decision trees)?**

1. Grow B trees using B bootstrap samples from the training data.
2. Train each tree on its bootstrap sample and make predictions.
3. Combine the predictions:
    - Average the predictions for **regression trees**
    - Take a vote for **classification trees**

Notes:

- **Each bootstrap sample** should be the same size as the original training set.
- **B** should be a large enough value that the error seems to have "stabilized".
- The trees are **grown deep** so that they have low bias/high variance.

Bagging increases predictive accuracy by **reducing the variance**. This is similar to how cross-validation reduces the variance of a single train/test split (when estimating out-of-sample error) by splitting many times and averaging the results.

In [17]:
# train_test_split lives in sklearn.model_selection (the same module this
# notebook already imports cross_val_score from); the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split

# Hold-out split of the base features; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)



In [29]:
# Hold-out split of the dummy-expanded features (same seed as the other splits).
X_dumtrain, X_dumtest, y_dumtrain, y_dumtest = train_test_split(
    X_dummies,
    y_dummies,
    random_state=1,
)

In [35]:
# Hold-out split of the selected base features. X_important is assigned in
# more than one earlier cell; whichever assignment ran last is what gets split.
X_imptrain, X_imptest, y_imptrain, y_imptest = train_test_split(
    X_important,
    y,
    random_state=1,
)

In [40]:
# Hold-out split of the selected columns taken from the dummy-expanded frame.
X_dumimptrain, X_dumimptest, y_dumimptrain, y_dumimptest = train_test_split(
    X_important_dum,
    y_dummies,
    random_state=1,
)

In [30]:
# Null-model predictions: a float array shaped like y_test in which every
# entry is the mean of y_test.
y_null = np.full_like(y_test, y_test.mean(), dtype=float)

In [28]:
# RMSE of the constant null prediction against y_test.
np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_null))

6.8515231293927723

### Bagging

### All features w/o dummies

In [21]:
# Bag 500 default decision trees on all base features; report hold-out RMSE.
bagreg = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_train, y_train).predict(X_test)
np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred))

6.6074077556767232

In [23]:
# Bag 335 default KNN regressors on all base features; report hold-out RMSE.
# NOTE(review): 335 is the n_neighbors value used by the standalone KNN model,
# but here it sets the number of bagged estimators while KNeighborsRegressor()
# keeps its defaults — confirm whether n_neighbors=335 was intended instead.
bagreg = BaggingRegressor(
    KNeighborsRegressor(),
    n_estimators=335,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_train, y_train).predict(X_test)
np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred))

7.3670186359618448

In [24]:
# Bag 500 linear regressions on all base features; report hold-out RMSE.
bagreg = BaggingRegressor(
    LinearRegression(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_train, y_train).predict(X_test)
np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred))

7.6683413499977737

In [25]:
# Bag 500 default random forests on all base features; report hold-out RMSE.
bagreg = BaggingRegressor(
    RandomForestRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_train, y_train).predict(X_test)
np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred))

6.7076605984077737

### All features w/ dummies

In [31]:
# Bag 500 default decision trees on the dummy-expanded features; hold-out RMSE.
bagreg = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_dumtrain, y_dumtrain).predict(X_dumtest)
np.sqrt(metrics.mean_squared_error(y_true=y_dumtest, y_pred=y_pred))

4.8814046887107745

In [32]:
# Bag 500 default KNN regressors on the dummy-expanded features; hold-out RMSE.
bagreg = BaggingRegressor(
    KNeighborsRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_dumtrain, y_dumtrain).predict(X_dumtest)
np.sqrt(metrics.mean_squared_error(y_true=y_dumtest, y_pred=y_pred))

7.294597429418368

In [33]:
# Bag 500 linear regressions on the dummy-expanded features; hold-out RMSE.
# NOTE(review): the RMSE recorded for this cell is orders of magnitude larger
# than every other model's; presumably the many one-hot columns make the
# least-squares fit unstable — investigate before relying on this number.
bagreg = BaggingRegressor(
    LinearRegression(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_dumtrain, y_dumtrain).predict(X_dumtest)
np.sqrt(metrics.mean_squared_error(y_true=y_dumtest, y_pred=y_pred))

25581479.72319581

In [34]:
# Bag 500 default random forests on the dummy-expanded features; hold-out RMSE.
bagreg = BaggingRegressor(
    RandomForestRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_dumtrain, y_dumtrain).predict(X_dumtest)
np.sqrt(metrics.mean_squared_error(y_true=y_dumtest, y_pred=y_pred))

5.2887279580036042

### Important features w/o dummies

In [36]:
# Bag 500 default decision trees on the selected base features; hold-out RMSE.
bagreg = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_imptrain, y_imptrain).predict(X_imptest)
np.sqrt(metrics.mean_squared_error(y_true=y_imptest, y_pred=y_pred))

6.6099138576434253

In [37]:
# Bag 335 default KNN regressors on the selected base features; hold-out RMSE.
# NOTE(review): 335 is the n_neighbors value used by the standalone KNN model,
# but here it sets the number of bagged estimators while KNeighborsRegressor()
# keeps its defaults — confirm whether n_neighbors=335 was intended instead.
bagreg = BaggingRegressor(
    KNeighborsRegressor(),
    n_estimators=335,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_imptrain, y_imptrain).predict(X_imptest)
np.sqrt(metrics.mean_squared_error(y_true=y_imptest, y_pred=y_pred))

7.3685318525419179

In [38]:
# Bag 500 linear regressions on the selected base features; hold-out RMSE.
bagreg = BaggingRegressor(
    LinearRegression(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_imptrain, y_imptrain).predict(X_imptest)
np.sqrt(metrics.mean_squared_error(y_true=y_imptest, y_pred=y_pred))

7.4422932518938456

In [39]:
# Bag 500 default random forests on the selected base features; hold-out RMSE.
bagreg = BaggingRegressor(
    RandomForestRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_imptrain, y_imptrain).predict(X_imptest)
np.sqrt(metrics.mean_squared_error(y_true=y_imptest, y_pred=y_pred))

6.7150209064034767

### Important w/ dummies

In [41]:
# Bag 500 default decision trees on the columns selected from the
# dummy-expanded frame; report hold-out RMSE.
bagreg = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_dumimptrain, y_dumimptrain).predict(X_dumimptest)
np.sqrt(metrics.mean_squared_error(y_true=y_dumimptest, y_pred=y_pred))

6.6464306191292275

In [42]:
# Bag 500 default KNN regressors on the columns selected from the
# dummy-expanded frame; report hold-out RMSE.
bagreg = BaggingRegressor(
    KNeighborsRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_dumimptrain, y_dumimptrain).predict(X_dumimptest)
np.sqrt(metrics.mean_squared_error(y_true=y_dumimptest, y_pred=y_pred))

7.2962126265375176

In [43]:
# Bag 500 linear regressions on the columns selected from the
# dummy-expanded frame; report hold-out RMSE.
bagreg = BaggingRegressor(
    LinearRegression(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_dumimptrain, y_dumimptrain).predict(X_dumimptest)
np.sqrt(metrics.mean_squared_error(y_true=y_dumimptest, y_pred=y_pred))

7.4466123118452945

In [44]:
# Bag 500 default random forests on the columns selected from the
# dummy-expanded frame; report hold-out RMSE.
bagreg = BaggingRegressor(
    RandomForestRegressor(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1,
)
y_pred = bagreg.fit(X_dumimptrain, y_dumimptrain).predict(X_dumimptest)
np.sqrt(metrics.mean_squared_error(y_true=y_dumimptest, y_pred=y_pred))

6.7225502990495105