In [1]:
import pandas as pd 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)

# https://www.kaggle.com/c/ga-customer-revenue-prediction

In [2]:
# Read data_pre.csv, using the file's first column as the DataFrame index.
data = pd.read_csv(
    "data_pre.csv",
    index_col=0,
)

In [3]:
# Regression target: log(1 + revenue). np.log1p is the dedicated NumPy routine for
# log(1 + x) and stays accurate when x is close to zero.
data['logRevenue'] = np.log1p(data['totals.transactionRevenue'])

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the logRevenue column.
data['logRevenue'].describe()

count    18514.000000
mean        17.770575
std          1.186022
min          9.210440
25%         16.953935
50%         17.645455
75%         18.420681
max         23.864375
Name: logRevenue, dtype: float64

In [5]:
# Columns that must not be used as predictors: the raw and log-transformed target,
# the other transaction totals, and the date / visitor / visit columns.
non_feature_columns = [
    'totals.transactionRevenue',
    'date',
    'logRevenue',
    'totals.totalTransactionRevenue',
    'fullVisitorId',
    'visitId',
    'totals.transactions',
]

# Target vector and feature matrix; `data` itself is left unmodified.
y = data['logRevenue']
X = data.drop(non_feature_columns, axis=1)

In [6]:
# Preview the first rows of the feature matrix to check which columns remain after the drop.
X.head()

Unnamed: 0,channelGrouping,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,0,4,1508200705,0,0,False,0,0,0,0,0,0,0,0,9,0.0,9,13.0,261.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0
1,1,11,1508192636,0,0,False,1,1,0,0,1,0,1,0,15,0.0,12,38.0,285.0,0,0,0,0,0.0,0,0,0.0,1,1,1,1,1
2,0,6,1508162218,1,1,True,2,2,0,0,2,1,1,0,15,0.0,15,42.0,1044.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0
3,1,17,1508189401,0,0,False,0,2,0,0,2,0,1,0,18,0.0,16,77.0,514.0,0,0,0,0,0.0,0,0,0.0,0,1,1,1,1
4,1,1,1508190484,0,0,False,0,0,0,0,0,0,0,0,21,1.0,20,62.0,487.0,0,0,0,0,0.0,0,0,0.0,1,1,1,1,1


In [7]:
# Hold out 30% of the rows for testing. The fixed random_state makes the split, and
# therefore every score computed from it, identical on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Using Random Forest

In [8]:
# Base estimator for the grid search: a forest of 100 trees. The seed makes the
# forest's random sampling repeatable between runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Grid of candidate tree depths: 5, 10, 15, 20, 25, 30.
param = {'max_depth': list(range(5, 31, 5))}

In [None]:
# 10-fold cross-validated search over `param`.
# The target is already log(1 + revenue), so candidates are ranked by plain (negated) MSE.
# 'neg_mean_squared_log_error' would apply a log to the target a second time and would
# not agree with the RMSE that the following cells report for the fitted model.
rf_cv = GridSearchCV(rf, param, cv=10, verbose=True, scoring='neg_mean_squared_error')

rf_cv.fit(X_train, y_train)

In [None]:
# `grid_scores_` is no longer available on GridSearchCV; `cv_results_` carries the
# per-candidate results as a dict of arrays. Show the key columns, best candidate first.
cv_summary = pd.DataFrame(rf_cv.cv_results_)
cv_summary[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values('rank_test_score')

In [None]:
# Keep the estimator that the grid search selected as best.
best_model = rf_cv.best_estimator_
# Print its repr so the chosen configuration is visible in the cell output.
print(best_model)

In [None]:
# Predictions on the training rows (used for the training RMSE).
predicted_train = best_model.predict(X_train)
# The estimator's default score on the training set (R² for scikit-learn regressors).
best_model.score(X_train,y_train)

In [None]:
# Training error: root of the mean squared error between targets and predictions.
mse_train = mean_squared_error(y_train, predicted_train)
rmse_train = np.sqrt(mse_train)
print("RMSE: %f" % rmse_train)

In [None]:
# NOTE: despite its name, `predicted_train` is rebound here to the predictions for
# X_test; the cells below read it as the test-set predictions.
predicted_train = best_model.predict(X_test)
# The estimator's default score on the held-out set (R² for scikit-learn regressors).
best_model.score(X_test,y_test)

In [None]:
# Held-out error; `predicted_train` holds the X_test predictions at this point.
mse_test = mean_squared_error(y_test, predicted_train)
rmse = np.sqrt(mse_test)
print("RMSE: %f" % rmse)

In [None]:
# Side-by-side table of actual vs. predicted log revenue for the test rows.
# y_test is a Series, so it has to be turned into a DataFrame first: assigning
# `series['predicted'] = array` addresses a single label and does not add a column.
# `predicted_train` holds the X_test predictions here (it was reassigned from
# best_model.predict(X_test)), so its length matches y_test.
predicted_vs_actual = y_test.to_frame(name='actual')
predicted_vs_actual['predicted'] = np.asarray(predicted_train)

predicted_vs_actual.head()

In [None]:
# Size of the held-out target vector.
y_test.shape

In [None]:
# One row per feature with the fitted model's importance value, largest first.
feature_importances = (
    pd.DataFrame(
        {'importance': best_model.feature_importances_},
        index=X_train.columns,
    )
    .sort_values(by='importance', ascending=False)
)
feature_importances.head()

# Using XGBoost

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [None]:
# Gradient-boosted regressor with squared-error loss.
# 'reg:squarederror' is the current name of the objective formerly called 'reg:linear'
# (deprecated alias). The L1 penalty is passed as `reg_alpha`, the documented
# scikit-learn-API parameter, rather than through the `alpha` keyword alias.
xg_reg = XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                      max_depth=10, reg_alpha=10, n_estimators=100)

In [None]:
# Fit on the training split and predict the held-out rows (fit() returns the estimator).
preds = xg_reg.fit(X_train, y_train).predict(X_test)

In [None]:
# Held-out RMSE of the XGBoost model (overwrites the `rmse` computed for the random forest).
mse_xgb = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse_xgb)
print("RMSE: %f" % rmse)

In [None]:
# Model Selector


In [None]:
#from sklearn.feature_selection import SelectFromModel

In [None]:
#sel = SelectFromModel(RandomForestClassifier(n_estimators = 1))
#sel.fit(X_train, y_train)

In [None]:
#sel.get_support()

In [None]:
#selected_feat= X_train.columns[(sel.get_support())]
#len(selected_feat)

In [None]:
#print(selected_feat)

In [None]:
#pd.Series(sel.estimator_.feature_importances_.ravel()).hist()