In [2]:
## Data Preprocessing
import pandas as pd
import numpy as np

## Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Modeling
from sklearn.tree import DecisionTreeRegressor
from sklearn import ensemble
from sklearn.model_selection import train_test_split, \
                                    cross_val_score, \
                                    GridSearchCV

## Accuracy calculation
# from sklearn import metrics
# from sklearn.metrics import auc, \
#                             confusion_matrix, \
#                             classification_report, \
#                             roc_curve, \
#                             roc_auc_score, \
#                             precision_recall_curve, \
#                             average_precision_score, \
#                             accuracy_score, \
#                             balanced_accuracy_score, \
#                             precision_score, \
#                             recall_score

## Generate Dataset
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.datasets import make_gaussian_quantiles

## Find out execution time
from datetime import datetime

## Graph visualization
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

## Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
X,Y = make_regression(n_samples = 10000, n_features = 12, 
                     n_informative = 10, random_state=22)
print(X.shape)
print(Y.shape)

(10000, 12)
(10000,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3)

# Decision Tree

In [6]:
dt = DecisionTreeRegressor(random_state=42)

startTime = datetime.now()

param_grid = {
#     'max_features': list(range(1,X_train.shape[1])),
    'max_depth' : [4,5,6,7,8],
#     'min_samples_split': np.linspace(0.1, 1.0, 5, endpoint=True),
#     'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)
}

CV_dt = GridSearchCV(estimator = dt, param_grid = param_grid, cv = 5)
CV_dt.fit(X_train, y_train)

print(CV_dt.best_params_)

print(datetime.now() - startTime)

{'max_depth': 8}
0:00:00.868638


In [8]:
dt = DecisionTreeRegressor(
    max_depth = 8,
#     max_features = 11,
#     min_samples_leaf = 0.1,
#     min_samples_split = 0.1
)

# Train Decision Tree Classifer
dt = dt.fit(X_train,y_train)

#Predict the response for test dataset
pred = dt.predict(X_test)

dt_score = dt.score(X_test,y_test)
print('Score of decision tree classifier: ', dt_score)

Score of decision tree classifier:  0.6234561245627743


# Gradient Boosting

In [9]:
# np.linspace(2, 10, 9, endpoint=True)

In [10]:
gb = ensemble.GradientBoostingRegressor(random_state=42)

startTime = datetime.now()

param_grid = {
#     'max_features': list(range(1,X_train.shape[1])),
    'max_depth' : np.linspace(2, 10, 9, endpoint=True),
#     'min_samples_split': np.linspace(0.1, 1.0, 5, endpoint=True),
#     'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)
}

CV_gb = GridSearchCV(estimator = gb, param_grid = param_grid, cv = 5)
CV_gb.fit(X_train, y_train)

print(CV_gb.best_params_)

print(datetime.now() - startTime)

{'max_depth': 5.0}
0:01:16.719248


In [11]:
gb = ensemble.GradientBoostingRegressor(
    max_depth = 5
)

gb = gb.fit(X_train,y_train)

pred = gb.predict(X_test)

gb_score = gb.score(X_test,y_test)
print('Score of gradient boosting classifier: ', gb_score)

Score of gradient boosting classifier:  0.9617022000653468


# Random Forest

In [13]:
# np.linspace(10, 100, 3, endpoint=True)

In [14]:
rf = ensemble.RandomForestRegressor(random_state=42)

startTime = datetime.now()

param_grid = {
#     'max_features': list(range(1,X_train.shape[1])),
    'max_depth' : np.linspace(2, 10, 9, endpoint=True),
#     'n_estimators': np.linspace(10, 100, 3, endpoint=True)
}

CV_rf = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 5)
CV_rf.fit(X_train, y_train)

print(CV_rf.best_params_)

print(datetime.now() - startTime)

{'max_depth': 10.0}
0:00:09.607297


In [15]:
rf = ensemble.RandomForestRegressor(
    max_depth = 10
)

rf = rf.fit(X_train,y_train)

pred = rf.predict(X_test)

rf_score = rf.score(X_test,y_test)
print('Score of random forest classifier: ', rf_score)

Score of random forest classifier:  0.8279334920087433


# AdaBoost 

In [16]:
score = np.array([])

startTime = datetime.now()

for j in np.arange(3) + 4: # [4, 5, 6]
    Ada = ensemble.AdaBoostRegressor(DecisionTreeRegressor(max_depth = j), n_estimators = 100)
    mod_cv = cross_val_score(Ada, X_train, y_train, cv = 10)
    score = np.append(score, np.mean(mod_cv))
    print("j->", j, "Mean->", np.mean(mod_cv))
param = np.argmax(score) + 4
print('best parameter: ', param)
print(datetime.now() - startTime)

j-> 4 Mean-> 0.8406803922291426
j-> 5 Mean-> 0.8542573278519704
j-> 6 Mean-> 0.8620102160048028
best parameter:  6
0:01:07.488611


In [17]:
Ada = ensemble.AdaBoostRegressor(DecisionTreeRegressor(max_depth = param), n_estimators = 100)

Ada = Ada.fit(X_train,y_train)

pred = Ada.predict(X_test)

Ada_score = Ada.score(X_test,y_test)
print('Score of AdaBoost classifier: ', Ada_score)

Score of AdaBoost classifier:  0.8626166040694385


# Best Model Selection

In [18]:
print('Score of decision tree classifier: ', dt_score)
print('Score of gradient boosting classifier: ', gb_score)
print('Score of random forest classifier: ', rf_score)
print('Score of AdaBoost classifier: ', Ada_score)

Score of decision tree classifier:  0.6234561245627743
Score of gradient boosting classifier:  0.9617022000653468
Score of random forest classifier:  0.8279334920087433
Score of AdaBoost classifier:  0.8626166040694385
