In [1]:
import pandas as pd
import numpy as np
import re, string, time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data = pd.read_csv('preprocessed.csv')

# Common Functions

In [3]:
def _preprocess(dtrain, dtest):
    # replace np.inf to np.nan
    # dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    # dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest


def _preprocess_log(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest

# passing split_data according to features
def train_cv(clf, model, split_data, preprocess = 'linear'):
    X_train, X_test, y_train, y_test = split_data
    print()
    if preprocess == 'log':
        X_train, X_test = _preprocess_log(X_train, X_test)
    elif preprocess == 'linear':
        X_train, X_test = _preprocess(X_train, X_test)
    elif preprocess == 'no_preprocess': 
        #use original data
        pass
        
    cv_scores, n_folds = [], 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=816)
   
    
    for i, (train_ind, val_ind) in enumerate(skf.split(X_train, y_train)):
        print("Running Fold", i + 1, "/", n_folds)
        start = time.time()
        
        train_x, val_x = X_train.iloc[train_ind, :], X_train.iloc[val_ind, :]
        train_y, val_y = y_train.iloc[train_ind], y_train.iloc[val_ind]
        if model == 'svm':
            clf, train_loss, val_loss = svm_run_model(svm_clf, (train_x, train_y), (val_x, val_y))
        if model == 'gb':
            clf, train_loss, val_loss = gb_run_model(gb_clf, (train_x, train_y), (val_x, val_y))
        if model == 'dt':
            clf, train_loss, val_loss = dt_run_model(dt_clf, (train_x, train_y), (val_x, val_y))
        
        print("train_loss: {0:.6f}, val_loss: {1:.6f}".format(train_loss, val_loss), end="\t")
        
        end = time.time()
        m, s = divmod(end-start, 60)
        h, m = divmod(m, 60)

        print("time elapsed: %d:%02d:%02d" % (h, m, s))
        y_pred = clf.predict(val_x)
        accuracy_score = metrics.accuracy_score(val_y, y_pred)
        f_score = metrics.f1_score(val_y, y_pred, average='macro')
        cv_scores.append([train_loss, val_loss, f_score, accuracy_score])
        
        print("accuracy score: ", accuracy_score)
        print("f score: ", f_score)
        
    mean_train_loss = np.mean([cv_scores[i][0] for i in range(len(cv_scores))])
    mean_val_loss = np.mean([cv_scores[i][1] for i in range(len(cv_scores))])
    
    print("train_loss mean: {0:.6f}, val_loss mean: {1:.6f}".format(mean_train_loss, mean_val_loss))

    return clf, cv_scores

# Training the models

## 1. GradientBoostingRegressor

In [4]:
gb_features = ['bathrooms', 'bedrooms', 'price']
X = data[gb_features]
y = data['interest_level']

In [5]:
def split_data():
    return train_test_split(X,y)

def gb_run_model(clf, dtrain,dtest=None):
    if dtest:
        clf.fit(dtrain[0],dtrain[1])
        y_train_pred, y_test_pred = clf.predict_proba(dtrain[0]), clf.predict_proba(dtest[0])
        y_train_loss, y_test_loss = log_loss(dtrain[1], y_train_pred), log_loss(dtest[1], y_test_pred)
        return clf, y_train_loss, y_test_loss
    else:
        clf.fit(dtrain[0],dtrain[1])
        y_train_pred = clf.predict_proba(dtrain[0])
        y_train_loss = log_loss(dtrain[1], y_train_pred)
        return clf, y_train_loss



In [6]:
gb_clf = GradientBoostingClassifier()
params = {'learning_rate': 0.02,
          'n_estimators': 10000,
          'subsample': 0.7,
          'max_depth': 5,
          'random_state': 36683,
          'verbose': 1
         }
gb_clf.set_params(**params)
train_cv(gb_clf, 'gb',split_data(), preprocess='no_preprocess')


Running Fold 1 / 5
      Iter       Train Loss      OOB Improve   Remaining Time 
         1       13611.3189          23.1097            8.79m
         2       13550.2589          21.5501            8.52m
         3       13522.1604          19.6519            8.43m
         4       13471.6136          19.6575            9.33m
         5       13427.4461          18.5323            9.09m
         6       13487.4414          16.1698            8.94m
         7       13361.7994          16.2581            8.86m
         8       13276.5114          15.2057            8.75m
         9       13241.8961          14.6790            9.05m
        10       13309.2943          13.7209            8.98m
        20       12865.4916           8.9130            8.72m
        30       12797.6860           5.1751            9.43m
        40       12652.9192           3.7426            9.27m
        50       12527.9846           2.2455            9.17m
        60       12481.5589           1.4714     

        20       12927.7901           8.5135            8.69m
        30       12747.1858           5.4830            8.70m
        40       12642.0248           3.3442            8.62m
        50       12551.6003           2.2713            8.55m
        60       12511.8724           1.2297            8.48m
        70       12411.3023           1.0342            8.41m
        80       12453.1428           0.5845            8.36m
        90       12382.6727           0.3765            8.32m
       100       12346.1899           0.0296            8.27m
       200       12167.3438          -0.1436            7.87m
       300       12034.5779          -0.4530            7.95m
       400       12005.9184          -0.3604            7.69m
       500       11929.8609          -0.3080            7.50m
       600       11846.8161          -0.3698            7.38m
       700       11846.2198          -0.3804            7.29m
       800       11732.5062          -0.5321            7.18m
       9

(GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.02, loss='deviance', max_depth=5,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=10000,
                            n_iter_no_change=None, presort='auto',
                            random_state=36683, subsample=0.7, tol=0.0001,
                            validation_fraction=0.1, verbose=1,
                            warm_start=False),
 [[0.6298458045002828,
   0.7800941202854448,
   0.3794254654982827,
   0.6838940354298716],
  [0.6311265644623326,
   0.7673263807715758,
   0.3733686973640262,
   0.6810793237971391],
  [0.6284543967895811,
   0.8019302014145712,
   0.38757689609610213,
   0.6781534460338101],
  [0.6275106673162203,
   0.7

In [6]:
gb_clf = GradientBoostingClassifier()
params = {'learning_rate': 0.02,
          'n_estimators': 1000,
          'subsample': 0.7,
          'max_depth': 5,
          'random_state': 36683,
          'verbose': 1
         }
gb_clf.set_params(**params)
train_cv(gb_clf, 'gb',split_data(), preprocess='no_preprocess')


Running Fold 1 / 5
      Iter       Train Loss      OOB Improve   Remaining Time 
         1       13564.7505          21.9900            1.13m
         2       13504.5402          21.2265            1.04m
         3       13377.6188          19.7885            1.08m
         4       13401.6395          19.4041            1.07m
         5       13387.5332          17.5449            1.02m
         6       13351.1424          16.2451           59.13s
         7       13215.9076          15.8187           57.91s
         8       13225.5754          15.8074           59.00s
         9       13248.1530          14.9355           58.01s
        10       13092.9963          14.2717           57.02s
        20       12889.0398           8.1924           53.40s
        30       12734.7514           5.1857           53.18s
        40       12571.1657           2.9650           52.02s
        50       12423.4573           2.4009           51.10s
        60       12541.6083           1.3878     

         8       13178.2271          15.3701           52.04s
         9       13259.4540          14.0742           53.20s
        10       13218.8273          12.9230           52.78s
        20       12902.7278           8.8442           51.01s
        30       12678.3394           5.3381           50.55s
        40       12584.1329           3.2789           49.58s
        50       12494.7635           2.2431           48.73s
        60       12362.5756           1.5135           47.92s
        70       12299.0336           1.0706           47.17s
        80       12355.2756           0.9342           46.75s
        90       12249.6744           0.2201           46.15s
       100       12277.9252           0.1382           45.51s
       200       12058.2088          -0.4595           40.62s
       300       12057.0749          -0.1486           34.24s
       400       12045.6105          -0.3434           28.79s
       500       11840.7372          -0.3389           23.71s
       6

(GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.02, loss='deviance', max_depth=5,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=1000,
                            n_iter_no_change=None, presort='auto',
                            random_state=36683, subsample=0.7, tol=0.0001,
                            validation_fraction=0.1, verbose=1,
                            warm_start=False),
 [[0.6719042190346873,
   0.722855029270888,
   0.3485588682695016,
   0.6864943929790346],
  [0.6704570671550301,
   0.7239355797787529,
   0.3561916920726081,
   0.6918075422626788],
  [0.6733620623927397,
   0.7193758871905407,
   0.35255309417823316,
   0.6919200130060152],
  [0.6712196877973301,
   0.725