In [1]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

# Definitions
# Notebook-wide configuration: display options, warning policy and the
# constants shared by the modelling cells below.

# Render floats in pandas output with three decimals.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
# Silences every warning for the rest of the session (this includes library
# deprecation warnings, so API drift will not be visible in the output).
warnings.simplefilter('ignore')
N_JOBS = -1   # passed as n_jobs to the estimators (-1 = all cores in scikit-learn)
SEED = 2017   # base random seed for the K-fold split and the models
FOLDS = 5     # number of cross-validation folds
SHIFT = 5     # only referenced by commented-out scoring code in this notebook

In [2]:
# Get data
# Load the pre-cleaned feature matrices, the target and the test identifiers.
train = pd.read_csv("clean_data/train.csv")
y = pd.read_csv("clean_data/y.csv")
test = pd.read_csv("clean_data/test.csv")
# test_ids.csv is read without a header row, so its single column is named 0.
test_ids = pd.read_csv("clean_data/test_ids.csv", header = None)

# Quick sanity check: shape and first two rows of every frame, in load order.
for frame in (train, y, test, test_ids):
    display(frame.shape)
    display(frame.head(2))


(4209, 367)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,srp_3,srp_4,srp_5,srp_6,srp_7,srp_8,srp_9,srp_10,srp_11,srp_12
0,0,10,21,44,0,3,20,9,14,0,...,126.974,-13.625,1.239,17.342,104.678,-3.716,-2.477,0.0,28.49,1.239
1,5761,14,11,29,5,3,15,9,14,0,...,119.613,-13.625,0.0,17.342,102.271,-2.477,2.477,0.0,13.625,1.239


(4209, 1)

Unnamed: 0,y
0,130.81
1,115.07


(4209, 367)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,srp_3,srp_4,srp_5,srp_6,srp_7,srp_8,srp_9,srp_10,srp_11,srp_12
0,1,50,21,13,5,3,19,0,22,0,...,119.613,0.0,0.0,27.251,96.078,1.239,0.0,-2.477,29.728,2.477
1,2,19,1,33,0,3,1,6,24,0,...,126.974,-11.148,1.239,30.967,124.497,-2.477,-2.477,1.239,1.239,0.0


(4209, 1)

Unnamed: 0,0
0,1
1,2


In [3]:
# Definitions for ML models

def fit_lasso(X_train, y_train, seed = SEED, verbose = False) :
    """Fit a LassoCV in two passes and return the second-pass estimator.

    The first pass cross-validates over a fixed, coarse grid of alphas; the
    second pass cross-validates over a finer grid of multiples (0.7x to 1.3x)
    of the alpha selected by the first pass.

    Parameters
    ----------
    X_train : feature matrix handed to ``LassoCV.fit``.
    y_train : target handed to ``LassoCV.fit``.
    seed : value used as ``random_state`` for both passes.
    verbose : when true, print the selected alpha and the iteration count
        of the second pass.

    Returns
    -------
    The fitted second-pass ``LassoCV`` instance.

    NOTE(review): the ``normalize`` argument was removed from scikit-learn's
    linear models in release 1.2 -- confirm the pinned scikit-learn version.
    """
    # Settings shared by the coarse and the refined search.
    common = dict(
        n_jobs = N_JOBS,
        random_state = seed,
        max_iter = 50000,
        normalize = True)

    # Pass 1: coarse grid.
    coarse = LassoCV(alphas = [0.001, 0.003, 0.006, 0.01, 0.03, 0.06], **common)
    coarse.fit(X_train, y_train)
    best = coarse.alpha_

    # Pass 2: finer grid centred on the alpha picked by pass 1.
    factors = (.7, .75, .8, .85, .9, .95, 1.0, 1.05, 1.1, 1.15, 1.25, 1.3)
    refined = LassoCV(alphas = [best * factor for factor in factors], **common)
    refined.fit(X_train, y_train)

    if verbose :
        print("LASSO Best alpha :", refined.alpha_)
        print("LASSO iter :", refined.n_iter_)

    return refined


def fit_extra_trees(X_train, y_train, seed = SEED, verbose = False) :
    """Fit an ExtraTreesRegressor with fixed hyper-parameters and return it.

    Parameters
    ----------
    X_train : feature matrix handed to ``fit``.
    y_train : target handed to ``fit``.
    seed : value used as ``random_state``.
    verbose : accepted for signature parity with ``fit_lasso``; not used here.

    Returns
    -------
    The fitted ``ExtraTreesRegressor`` instance.

    NOTE(review): ``criterion="mse"`` was renamed to ``"squared_error"`` in
    scikit-learn 1.0 and the old spelling was removed in 1.2 -- confirm the
    pinned scikit-learn version.
    """
    # TODO : better criterion available?
    hyper_params = {
        "n_estimators": 100,
        "criterion": "mse",
        "max_features": 0.9,
        "max_depth": 5,
        "min_samples_split": 15,
        "min_samples_leaf": 2,
        "n_jobs": N_JOBS,
        "random_state": seed,
    }
    # fit() returns the estimator itself, so the call can be chained.
    return ExtraTreesRegressor(**hyper_params).fit(X_train, y_train)


In [4]:
# Fit and predict

def _fit_model(model, X, target, seed) :
    """Fit the estimator designated by the `model` code ("la" or "et")."""
    if model == "la" :
        return fit_lasso(X, target, seed = seed)
    if model == "et" :
        return fit_extra_trees(X, target, seed = seed)
    # Previously an unknown code fell through and surfaced later as a NameError.
    raise ValueError("Unknown model code : " + repr(model) + " (expected 'la' or 'et')")


def fitpredict(model, train, test, y) :
    """Cross-validate a model, then predict the test set in two ways.

    Parameters
    ----------
    model : "la" (LassoCV via ``fit_lasso``) or "et" (extra trees via
        ``fit_extra_trees``).
    train : DataFrame of training features.
    test : DataFrame of test features.
    y : DataFrame holding the target in a column named ``y``, row-aligned
        with ``train``.

    Returns
    -------
    (preds_simple, preds_avg, cv_score)
        preds_simple : test predictions of a model fitted on all of ``train``.
        preds_avg : mean of the test predictions of the FOLDS per-fold models.
        cv_score : r2 score computed over all out-of-fold predictions.

    Raises
    ------
    ValueError : if ``model`` is not one of the supported codes.

    Side effects
    ------------
    Prints per-fold and global scores, and writes the last fold's validation
    features, targets and predictions to ``clean_data/``.
    """
    all_preds = np.zeros(test.shape[0])

    # Do cross-val
    kf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
    # Collected per fold and concatenated once at the end (no concat in the loop).
    oof_preds = []
    oof_targets = []
    for i, (train_index, val_index) in enumerate(kf.split(train)):
        print("---------")
        print("FOLD " + str(i + 1))
        X_train, X_val = train.iloc[train_index], train.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit and predict (each fold gets its own seed)
        predictor = _fit_model(model, X_train, y_train.y, SEED * i)

        # Compute score on this fold
        preds_val = predictor.predict(X_val)
        fold_score = r2_score(y_val.y, preds_val)
        #fold_score = r2_score(np.exp(y_val.y) - SHIFT, np.exp(preds_val) - SHIFT)
        print("Fold r2_score : " + str(fold_score))

        # Store OOF predictions and their targets, in the same row order
        oof_preds.append(np.asarray(preds_val))
        oof_targets.append(np.asarray(y_val.y))

        # Predict on test set having learned just on this X_train
        all_preds += predictor.predict(test)

    # Compute error on OOF predictions
    cv_score = r2_score(np.concatenate(oof_targets), np.concatenate(oof_preds))
    #cv_score = r2_score(np.exp(oof_targets) - SHIFT, np.exp(oof_preds) - SHIFT)
    print("Global OOF r2_score : " + str(cv_score))

    # METHOD 1
    # Compute average of test set predictions
    preds_avg = all_preds / FOLDS

    # METHOD 2
    # Learn and predict on whole sets.
    # The seed equals the last fold's seed, which is what the original code got
    # implicitly from the leftover loop variable; it is now spelled out.
    # The target is passed as the 1-D column `y.y`, like in the fold fits.
    final_seed = SEED * (FOLDS - 1)
    predictor = _fit_model(model, train, y.y, final_seed)
    preds_simple = predictor.predict(test)

    # Save data from the LAST fold's validation set for blending purposes
    # (X_val, y_val and preds_val still hold the final iteration's values).
    X_val.to_csv("clean_data/" + model + "_X_val.csv", index = False)
    y_val.y.to_csv("clean_data/y_val.csv", index = False)
    pd.Series(preds_val).to_csv("clean_data/" + model + "_preds_val.csv", index = False)

    return(preds_simple, preds_avg, cv_score)


<b>Script</b>

In [5]:
# Select the model code understood by fitpredict ("la" or "et"), then run
# cross-validation and produce both flavours of test predictions.
model = "et"
preds_simple, preds_avg, cv_score = fitpredict(model, train, test, y)

---------
FOLD 1
Fold r2_score : 0.578509382665
---------
FOLD 2
Fold r2_score : 0.590783643572
---------
FOLD 3
Fold r2_score : 0.629377221441
---------
FOLD 4
Fold r2_score : 0.628142123004
---------
FOLD 5
Fold r2_score : 0.455449547582
Global OOF r2_score : 0.572060803892


In [6]:
# Save predictions
# One file per prediction flavour, keyed by the test identifiers.
for file_suffix, predictions in (("_simple.csv", preds_simple), ("_avg.csv", preds_avg)):
    submission = pd.DataFrame({"ID": test_ids[0], "y": predictions})
    submission.to_csv("preds/" + model + file_suffix, index = False)
