In [2]:
import pandas as pd
import numpy as np
import csv
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn import preprocessing

In [3]:
#Root Mean Squared Log Error
#Ensure all positive values in input arrays
def rmsle(truth, predicted):
    return np.sqrt(np.mean((predicted - truth) ** 2))

In [14]:
#load data
train_df = pd.read_csv('../input/train.csv', index_col = "ID")
y_train = np.log1p(train_df.pop('target'))

test_df = pd.read_csv('../input/test.csv', index_col = "ID")
test_ID = test_df.index.tolist()

In [21]:
# GLOBAL VARS
USE_PCA = False
NUM_OF_FEATURES = 1000 #number of features to keep from RF
USE_SELECT_FEATURES = True

# Preprocessing

In [29]:
#remove columns with only one value
colsToRemove = train_df.columns[train_df.nunique() == 1]
train_df.drop(colsToRemove.values, axis = 1, inplace = True)
test_df.drop(colsToRemove.values, axis = 1, inplace = True)

In [30]:
#PCA dimensionality reduction
if USE_PCA:
    pca = PCA(n_components = 10)
    pca_train = pd.DataFrame(pca.fit_transform(train_df))
    pca_test = pd.DataFrame(pca.transform(test_df))
    train_df.append(pca_train)
    test_df.append(pca_test)

#use RF to select the NUM_OF_FEATURES most important features for predicting target
from sklearn import model_selection
from sklearn import ensemble
if USE_SELECT_FEATURES:
    NUM_OF_FEATURES = 1000
    x1, x2, y1, y2 = model_selection.train_test_split(
        train_df, y_train.values, test_size=0.20, random_state=5)
    model = ensemble.RandomForestRegressor(n_jobs=-1)
    model.fit(x1, y1)
    print(rmsle(y2, model.predict(x2)))
    col = pd.DataFrame({'importance': model.feature_importances_, 'feature': train_df.columns}).sort_values(
        by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
    X_train = train_df[col].copy()
    test = test_df[col].copy()

  result = result.union(other)


1.53712799768


In [34]:
#feature engineering
X_train.loc[:,'mean'] = X_train.mean(axis=1)
X_train.loc[:,'std'] = X_train.std(axis=1)
X_train.loc[:,'min'] = X_train.min(axis=1)
X_train.loc[:,'max'] = X_train.max(axis=1)

test.loc[:,'mean'] = test.mean(axis=1)
test.loc[:,'std'] = test.std(axis=1)
test.loc[:,'min'] = test.min(axis=1)
test.loc[:,'max'] = test.max(axis=1)

# Number of diferent values in a row.
X_train.loc[:,'number_of_different'] = X_train.nunique(axis=1)
test.loc[:,'number_of_different'] = test.nunique(axis=1)

# Number of non zero values (e.g. transaction count)
X_train.loc[:,'non_zero_count'] = X_train.fillna(0).astype(bool).sum(axis=1)
test.loc[:,'non_zero_count'] = test.fillna(0).astype(bool).sum(axis=1)

# Modelling

In [31]:
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

#note: X_train and y_train are fed as global variables
NUM_FOLDS = 5 #need tuned
def rmsle_cv(model):
    kf = KFold(NUM_FOLDS, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse = np.sqrt(-cross_val_score(model, X_train.values, y_train, 
                                    scoring = "neg_mean_squared_error", cv = kf))
    return(rmse)

model_xgb = xgb.XGBRegressor(colsample_bytree = 0.054, colsample_bylevel = 0.5, 
                             gamma = 1.45, learning_rate = 0.01, max_depth = 22, 
                             objective = 'reg:linear', booster = 'gbtree',
                             min_child_weight = 57, n_estimators = 1000, reg_alpha = 0, 
                             reg_lambda = 0, eval_metric = 'rmse', subsample = 0.67, 
                             silent = 0, n_jobs = -1, early_stopping_rounds = 14,
                             random_state = 7, nthread = -1)


In [35]:
score = rmsle_cv(model_xgb)

In [36]:
print("{}\nmean: {}\nstd:  {}".format(score, np.mean(score), np.std(score)))

[ 1.34846883  1.4348952   1.29793963  1.3549723   1.45467435]
mean: 1.3781900629561208
std:  0.05818568009808714


In [28]:
%%time
model_xgb.fit(X_train.values, y_train)
output = np.expm1(model_xgb.predict(test.values))

submit = pd.DataFrame()
submit['ID'] = test_ID
submit['target'] = output
submit.to_csv('../predictions/predictions.csv', index=False)

Wall time: 6min 21s
