In [1]:
import pandas as pd
import numpy as np
import csv
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn import preprocessing

In [8]:
#Root Mean Squared Log Error
#Ensure all positive values in input arrays
def rmsle(truth, predicted):
    return np.sqrt(np.mean((predicted - truth) ** 2))

In [67]:
#load data
train_df = pd.read_csv('../input/train.csv', index_col = "ID")
y_train = np.log1p(train_df.pop('target'))

test_df = pd.read_csv('../input/test.csv', index_col = "ID")
test_ID = test_df.index.tolist()

# Preprocessing

In [68]:
#remove columns with only one value
colsToRemove = train_df.columns[train_df.nunique() == 1]
train_df.drop(colsToRemove.values, axis = 1, inplace = True)
test_df.drop(colsToRemove.values, axis = 1, inplace = True)

#round to NUM_OF_DECIMALS
NUM_OF_DECIMALS = 5
train_df = train_df.round(NUM_OF_DECIMALS)
test_df = test_df.round(NUM_OF_DECIMALS)

In [16]:
#remove duplicate columns
colsToRemove = []
columns = train_df.columns
for i in range(len(columns)-1):
    v = train_df[columns[i]].values
    dupCols = []
    for j in range(i + 1,len(columns)):
        if np.array_equal(v, train_df[columns[j]].values):
            colsToRemove.append(columns[j])
train_df.drop(colsToRemove, axis=1, inplace=True) 
test_df.drop(colsToRemove, axis=1, inplace=True) 

In [69]:
#add features/preprocessing
#PCA dimensionality reduction
pca = PCA(n_components = 1000)
X_train = pd.DataFrame(pca.fit_transform(train_df))
test_df = pd.DataFrame(pca.transform(test_df))

#feature engineering
# data['mean'] = data_orig.mean(axis=1)
# data['std'] = data_orig.std(axis=1)
# data['min'] = data_orig.min(axis=1)
# data['max'] = data_orig.max(axis=1)

# # Number of diferent values in a row.
# data['number_of_different'] = data_orig.nunique(axis=1)

# # Number of non zero values (e.g. transaction count)
# data['non_zero_count'] = data_orig.fillna(0).astype(bool).sum(axis=1)

# Modelling

In [34]:
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

#note: X_train and y_train are fed as global variables
NUM_FOLDS = 8 #need tuned
def rmsle_cv(model):
    kf = KFold(NUM_FOLDS, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse = np.sqrt(-cross_val_score(model, X_train.values, y_train, 
                                    scoring = "neg_mean_squared_error", cv = kf))
    return(rmse)

model_xgb = xgb.XGBRegressor(colsample_bytree = 0.054, colsample_bylevel = 0.5, 
                             gamma = 1.45, learning_rate = 0.02, max_depth = 22, 
                             objective = 'reg:linear', booster = 'gbtree',
                             min_child_weight = 57, n_estimators = 1000, reg_alpha = 0, 
                             reg_lambda = 0, eval_metric = 'rmse', subsample = 0.67, 
                             silent = 1, n_jobs = -1, early_stopping_rounds = 14,
                             random_state = 7, nthread = -1, verbose = True)


In [35]:
score = rmsle_cv(model_xgb)

In [39]:
print(score, '\nmean: ', np.mean(score), '\nstd: ', np.std(score))

[ 1.44619318  1.5566009   1.5092827   1.47950204  1.48087826  1.52347618
  1.55837553  1.56091438] 
mean:  1.51440289707 
std:  0.0402812915529


In [70]:
%%time
model_xgb.fit(X_train.values, y_train)
output = np.expm1(model_xgb.predict(test_df.values))

submit = pd.DataFrame()
submit['ID'] = test_ID
submit['target'] = output
submit.to_csv('../predictions/predictions.csv', index=False)

Wall time: 5min 29s
