In [1]:
import pandas as pd
import numpy as np
import csv
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn import preprocessing

In [2]:
#Root Mean Squared Log Error
#Ensure all positive values in input arrays
def rmsle(truth, predicted):
    return np.sqrt(np.mean((predicted - truth) ** 2))

In [12]:
#load data
train_df = pd.read_csv('../input/train.csv', index_col = "ID")
y_train = np.log1p(train_df.pop('target'))

test_df = pd.read_csv('../input/test.csv', index_col = "ID")
test_ID = test_df.index.tolist()

In [11]:
# GLOBAL VARS
USE_PCA = False
NUM_OF_FEATURES = 1000 # number of features to keep from RF
NUM_OF_DECIMALS = 4 # number of decimal places to keep

# Preprocessing

In [17]:
#remove columns with only one value
colsToRemove = train_df.columns[train_df.nunique() == 1]
train_df.drop(colsToRemove.values, axis = 1, inplace = True)
test_df.drop(colsToRemove.values, axis = 1, inplace = True)

#round to NUM_OF_DECIMALS
train_df = train_df.round(NUM_OF_DECIMALS)
test_df = test_df.round(NUM_OF_DECIMALS)

In [18]:
#PCA dimensionality reduction
if USE_PCA:
    pca = PCA(n_components = 1000)
    train_df = pd.DataFrame(pca.fit_transform(train_df))
    test_df = pd.DataFrame(pca.transform(test_df))

#use RF to select the NUM_OF_FEATURES most important features for predicting target
from sklearn import model_selection
from sklearn import ensemble
NUM_OF_FEATURES = 1000
x1, x2, y1, y2 = model_selection.train_test_split(
    train_df, y_train.values, test_size=0.20, random_state=5)
model = ensemble.RandomForestRegressor(n_jobs=-1)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))
col = pd.DataFrame({'importance': model.feature_importances_, 'feature': train_df.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
X_train = train_df[col]
test_df = test_df[col]
X_train.shape

1.53680625241


(4459, 1000)

In [19]:
#feature engineering
X_train['mean'] = X_train.mean(axis=1)
X_train['std'] = X_train.std(axis=1)
X_train['min'] = X_train.min(axis=1)
X_train['max'] = X_train.max(axis=1)

test_df['mean'] = test_df.mean(axis=1)
test_df['std'] = test_df.std(axis=1)
test_df['min'] = test_df.min(axis=1)
test_df['max'] = test_df.max(axis=1)

# Number of diferent values in a row.
X_train['number_of_different'] = X_train.nunique(axis=1)
test_df['number_of_different'] = test_df.nunique(axis=1)

# Number of non zero values (e.g. transaction count)
X_train['non_zero_count'] = X_train.fillna(0).astype(bool).sum(axis=1)
test_df['non_zero_count'] = test_df.fillna(0).astype(bool).sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

# Modelling

In [20]:
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

#note: X_train and y_train are fed as global variables
NUM_FOLDS = 5 #need tuned
def rmsle_cv(model):
    kf = KFold(NUM_FOLDS, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse = np.sqrt(-cross_val_score(model, X_train.values, y_train, 
                                    scoring = "neg_mean_squared_error", cv = kf))
    return(rmse)

model_xgb = xgb.XGBRegressor(colsample_bytree = 0.054, colsample_bylevel = 0.5, 
                             gamma = 1.45, learning_rate = 0.02, max_depth = 22, 
                             objective = 'reg:linear', booster = 'gbtree',
                             min_child_weight = 57, n_estimators = 1000, reg_alpha = 0, 
                             reg_lambda = 0, eval_metric = 'rmse', subsample = 0.67, 
                             silent = 0, n_jobs = -1, early_stopping_rounds = 14,
                             random_state = 7, nthread = -1)


In [21]:
score = rmsle_cv(model_xgb)

In [22]:
print(score, '\nmean: ', np.mean(score), '\nstd:  ', np.std(score))

[ 1.35241623  1.44002085  1.30074459  1.36320909  1.4553989 ] 
mean:  1.38235793052 
std:   0.0575915212268


In [23]:
%%time
model_xgb.fit(X_train.values, y_train)
output = np.expm1(model_xgb.predict(test_df.values))

submit = pd.DataFrame()
submit['ID'] = test_ID
submit['target'] = output
submit.to_csv('../predictions/predictions.csv', index=False)



Wall time: 12min 30s
