In [27]:
import pandas as pd
import numpy as np
import csv
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn import preprocessing

In [2]:
#Root Mean Squared Log Error
def rmsle(truth, predicted):
    """Return the root mean squared logarithmic error of ``predicted``.

    Parameters
    ----------
    truth : array-like
        Observed values. Expected to be non-negative; ``log1p`` is undefined
        for values below -1.
    predicted : array-like
        Model predictions. Negative predictions are clipped to 0 before the
        logarithm is taken.

    Returns
    -------
    float
        ``sqrt(mean((log1p(predicted) - log1p(truth)) ** 2))``.
    """
    truth = np.asarray(truth)
    # Clip rather than take the absolute value: with abs(), a prediction of
    # -x would be scored exactly like a correct prediction of +x.
    predicted = np.clip(np.asarray(predicted), 0, None)
    return np.sqrt(np.mean((np.log1p(predicted) - np.log1p(truth)) ** 2))

In [20]:
#add features/preprocessing
def prepare(data_orig):
    """Summarise every row of ``data_orig`` as a small set of statistics.

    Parameters
    ----------
    data_orig : pandas.DataFrame
        Input table; each statistic is taken across the columns of a row.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with the columns ``mean``, ``std``, ``min``,
        ``max``, ``number_of_different`` and ``non_zero_count``.

    Note: a PCA-based dimensionality reduction was tried here previously and
    is currently disabled.
    """
    # Missing entries are counted as zero, i.e. as "no value present".
    is_nonzero = data_orig.fillna(0).astype(bool)

    row_features = {
        'mean': data_orig.mean(axis=1),
        'std': data_orig.std(axis=1),
        'min': data_orig.min(axis=1),
        'max': data_orig.max(axis=1),
        # Number of different values in a row.
        'number_of_different': data_orig.nunique(axis=1),
        # Number of non zero values (e.g. transaction count)
        'non_zero_count': is_nonzero.sum(axis=1),
    }
    return pd.DataFrame(row_features)

In [18]:
#load data
# Read the training table, then pull the label and the row identifier out of
# it so that train_df is left holding feature columns only.
train_df = pd.read_csv('../input/train.csv')
y_train, iiii = train_df.pop('target'), train_df.pop('ID')

In [21]:
#remove columns only with zeros
# A column is kept as soon as at least one of its entries differs from 0.
keep_column = train_df.ne(0).any(axis=0)
train_df = train_df.loc[:, keep_column]

#add features/preprocessing
# Row-wise summary statistics, converted to a plain array for the models.
X_train = prepare(train_df).values

In [22]:
# Baseline: XGBoost regressor with default hyper-parameters, fitted on the
# first 3000 rows only; verbose=False keeps the fit call quiet.
fit_X, fit_y = X_train[:3000], y_train[:3000]
model = xgb.XGBRegressor()
model.fit(fit_X, fit_y, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [23]:
# Score the XGBoost model on the rows from position 3000 onward, i.e. the
# rows that were not passed to model.fit.
holdout_X, holdout_y = X_train[3000:], y_train[3000:]
predictions = model.predict(holdout_X)
rmsle(holdout_y, predictions)

1.7704018371986483

In [24]:
#create and cross-validate model
# 5-fold cross-validation of a 100-tree random forest. The scorer reports the
# negated mean squared log error, so the sign is flipped before taking the
# square root to obtain one RMSLE value per fold.
forest = RandomForestRegressor(n_estimators=100)
neg_msle = cross_val_score(forest, X_train, y_train,
                           cv=5, scoring='neg_mean_squared_log_error')
scores = np.sqrt(-neg_msle)

In [25]:
# Average RMSLE over the folds first, then the individual fold scores.
print(np.mean(scores), scores, sep='\n')

1.62185737194
[ 1.56127572  1.6685679   1.54884223  1.64102238  1.68957863]


In [None]:
# Build the submission file from the random-forest model.
test_df = pd.read_csv('../input/test.csv')
ids = test_df.pop('ID')

# Restrict the test table to the columns that survived the all-zero filter on
# the training table. prepare() aggregates across the columns of each row, so
# a different column set would shift every feature relative to the training
# features.
kTest = prepare(test_df[train_df.columns]).values

# cross_val_score only fits clones of the estimator, so `forest` itself is
# still unfitted at this point; fit it on the full training set before
# predicting.
forest.fit(X_train, y_train)
output = forest.predict(kTest)

# Open the output file only once the predictions exist, so a failure above
# does not leave a header-only file behind.
with open("../predictions/predictions.csv", "w", newline = '') as writeCSV:
    writer = csv.writer(writeCSV)
    writer.writerow(["ID","target"])
    # Pair identifiers and predictions by position.
    writer.writerows(zip(ids, output))