In [9]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn import preprocessing

In [10]:
#Root Mean Squared Log Error
#Ensure all positive values in input arrays
def rmsle(truth, predicted):
    """Return the root mean squared logarithmic error of a set of predictions.

    Both arguments are passed through ``np.log1p``; the result is the square
    root of the mean squared difference between the two transformed inputs.
    """
    log_gap = np.log1p(predicted) - np.log1p(truth)
    return np.sqrt(np.mean(np.square(log_gap)))

In [34]:
#add features/preprocessing
def prepare(data_orig, pca = None):
    """Build the model's feature table from a frame of raw numeric columns.

    The result holds a PCA projection of ``data_orig`` (columns labelled
    0, 1, ...) followed by per-row summary statistics of the raw values:
    ``mean``, ``std``, ``min``, ``max``, ``number_of_different`` and
    ``non_zero_count``.

    Parameters
    ----------
    data_orig : pandas.DataFrame
        Raw feature columns, one sample per row.
    pca : object with a ``transform`` method, optional
        An already-fitted projection to reuse, e.g. the PCA fitted on the
        training data when preparing test data, so that both sets share the
        same component basis. When omitted, a new 5-component PCA is fitted
        on ``data_orig`` itself.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data_orig``, carrying the same index.
    """

    #PCA dimensionality reduction
    if pca is None:
        components = PCA(n_components = 5).fit_transform(data_orig)
    else:
        components = pca.transform(data_orig)

    # Reuse the input's index: the column assignments below align on index
    # labels, so a default 0..n-1 index here would leave the statistics NaN
    # or attached to the wrong rows whenever data_orig is indexed differently.
    data = pd.DataFrame(components, index = data_orig.index)

    #feature engineering
    data['mean'] = data_orig.mean(axis=1)
    data['std'] = data_orig.std(axis=1)
    data['min'] = data_orig.min(axis=1)
    data['max'] = data_orig.max(axis=1)

    # Number of different values in a row.
    data['number_of_different'] = data_orig.nunique(axis=1)

    # Number of non zero values (e.g. transaction count); missing values count as zero.
    data['non_zero_count'] = data_orig.fillna(0).astype(bool).sum(axis=1)

    return data

In [30]:
#load data
train_df = pd.read_csv('../input/train.csv')

# pop() removes each column from train_df in place: the label first, then the
# row identifier, so that only the feature columns are left in the frame.
y_train, iiii = train_df.pop('target'), train_df.pop('ID')

In [35]:
#remove columns only with zeros
# A column is kept when at least one of its rows differs from zero.
has_nonzero = train_df.ne(0).any(axis = 'index')
train_df = train_df.loc[:, has_nonzero]

# Engineered features as a plain NumPy array for the estimator.
X_train = prepare(train_df).values

In [36]:
#create, fit, and evaluate model
# random_state pins the forest's random sampling so reruns give the same scores.
forest = RandomForestRegressor(n_estimators = 100, random_state = 0)

# The scorer returns negated mean squared log error per fold; sqrt(-score)
# converts each fold's value into an RMSLE.
scores = np.sqrt(-cross_val_score(forest, X_train, y_train, cv = 5,
                                  scoring = 'neg_mean_squared_log_error'))

# cross_val_score fits clones of the estimator, leaving `forest` itself
# unfitted. Fit it on the full training set so forest.predict() is usable.
forest.fit(X_train, y_train);

In [37]:
# Mean RMSLE across the folds first, then the individual fold values.
print(scores.mean(), scores, sep = '\n')

1.668862510038783
[1.59516393 1.69817128 1.58809503 1.69981058 1.76307173]


In [None]:
# Load the test set and split off the row identifiers.
test_df = pd.read_csv('../input/test.csv')
ids = test_df.pop('ID')

# Build the features from the same columns the model was trained on: the
# all-zero columns were dropped from train_df, so the per-row statistics
# (mean, std, counts, ...) must not include them here either.
# NOTE(review): called like this, prepare() fits a new PCA on the frame it is
# given, so the component columns come from a projection fitted on the test
# rows rather than the one used for training -- worth revisiting.
kTest = prepare(test_df[train_df.columns]).values
output = forest.predict(kTest)

# Predict before opening the file so that a failure above cannot leave a
# header-only submission behind.
with open("../predictions/predictions.csv", "w", newline = '') as writeCSV:
    writer = csv.writer(writeCSV)
    writer.writerow(["ID","target"])
    # Pair the values positionally; ids[i] would be a label lookup on the
    # Series index rather than "the i-th id".
    writer.writerows(zip(ids, output))