In [46]:
#James Chartouni
#code edited/inspired https://www.kaggle.com/samratp/aggregates-sumvalues-sumzeros-k-means-pca
#code edited/inspired https://www.kaggle.com/samratp/aggregates-sumvalues-sumzeros-k-means-pca
import pandas as pd
import numpy as np
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import RobustScaler, StandardScaler, normalize, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import TruncatedSVD, PCA, SparsePCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, DBSCAN, Birch, SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.linear_model import Lasso, Ridge, ElasticNet


import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot

from helpers import *
from scipy.stats import boxcox

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

In [23]:
train = pd.read_csv("data/train_cleaned.csv")
train_orig = pd.read_csv("data/train.csv")
y = np.log1p(train_orig["target"].values)
X = train

print ("Rows: " + str(train.shape[0]) + ", Columns: " + str(train.shape[1]))
#print(train.head())

test = pd.read_csv('data/test_cleaned.csv')
#test_X = test.drop(["ID"], axis=1)
print ("Rows: " + str(test.shape[0]) + ", Columns: " + str(test.shape[1]))

#print(test.head())

Rows: 4459, Columns: 1420
Rows: 49342, Columns: 1420


In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [42]:
import math 

def rmsle_rf(estimator, X, y0):
    #print(estimator.best_params_)
    y = estimator.predict(X)
    if len(y[y<=-1]) != 0:
        y[y<=-1] = 0.0
    #print("here",y[y<=-1],len(y[y<=-1]))
    assert len(y) == len(y0)
    r = np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))
    if math.isnan(r):
        print("this is a nan")
        print(scipy.stats.describe(y))
        plt.hist(y, bins=10, color='blue')
        plt.savefig("nan_y.png")
        
    return r

In [43]:
def objective(params):
    params = {'n_estimators': int(params['n_estimators']), 'max_depth': int(params['max_depth'])}
    clf = RandomForestRegressor(n_jobs=4, **params)
    score = cross_val_score(clf, X_train, y_train, scoring=rmsle_rf, cv=KFold()).mean()
    print("Score {:.3f} params {}".format(score, params))
    return score

space = {
    'n_estimators': hp.quniform('n_estimators', 25, 500, 25),
    'max_depth': hp.quniform('max_depth', 1, 10, 1)
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=300)

Gini 0.106 params {'n_estimators': 50, 'max_depth': 2}
Gini 0.094 params {'n_estimators': 225, 'max_depth': 7}
Gini 0.094 params {'n_estimators': 350, 'max_depth': 8}
Gini 0.097 params {'n_estimators': 50, 'max_depth': 4}
Gini 0.111 params {'n_estimators': 475, 'max_depth': 1}
Gini 0.097 params {'n_estimators': 375, 'max_depth': 4}
Gini 0.093 params {'n_estimators': 125, 'max_depth': 9}
Gini 0.094 params {'n_estimators': 175, 'max_depth': 8}
Gini 0.100 params {'n_estimators': 275, 'max_depth': 3}


KeyboardInterrupt: 

In [52]:
def objective(params):
    lgb_params ={'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 'metric': {'rmse'},
                 'num_leaves': space['num_leaves'], 'learning_rate': space['learning_rate'], 
                 'max_bin': space['max_bin'], "learning_rate": space['learning_rate'],
                 'max_depth': space['max_depth'], 'min_child_samples':space['min_child_samples'], 
                 'subsample': space['subsample'], "n_estimators": space['n_estimators'],
                 'colsample_bytree': space['colsample_bytree'], 'n_threads':-1, 'verbose': 0}
    
    clf = lgbm.LGBMRegressor(
        **params
    )
    
    score = cross_val_score(clf, X, Y, scoring=rmsle_rf, cv=StratifiedKFold()).mean()
    print("Score {:.3f} params {}".format(score, params))
    return score

space = {
    
    'num_leaves': hp.quniform('num_leaves', 8, 128, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'learning_rate' : hp.uniform('learning_Rate', .01, .005, .001),
    #'n_estimators' : hp.uniform('n_estimators', 300, 500 ),
    #'subsample': hp.uniform('subsample', 0.5, 1.0), 
    #'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1.0),
    #'max_bin': hp.quniform('max_bin', 300, 500, 1),
    #'num_boost_round': hp.quniform('num_boost_round', 20, 1000, 1),
    #'max_depth': hp.quniform('max_depth', 3, 10, 1),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)

TypeError: ap_uniform_sampler() got multiple values for argument 'size'

In [None]:
COMPONENTS = 10

# List of decomposition methods to use
methods = [
    TruncatedSVD(n_components=COMPONENTS),
    PCA(n_components=COMPONENTS),
    FastICA(n_components=COMPONENTS),
    GaussianRandomProjection(n_components=COMPONENTS, eps=0.1),
    SparseRandomProjection(n_components=COMPONENTS, dense_output=True)    
]

# Run all the methods
embeddings = []
for method in methods:
    name = method.__class__.__name__    
    embeddings.append(
        pd.DataFrame(method.fit_transform(total_df), columns=[f"{name}_{i}" for i in range(COMPONENTS)])
    )
    print(f">> Ran {name}")
    
# Put all components into one dataframe
components_df = pd.concat(embeddings, axis=1)

ModuleNotFoundError: No module named 'hpsklearn'