In [7]:
#James Chartouni
import pandas as pd
import numpy as np
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler, normalize, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, DBSCAN, Birch, SpectralClustering
from sklearn.cluster import AgglomerativeClustering

import lightgbm as lgb
import xgboost as xgb

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot

from helpers import *

In [9]:

def lgbm_auto(data, y_train, test_data):
      
    train_X, val_X, train_y, val_y = train_test_split(data, y_train, test_size = 0.2, random_state = 42)
    
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=50, 
                      evals_result=evals_result)
    
    pred_test_y = np.expm1(model.predict(test_data, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

In [12]:
train = pd.read_csv("data/train_cleaned.csv")
y_train = train['target'] #TRY A LOG TRANSFORM
y_train = np.log1p(train["target"].values)
X_train = train.drop(["target", "ID"], axis=1)

print ("Rows: " + str(train.shape[0]) + ", Columns: " + str(train.shape[1]))
print(train.head())

test = pd.read_csv('data/test_cleaned.csv')
X_test = test.drop(["ID"], axis=1)
print ("Rows: " + str(test.shape[0]) + ", Columns: " + str(test.shape[1]))
test.head()

Rows: 4459, Columns: 4993
          ID      target  48df886f9  0deb4b6a8  34b15f335  a8cb14b00  \
0  000d6aaf2  38000000.0        0.0          0        0.0          0   
1  000fbd867    600000.0        0.0          0        0.0          0   
2  0027d6b71  10000000.0        0.0          0        0.0          0   
3  0028cbf45   2000000.0        0.0          0        0.0          0   
4  002a68644  14400000.0        0.0          0        0.0          0   

   2f0771a37  30347e683  d08d1fbe3  6ee66e115    ...      3ecc09859  \
0          0          0          0          0    ...            0.0   
1          0          0          0          0    ...            0.0   
2          0          0          0          0    ...            0.0   
3          0          0          0          0    ...            0.0   
4          0          0          0          0    ...            0.0   

   9281abeea  8675bec0b  3a13ed79a  f677d4d13  71b203550  137efaa80  \
0        0.0        0.0          0         

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
print("baseline")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

baseline
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.61613
[100]	valid_0's rmse: 1.55793
[150]	valid_0's rmse: 1.51496
[200]	valid_0's rmse: 1.48384
[250]	valid_0's rmse: 1.46215
[300]	valid_0's rmse: 1.44614
[350]	valid_0's rmse: 1.4358
[400]	valid_0's rmse: 1.42738
[450]	valid_0's rmse: 1.42235
[500]	valid_0's rmse: 1.41816
[550]	valid_0's rmse: 1.4155
[600]	valid_0's rmse: 1.4136
[650]	valid_0's rmse: 1.4119
[700]	valid_0's rmse: 1.41163
[750]	valid_0's rmse: 1.4113
[800]	valid_0's rmse: 1.41097
[850]	valid_0's rmse: 1.41103
[900]	valid_0's rmse: 1.4108
[950]	valid_0's rmse: 1.41131
[1000]	valid_0's rmse: 1.41175
Early stopping, best iteration is:
[908]	valid_0's rmse: 1.41069


In [14]:
flist = [x for x in X_train.columns if not x in ['ID','target']]

flist_kmeans = []
for ncl in range(2,11):#change this to change how many clusters 
    cls = KMeans(n_clusters=ncl)
    cls.fit_predict(X_train[flist].values)
    print(cls.predict(X_train[flist].values))
    X_train['kmeans_cluster_'+str(ncl)] = cls.predict(X_train[flist].values)
    X_test['kmeans_cluster_'+str(ncl)] = cls.predict(X_test[flist].values)
    flist_kmeans.append('kmeans_cluster_'+str(ncl))
print(flist_kmeans)

[1 1 1 ..., 1 1 1]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]
[3 3 3 ..., 3 3 3]
['kmeans_cluster_2', 'kmeans_cluster_3', 'kmeans_cluster_4', 'kmeans_cluster_5', 'kmeans_cluster_6', 'kmeans_cluster_7', 'kmeans_cluster_8', 'kmeans_cluster_9', 'kmeans_cluster_10']


In [15]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 5000)
Test set size: (49342, 5000)


In [16]:
print("w/ kmeans")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ kmeans
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.61695
[100]	valid_0's rmse: 1.55722
[150]	valid_0's rmse: 1.51417
[200]	valid_0's rmse: 1.48357
[250]	valid_0's rmse: 1.46176
[300]	valid_0's rmse: 1.44649
[350]	valid_0's rmse: 1.43531
[400]	valid_0's rmse: 1.42694
[450]	valid_0's rmse: 1.42107
[500]	valid_0's rmse: 1.41726
[550]	valid_0's rmse: 1.41401
[600]	valid_0's rmse: 1.41199
[650]	valid_0's rmse: 1.41083
[700]	valid_0's rmse: 1.41006
[750]	valid_0's rmse: 1.41
[800]	valid_0's rmse: 1.41018
Early stopping, best iteration is:
[716]	valid_0's rmse: 1.40961


In [17]:
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50).fit(X_train[flist].values)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

X_train['AfinityProp'] = af.predict(X_train[flist].values)
X_test['AfinityProp'] = af.predict(X_test[flist].values)

print('Estimated number of clusters: %d' % n_clusters_)

Estimated number of clusters: 4458


In [18]:
print("w/ affinity ")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ affinity 
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.61602
[100]	valid_0's rmse: 1.5565
[150]	valid_0's rmse: 1.515
[200]	valid_0's rmse: 1.48294
[250]	valid_0's rmse: 1.4595
[300]	valid_0's rmse: 1.44423
[350]	valid_0's rmse: 1.43382
[400]	valid_0's rmse: 1.42543
[450]	valid_0's rmse: 1.42003
[500]	valid_0's rmse: 1.41528
[550]	valid_0's rmse: 1.41248
[600]	valid_0's rmse: 1.41022
[650]	valid_0's rmse: 1.40916
[700]	valid_0's rmse: 1.40913
[750]	valid_0's rmse: 1.40929
Early stopping, best iteration is:
[675]	valid_0's rmse: 1.40888


In [19]:
#means-shift
from sklearn import cluster

bandwidth = cluster.estimate_bandwidth(X_train, quantile=.3, n_jobs=-1) #adjust here 
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)

print(ms.fit_predict(X_train[flist].values))
X_train['meanshift'] = ms.predict(X_train[flist].values)
X_test['meanshift'] = ms.predict(X_test[flist].values)

labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)



[0 0 0 ..., 0 0 0]
number of estimated clusters : 199


In [20]:
print("w/ meanshift")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ meanshift
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.61571
[100]	valid_0's rmse: 1.55643
[150]	valid_0's rmse: 1.51311
[200]	valid_0's rmse: 1.48201
[250]	valid_0's rmse: 1.45949
[300]	valid_0's rmse: 1.44373
[350]	valid_0's rmse: 1.43285
[400]	valid_0's rmse: 1.42485
[450]	valid_0's rmse: 1.42008
[500]	valid_0's rmse: 1.4164
[550]	valid_0's rmse: 1.414
[600]	valid_0's rmse: 1.41177
[650]	valid_0's rmse: 1.4105
[700]	valid_0's rmse: 1.40989
[750]	valid_0's rmse: 1.41002
[800]	valid_0's rmse: 1.41012
Early stopping, best iteration is:
[710]	valid_0's rmse: 1.40978


In [21]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 5002)
Test set size: (49342, 5002)


In [22]:
#one hot encode clustered varaibles
categorical_to_binarizer = ["meanshift"] #probably exclude affinity prop
X_train = label_binarizer(X_train, categorical_to_binarizer)
X_train = X_train.drop(categorical_to_binarizer, axis=1)

X_test = label_binarizer(X_test, categorical_to_binarizer)
X_test = X_test.drop(categorical_to_binarizer, axis=1)

In [23]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 5200)
Test set size: (49342, 5151)


In [24]:
print("w/ one hot encode")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ one hot encode
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.61565
[100]	valid_0's rmse: 1.55841
[150]	valid_0's rmse: 1.51603
[200]	valid_0's rmse: 1.4843
[250]	valid_0's rmse: 1.46176
[300]	valid_0's rmse: 1.44537
[350]	valid_0's rmse: 1.43439
[400]	valid_0's rmse: 1.42656
[450]	valid_0's rmse: 1.42077
[500]	valid_0's rmse: 1.41689
[550]	valid_0's rmse: 1.41411
[600]	valid_0's rmse: 1.41206
[650]	valid_0's rmse: 1.41118
[700]	valid_0's rmse: 1.41083
[750]	valid_0's rmse: 1.41024
[800]	valid_0's rmse: 1.41029
[850]	valid_0's rmse: 1.41047
Early stopping, best iteration is:
[788]	valid_0's rmse: 1.41014


In [25]:
#Birch
birch = Birch(n_clusters=10)
birch.fit_predict(X_train[flist].values)
X_train['birch'] = birch.predict(X_train[flist].values)
X_test['birch'] = birch.predict(X_test[flist].values)

#one hot encode clustered varaibles
categorical_to_binarizer = ["birch"] #probably exclude affinity prop
X_train = label_binarizer(X_train, categorical_to_binarizer)
X_train = X_train.drop(categorical_to_binarizer, axis=1)

X_test = label_binarizer(X_test, categorical_to_binarizer)
X_test = X_test.drop(categorical_to_binarizer, axis=1)

In [26]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 5210)
Test set size: (49342, 5152)


In [27]:
print("w/ birch")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ birch
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.61565
[100]	valid_0's rmse: 1.55841
[150]	valid_0's rmse: 1.51603
[200]	valid_0's rmse: 1.4843
[250]	valid_0's rmse: 1.46176
[300]	valid_0's rmse: 1.44537
[350]	valid_0's rmse: 1.43439
[400]	valid_0's rmse: 1.42656
[450]	valid_0's rmse: 1.42077
[500]	valid_0's rmse: 1.41689
[550]	valid_0's rmse: 1.41411
[600]	valid_0's rmse: 1.41206
[650]	valid_0's rmse: 1.41118
[700]	valid_0's rmse: 1.41083
[750]	valid_0's rmse: 1.41024
[800]	valid_0's rmse: 1.41029
[850]	valid_0's rmse: 1.41047
Early stopping, best iteration is:
[788]	valid_0's rmse: 1.41014


In [28]:
X_train.to_csv("data/train_clustered.csv", index=False)
X_test.to_csv('data/test_clustered.csv', index=False)