In [29]:
#James Chartouni
import pandas as pd
import numpy as np
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler, normalize, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, DBSCAN, Birch, SpectralClustering
from sklearn.cluster import AgglomerativeClustering

import lightgbm as lgb
import xgboost as xgb

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot

from helpers import *

In [30]:

def lgbm_auto(data, y_train, test_data):
      
    train_X, val_X, train_y, val_y = train_test_split(data, y_train, test_size = 0.2, random_state = 42)
    
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42,
        "n_threads":-1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=50, 
                      evals_result=evals_result)
    
    pred_test_y = np.expm1(model.predict(test_data, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

In [31]:
train = pd.read_csv("data/train_cleaned.csv")
train_orig = pd.read_csv("data/train.csv")
y_train = np.log1p(train_orig["target"].values)
X_train = train
#X_train = train.drop(["target", "ID"], axis=1)

print ("Rows: " + str(train.shape[0]) + ", Columns: " + str(train.shape[1]))
print(train.head())

test = pd.read_csv('data/test_cleaned.csv')
#X_test = test.drop(["ID"], axis=1)
print ("Rows: " + str(test.shape[0]) + ", Columns: " + str(test.shape[1]))
X_test = test


#consolidate data before clustering 


Rows: 4459, Columns: 1416
   SumValues  SumZeros  20aa07010  dc5a8f1d8  4681de4fd  0d866c3d7  bd8f989f1  \
0        103      4628        0.0        0.0          0        0.0        0.0   
1         68      4663  2200000.0        0.0          0        0.0        0.0   
2         19      4712        0.0        0.0          0        0.0        0.0   
3         23      4708        0.0        0.0          0        0.0        0.0   
4         27      4704  2000000.0        0.0          0        0.0        0.0   

   22ed6dba3  92b13ebba  233c7c17c      ...                Mean         Max  \
0        0.0        0.0        0.0      ...       152382.311484  40000000.0   
1        0.0        0.0        0.0      ...       112466.525725  50000000.0   
2        0.0        0.0        0.0      ...        16109.936575  12000000.0   
3        0.0        0.0        0.0      ...         7059.337560   6000000.0   
4        0.0        0.0        0.0      ...        37778.858351  37662000.0   

            

In [32]:
print("baseline")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

baseline
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.5913
[100]	valid_0's rmse: 1.51809
[150]	valid_0's rmse: 1.46916
[200]	valid_0's rmse: 1.43673
[250]	valid_0's rmse: 1.41455
[300]	valid_0's rmse: 1.39998
[350]	valid_0's rmse: 1.38966
[400]	valid_0's rmse: 1.3824
[450]	valid_0's rmse: 1.37831
[500]	valid_0's rmse: 1.37556
[550]	valid_0's rmse: 1.37307
[600]	valid_0's rmse: 1.37128
[650]	valid_0's rmse: 1.37105
[700]	valid_0's rmse: 1.37119
Early stopping, best iteration is:
[646]	valid_0's rmse: 1.37093


In [33]:
flist = [x for x in X_train.columns if not x in ['ID','target']]

flist_kmeans = []
for ncl in range(2,11):#change this to change how many clusters 
    cls = KMeans(n_clusters=ncl)
    cls.fit_predict(X_train[flist].values)
    print(cls.predict(X_train[flist].values))
    X_train['kmeans_cluster_'+str(ncl)] = cls.predict(X_train[flist].values)
    X_test['kmeans_cluster_'+str(ncl)] = cls.predict(X_test[flist].values)
    flist_kmeans.append('kmeans_cluster_'+str(ncl))
print(flist_kmeans)

[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 4 0 0]
[0 0 0 ..., 5 0 0]
[0 0 0 ..., 4 0 0]
[0 0 0 ..., 6 0 0]
[0 0 0 ..., 5 0 0]
[2 2 2 ..., 0 2 2]
['kmeans_cluster_2', 'kmeans_cluster_3', 'kmeans_cluster_4', 'kmeans_cluster_5', 'kmeans_cluster_6', 'kmeans_cluster_7', 'kmeans_cluster_8', 'kmeans_cluster_9', 'kmeans_cluster_10']


In [34]:
#one hot encode clustered varaibles
X_train = label_binarizer(X_train, flist_kmeans)
X_train = X_train.drop(flist_kmeans, axis=1)

X_test = label_binarizer(X_test, flist_kmeans)
X_test = X_test.drop(flist_kmeans, axis=1)

'#one hot encode clustered varaibles\nX_train = label_binarizer(X_train, flist_kmeans)\nX_train = X_train.drop(flist_kmeans, axis=1)\n\nX_test = label_binarizer(X_test, flist_kmeans)\nX_test = X_test.drop(flist_kmeans, axis=1)'

In [35]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 1425)
Test set size: (49342, 1425)


In [36]:
print("w/ kmeans") #1.3707
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ kmeans
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.59022
[100]	valid_0's rmse: 1.51873
[150]	valid_0's rmse: 1.46965
[200]	valid_0's rmse: 1.43711
[250]	valid_0's rmse: 1.41457
[300]	valid_0's rmse: 1.39971
[350]	valid_0's rmse: 1.3904
[400]	valid_0's rmse: 1.38259
[450]	valid_0's rmse: 1.3788
[500]	valid_0's rmse: 1.37594
[550]	valid_0's rmse: 1.37471
[600]	valid_0's rmse: 1.37359
[650]	valid_0's rmse: 1.37281
[700]	valid_0's rmse: 1.37193
[750]	valid_0's rmse: 1.37198
[800]	valid_0's rmse: 1.37137
[850]	valid_0's rmse: 1.37115
[900]	valid_0's rmse: 1.37153
[950]	valid_0's rmse: 1.3717
Early stopping, best iteration is:
[850]	valid_0's rmse: 1.37115


In [16]:
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50).fit(X_train[flist].values)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

X_train['AfinityProp'] = af.predict(X_train[flist].values)
X_test['AfinityProp'] = af.predict(X_test[flist].values)

print('Estimated number of clusters: %d' % n_clusters_)

Estimated number of clusters: 4457


In [17]:
print("w/ affinity ")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ affinity 
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.5898
[100]	valid_0's rmse: 1.51796
[150]	valid_0's rmse: 1.46856
[200]	valid_0's rmse: 1.43597
[250]	valid_0's rmse: 1.41387
[300]	valid_0's rmse: 1.39813
[350]	valid_0's rmse: 1.38673
[400]	valid_0's rmse: 1.37921
[450]	valid_0's rmse: 1.37513
[500]	valid_0's rmse: 1.37202
[550]	valid_0's rmse: 1.36955
[600]	valid_0's rmse: 1.36867
[650]	valid_0's rmse: 1.36804
[700]	valid_0's rmse: 1.36803
[750]	valid_0's rmse: 1.36777
[800]	valid_0's rmse: 1.3676
[850]	valid_0's rmse: 1.36803
Early stopping, best iteration is:
[786]	valid_0's rmse: 1.36748


In [18]:
#means-shift
from sklearn import cluster

bandwidth = cluster.estimate_bandwidth(X_train, quantile=.3, n_jobs=-1) #adjust here 
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)

print(ms.fit_predict(X_train[flist].values))
X_train['meanshift'] = ms.predict(X_train[flist].values)
X_test['meanshift'] = ms.predict(X_test[flist].values)

labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)



[0 0 0 ..., 0 0 0]
number of estimated clusters : 8


In [19]:
print("w/ meanshift")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ meanshift
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.59129
[100]	valid_0's rmse: 1.51934
[150]	valid_0's rmse: 1.47086
[200]	valid_0's rmse: 1.43767
[250]	valid_0's rmse: 1.41486
[300]	valid_0's rmse: 1.39931
[350]	valid_0's rmse: 1.38907
[400]	valid_0's rmse: 1.38155
[450]	valid_0's rmse: 1.37615
[500]	valid_0's rmse: 1.37274
[550]	valid_0's rmse: 1.37022
[600]	valid_0's rmse: 1.36968
[650]	valid_0's rmse: 1.36899
[700]	valid_0's rmse: 1.36915
[750]	valid_0's rmse: 1.36879
[800]	valid_0's rmse: 1.36855
[850]	valid_0's rmse: 1.36823
[900]	valid_0's rmse: 1.36869
Early stopping, best iteration is:
[820]	valid_0's rmse: 1.36808


In [20]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 1471)
Test set size: (49342, 1471)


In [21]:
#one hot encode clustered varaibles
categorical_to_binarizer = ["meanshift"] #probably exclude affinity prop
X_train = label_binarizer(X_train, categorical_to_binarizer)
X_train = X_train.drop(categorical_to_binarizer, axis=1)

X_test = label_binarizer(X_test, categorical_to_binarizer)
X_test = X_test.drop(categorical_to_binarizer, axis=1)

In [22]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 1478)
Test set size: (49342, 1478)


In [23]:
print("w/ one hot encode")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ one hot encode
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.59466
[100]	valid_0's rmse: 1.52269
[150]	valid_0's rmse: 1.47546
[200]	valid_0's rmse: 1.44197
[250]	valid_0's rmse: 1.41836
[300]	valid_0's rmse: 1.40325
[350]	valid_0's rmse: 1.39365
[400]	valid_0's rmse: 1.38653
[450]	valid_0's rmse: 1.38209
[500]	valid_0's rmse: 1.37968
[550]	valid_0's rmse: 1.3772
[600]	valid_0's rmse: 1.37613
[650]	valid_0's rmse: 1.37552
[700]	valid_0's rmse: 1.37445
[750]	valid_0's rmse: 1.37422
[800]	valid_0's rmse: 1.37424
[850]	valid_0's rmse: 1.37432
Early stopping, best iteration is:
[770]	valid_0's rmse: 1.37399


In [24]:
#Birch
birch = Birch(n_clusters=10)
birch.fit_predict(X_train[flist].values)
X_train['birch'] = birch.predict(X_train[flist].values)
X_test['birch'] = birch.predict(X_test[flist].values)

#one hot encode clustered varaibles
categorical_to_binarizer = ["birch"] #probably exclude affinity prop
X_train = label_binarizer(X_train, categorical_to_binarizer)
X_train = X_train.drop(categorical_to_binarizer, axis=1)

X_test = label_binarizer(X_test, categorical_to_binarizer)
X_test = X_test.drop(categorical_to_binarizer, axis=1)

In [25]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 1488)
Test set size: (49342, 1488)


In [26]:
print("w/ birch")
pred_test_lgbm, model_lgbm, evals_result_lgbm = lgbm_auto(X_train, y_train, X_test)

w/ birch
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 1.59269
[100]	valid_0's rmse: 1.52151
[150]	valid_0's rmse: 1.47268
[200]	valid_0's rmse: 1.43928
[250]	valid_0's rmse: 1.41583
[300]	valid_0's rmse: 1.39998
[350]	valid_0's rmse: 1.3899
[400]	valid_0's rmse: 1.38237
[450]	valid_0's rmse: 1.37808
[500]	valid_0's rmse: 1.37522
[550]	valid_0's rmse: 1.37224
[600]	valid_0's rmse: 1.37111
[650]	valid_0's rmse: 1.37023
[700]	valid_0's rmse: 1.37022
[750]	valid_0's rmse: 1.36988
[800]	valid_0's rmse: 1.36986
[850]	valid_0's rmse: 1.36979
Early stopping, best iteration is:
[763]	valid_0's rmse: 1.36949


In [28]:
#LGBM predictions
sub = pd.read_csv('data/sample_submission.csv')

sub_lgb = pd.DataFrame()
sub_lgb["target"] = pred_test_lgbm

sub["target"] = sub_lgb["target"] 

print(sub.head())
sub.to_csv('submit/sub_lgb_cluster.csv', index=False)

          ID        target
0  000137c73  2.604974e+06
1  00021489f  2.177407e+06
2  0004d7953  2.515046e+06
3  00056a333  4.243339e+06
4  00056d8eb  1.630270e+06


In [28]:
X_train.to_csv("data/train_clustered.csv", index=False)
X_test.to_csv('data/test_clustered.csv', index=False)