# Clustering

Imports:

In [1]:
import os
import pandas as pd

import numpy as np
from sklearn import svm
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import IsolationForest

from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import LocalOutlierFactor
from sklearn.manifold import TSNE
import pickle

Directories:

In [11]:
def get_parent_dir(directory: str) -> str:
    """Return the parent directory of ``directory``.

    Thin wrapper around ``os.path.dirname``. It uses the module-level
    ``import os`` from the imports cell instead of re-importing ``os`` on
    every call.

    Parameters
    ----------
    directory : str
        A filesystem path.

    Returns
    -------
    str
        The part of ``directory`` before its final path separator (an empty
        string when the path contains no separator).
    """
    return os.path.dirname(directory)

# All project folders are addressed relative to the parent of the current
# working directory.
current_dirs_parent = get_parent_dir(os.getcwd())

# Each prefix keeps its trailing "/" so file names can be appended directly.
dataraw_dir, preproc_dir, models_b_dir = (
    current_dirs_parent + suffix
    for suffix in ("/01.Data/Raw/", "/01.Data/Preprocessing/", "/03.Models/Set B/")
)

Load preprocessed data:

In [3]:
# Read the preprocessed train and test CSV files from preproc_dir (train first).
train_master_pd, test_master_pd = (
    pd.read_csv(preproc_dir + csv_name)
    for csv_name in ("train_b_preproc1.csv", "test_b_preproc1.csv")
)

In [4]:
# Keep only the training rows whose "y_lead2" value equals 0.
is_y0 = train_master_pd["y_lead2"] == 0
train_master_y0 = train_master_pd[is_y0]

In [5]:
cols = train_master_pd.columns
# Feature columns are the ones whose name begins with "x".
selcols = list(filter(lambda name: name.startswith("x"), cols))

In [6]:
# Training feature matrix: the selected "x*" columns with non-finite values
# neutralised. NaN, +inf AND -inf are all mapped to 0 -- replacing only
# np.inf would leave -inf in place, which StandardScaler rejects.
# The chain builds a new frame, so train_master_pd itself is never mutated.
X_train = (
    train_master_pd[train_master_pd.columns.intersection(selcols)]
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

In [7]:
# Feature matrix for the y_lead2 == 0 training rows: the selected "x*" columns
# with non-finite values neutralised. NaN, +inf AND -inf are all mapped to 0 --
# replacing only np.inf would leave -inf in place, which StandardScaler rejects.
# The chain builds a new frame, so train_master_y0 itself is never mutated.
X_train_y0 = (
    train_master_y0[train_master_y0.columns.intersection(selcols)]
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

In [8]:
# Test feature matrix: the selected "x*" columns with non-finite values
# neutralised. NaN, +inf AND -inf are all mapped to 0 -- replacing only
# np.inf would leave -inf in place, which StandardScaler rejects.
# The chain builds a new frame, so test_master_pd itself is never mutated.
X_test = (
    test_master_pd[test_master_pd.columns.intersection(selcols)]
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

Scaling data

In [9]:
# Learn the scaling statistics from the full training matrix only, then apply
# the same fitted transform to every split.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_sc, X_test_sc, X_train_y0_sc = (
    scaler.transform(frame) for frame in (X_train, X_test, X_train_y0)
)

In [29]:
# Persist the fitted scaler. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open(models_b_dir + "clust_scaler.pickle.dat", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

SVDD (implemented here as a One-Class SVM with an RBF kernel)

In [27]:
# One-class SVM with an RBF kernel, fitted on the scaled training matrix.
# `random_state` is deliberately not passed: OneClassSVM never used it (it was
# deprecated as an ignored argument in scikit-learn 0.20) and later releases
# reject it, so dropping it changes no results and keeps the cell runnable.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_train_sc)

# predict() labels each row +1 (inlier) or -1 (outlier).
y_pred_train = clf.predict(X_train_sc)
y_pred_test = clf.predict(X_test_sc)

# Wrap the label vectors as single-column frames for the later concat.
svdd_test = pd.DataFrame(y_pred_test)
svdd_train = pd.DataFrame(y_pred_train)



In [28]:
# Persist the fitted one-class SVM. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open(models_b_dir + "clust_svdd.pickle.dat", "wb") as svdd_file:
    pickle.dump(clf, svdd_file)

DBSCAN

In [13]:
db = DBSCAN(eps=0.5, min_samples=10, metric="l2")

# Cluster labels for the training rows. `db` is fitted here and is NOT refitted
# below, so the object pickled in the following cell reflects the training fit.
y_pred_train = db.fit_predict(X_train_sc)
dbscan_train = pd.DataFrame(y_pred_train)

# DBSCAN has no predict(), so the test rows are clustered on their own with a
# separate, identically configured estimator. Reusing `db` would silently
# overwrite its training fit. The test labels are the same as before this change.
# NOTE(review): test cluster ids come from an independent fit and are therefore
# not comparable with the training cluster ids -- confirm this is acceptable
# for the downstream features.
y_pred_test = DBSCAN(**db.get_params()).fit_predict(X_test_sc)
dbscan_test = pd.DataFrame(y_pred_test)

In [14]:
# Persist the DBSCAN estimator. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
with open(models_b_dir + "clust_db.pickle.dat", "wb") as db_file:
    pickle.dump(db, db_file)

Isolation Forest

In [15]:
# Isolation forest trained only on the scaled y_lead2 == 0 training rows, then
# used to label every training and test row.
rng = np.random.RandomState(42)
isoforest = IsolationForest(
    # NOTE(review): `behaviour` is only accepted by older scikit-learn releases
    # (it was later removed) -- confirm the pinned version before upgrading.
    behaviour='new',
    max_samples="auto",
    contamination='auto',
    random_state=rng,
)
isoforest.fit(X_train_y0_sc)

y_pred_train, y_pred_test = (
    isoforest.predict(X_train_sc),
    isoforest.predict(X_test_sc),
)

isoforest_train, isoforest_test = (
    pd.DataFrame(y_pred_train),
    pd.DataFrame(y_pred_test),
)

In [16]:
# Persist the fitted isolation forest. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open(models_b_dir + "clust_isoforest.pickle.dat", "wb") as isoforest_file:
    pickle.dump(isoforest, isoforest_file)

PCA

In [17]:
# Five principal components learned from the scaled training matrix; the same
# fitted projection is then applied to train and test.
pca = PCA(n_components=5, svd_solver='full', random_state=42).fit(X_train_sc)

# Share of variance captured by each retained component.
print(pca.explained_variance_ratio_)

train_pca, test_pca = pca.transform(X_train_sc), pca.transform(X_test_sc)
train_pca_df, test_pca_df = pd.DataFrame(train_pca), pd.DataFrame(test_pca)

[0.05953301 0.04691735 0.03207016 0.02935977 0.02123745]


In [18]:
# Persist the fitted PCA model. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
with open(models_b_dir + "clust_pca.pickle.dat", "wb") as pca_file:
    pickle.dump(pca, pca_file)

ICA

In [19]:
# Five independent components estimated from the scaled training matrix; the
# fitted unmixing is then applied to train and test.
# NOTE(review): train_fica_df / test_fica_df are not among the frames
# concatenated in the "Bind Train Frames" / "Bind Test Frames" cells --
# confirm the omission is intended.
fica = FastICA(n_components=5, random_state=42).fit(X_train_sc)

train_fica, test_fica = fica.transform(X_train_sc), fica.transform(X_test_sc)
train_fica_df, test_fica_df = pd.DataFrame(train_fica), pd.DataFrame(test_fica)

In [20]:
# Persist the fitted FastICA model. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open(models_b_dir + "clust_fica.pickle.dat", "wb") as fica_file:
    pickle.dump(fica, fica_file)

SVD

In [21]:
# Rank-5 truncated SVD learned from the scaled training matrix. fit() and
# transform() are kept as separate calls, exactly as in the original workflow.
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42).fit(X_train_sc)

train_svd, test_svd = svd.transform(X_train_sc), svd.transform(X_test_sc)
train_svd_df, test_svd_df = pd.DataFrame(train_svd), pd.DataFrame(test_svd)

In [22]:
# Persist the fitted truncated-SVD model. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
with open(models_b_dir + "clust_svd.pickle.dat", "wb") as svd_file:
    pickle.dump(svd, svd_file)

LOF

In [23]:
# Local Outlier Factor in novelty mode (so predict() can be called on new
# rows), fitted only on the scaled y_lead2 == 0 training rows.
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train_y0_sc)

train_lof, test_lof = lof.predict(X_train_sc), lof.predict(X_test_sc)
train_lof_df, test_lof_df = pd.DataFrame(train_lof), pd.DataFrame(test_lof)



In [24]:
# Persist the fitted LOF model. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
with open(models_b_dir + "clust_lof.pickle.dat", "wb") as lof_file:
    pickle.dump(lof, lof_file)

t-SNE

In [25]:
tsnes = TSNE(n_components=3, random_state=42)

# 3-D embedding of the training rows. `tsnes` is fitted here and is NOT
# refitted below, so the object pickled in the following cell reflects the
# training fit.
train_tsne = tsnes.fit_transform(X_train_sc)

# TSNE has no transform(), so the test rows are embedded on their own with a
# separate, identically configured estimator. Reusing `tsnes` would silently
# overwrite its training fit. The integer seed keeps the test embedding the
# same as before this change.
# NOTE(review): the train and test embeddings come from independent fits, so
# their coordinates do not share a common space -- confirm this is acceptable
# for the downstream features.
test_tsne = TSNE(**tsnes.get_params()).fit_transform(X_test_sc)

train_tsne_df = pd.DataFrame(train_tsne)
test_tsne_df = pd.DataFrame(test_tsne)

In [26]:
# Persist the TSNE estimator. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open(models_b_dir + "clust_tsnes.pickle.dat", "wb") as tsne_file:
    pickle.dump(tsnes, tsne_file)

Bind Train Frames

In [None]:
# (column prefix, frame) pairs, in the order the feature blocks are laid out
# side by side in the final table.
# NOTE(review): the ICA frame (train_fica_df) is not part of this list --
# confirm the omission is intended.
_train_blocks = [
    ("x_svdd_", svdd_train),
    ("x_db_", dbscan_train),
    ("x_iso_", isoforest_train),
    ("x_pca_", train_pca_df),
    ("x_svd_", train_svd_df),
    ("x_lof_", train_lof_df),
    ("x_tsne_", train_tsne_df),
]

# Every output column is named "<prefix><original column label>".
colnames = [
    _prefix + str(_col)
    for _prefix, _block in _train_blocks
    for _col in _block.columns
]

final_train_df = pd.concat([_block for _prefix, _block in _train_blocks], axis=1)
final_train_df.columns = colnames

Bind Test Frames

In [None]:
# Same block order as the training table, so the shared `colnames` line up.
_test_blocks = [
    svdd_test,
    dbscan_test,
    isoforest_test,
    test_pca_df,
    test_svd_df,
    test_lof_df,
    test_tsne_df,
]
final_test_df = pd.concat(_test_blocks, axis=1)
final_test_df.columns = colnames

Write to file

In [None]:
# Write both feature tables next to the preprocessed inputs, without the index.
for _frame, _csv_name in (
    (final_train_df, "train_b_cluster.csv"),
    (final_test_df, "test_b_cluster.csv"),
):
    _frame.to_csv(preproc_dir + _csv_name, index=False)