<a href="https://colab.research.google.com/github/lilakhd/Spectral-and-K-means-clustering-of-patients-with-MDD/blob/main/Research_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# I) Importing Data

In [None]:
# Importing Dependencies for Data Import
from google.colab import files
import io
import pandas as pd

In [None]:
# Upload the dataset through the Colab file picker and parse it with pandas.
# `files.upload()` returns a mapping keyed by the uploaded file name, so the
# file must be uploaded under the name 'gendep.csv'.
uploaded = files.upload()
csv_bytes = uploaded['gendep.csv']
df2 = pd.read_csv(io.BytesIO(csv_bytes))

# II) Exploring Data

In [None]:
# Show the first five rows of the imported data
df2.head(n=5)

In [None]:
# Report the dimensions of the imported dataset (i = rows, j = columns)
i, j = len(df2.index), len(df2.columns)
for count, noun in ((i, "participants"), (j, "columns")):
    print(f"A total of {count} {noun} were included in the initial dataset")

In [None]:
# Descriptive statistics for the whole sample
whole_sample_summary = df2.describe()
whole_sample_summary

In [None]:
# Find the columns that contain at least one missing value
has_missing = df2.isnull().any()
mi_col = has_missing[has_missing].index.tolist()
n_mi = len(mi_col)
# NOTE: the wording of these messages assumes exactly one column is affected
print(f"{n_mi} variable has missing data")
print(f"{mi_col} is the only variable with missing data")

# Count the subjects whose "prs" value is missing
prs_mi = df2["prs"].isnull().sum()
print(f"A total of {prs_mi} subjects have missing PRS values.")

In [None]:
# Keep only complete cases: any row holding a missing value in any column is
# removed. The cell above reports which columns are affected (expected: PRS).
df3 = df2.dropna(axis=0, how='any')

In [None]:
# Descriptive statistics for the whole sample after removing incomplete rows
complete_case_summary = df3.describe()
complete_case_summary

In [None]:
# Descriptive statistics split by the value of the "drug" column
# (computed on the data with incomplete rows removed)
by_drug = df3.groupby("drug")
by_drug.describe()

# III) Subsetting Data

In [None]:
# Columns removed from every analysis subset
id_columns = ['centreid', 'subjectid', 'Row.names', 'bloodsampleid.x']

# 1. Whole sample: every remaining participant, "drug" column retained
df_all = df3.drop(columns=id_columns)

# 2. Escitalopram sample (A): rows with drug == 2, "drug" column removed
df_esc = df3.drop(columns=id_columns)
df_esc = df_esc[df_esc['drug'] == 2].drop(columns=['drug'])

# 3. Nortriptyline sample (B): rows with drug == 1, "drug" column removed
df_nor = df3.drop(columns=id_columns)
df_nor = df_nor[df_nor['drug'] == 1].drop(columns=['drug'])

In [None]:
# Show the first five rows of the whole-sample subset
df_all.head(n=5)

# IV) Defining Feature Spaces/Sets

## A. All Sample

In [None]:
# Builds the four feature sets for the whole sample: raw features,
# standardized features, and PCA projections onto 2 and 90 components.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# Raw features: every column except the two outcome variables
outcome_cols = ["mdpercadj", "hdremit.all"]
X_all = df_all.drop(columns=outcome_cols)

# Standardized features. `sc` is bound to the scaler class itself, so a new
# scaler instance is created at the point of use.
sc = StandardScaler
X_scaled_all = sc().fit_transform(X_all)

# PCA restricted to the first two components
pca = PCA(n_components=2)
X_principal2_all = pd.DataFrame(
    pca.fit_transform(X_scaled_all),
    columns=['P1', 'P2'],
)

# Unrestricted PCA: plot cumulative explained variance against the number of
# components. Note that `pca` now refers to this full fit.
pca = PCA().fit(X_scaled_all)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.title('All Sample (N=421)')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')

# PCA with 90 components; columns are named P1 ... P90
pca90 = PCA(n_components=90)
X_principal90_all = pd.DataFrame(
    pca90.fit_transform(X_scaled_all),
    columns=[f'P{i}' for i in range(1, 91)],
)
X_principal90_all.head(2)

# Saving PCA plot (uncomment to download from Colab)
#from google.colab import files
#plt.savefig("all_PCA_n_selection.png")
#files.download("all_PCA_n_selection.png")


In [None]:
# Outcome columns for the whole sample, kept separate from the features
outcome_columns = ["mdpercadj", "hdremit.all"]
Y_all = df_all.loc[:, outcome_columns]
Y_all.head(n=5)

## B. Escitalopram Sample

In [None]:
# Builds the four feature sets for the escitalopram sample: raw features,
# standardized features, and PCA projections onto 2 and 90 components.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# Raw features: every column except the two outcome variables
X_esc = df_esc.drop(columns=["mdpercadj", "hdremit.all"])

# Standardized features (`sc` is the scaler class; instantiated on use)
sc = StandardScaler
X_scaled_esc = sc().fit_transform(X_esc)

# PCA restricted to the first two components
pca = PCA(n_components=2)
scores2_esc = pca.fit_transform(X_scaled_esc)
X_principal2_esc = pd.DataFrame(scores2_esc, columns=['P1', 'P2'])

# Unrestricted PCA: cumulative explained variance by number of components
pca_esc = PCA().fit(X_scaled_esc)
cumulative_variance_esc = np.cumsum(pca_esc.explained_variance_ratio_)
plt.plot(cumulative_variance_esc)
plt.title('Escitalopram Sample (n=217)')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
# Grab the figure handle before show() so it can still be written to disk
fig1 = plt.gcf()
plt.show()
fig1.savefig('esc_PCA_n_selection')

# PCA with 90 components; columns are named P1 ... P90
pca90_esc = PCA(n_components=90)
scores90_esc = pca90_esc.fit_transform(X_scaled_esc)
X_principal90_esc = pd.DataFrame(
    scores90_esc,
    columns=[f'P{i}' for i in range(1, 91)],
)
X_principal90_esc.head(2)

# Saving PCA plot (uncomment to download from Colab)
#from google.colab import files
#files.download("esc_PCA_n_selection.png")




In [None]:
# Outcome columns for the escitalopram sample, kept separate from the features
outcome_columns = ["mdpercadj", "hdremit.all"]
Y_esc = df_esc.loc[:, outcome_columns]
Y_esc.head(n=5)

## C. Nortriptyline Sample

In [None]:
# Builds the four feature sets for the nortriptyline sample: raw features,
# standardized features, and PCA projections onto 2 and 90 components.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# Raw Data: Features Only (every column except the two outcome variables)
X_nor = df_nor.drop(["mdpercadj", "hdremit.all"], axis=1)

# Standardized & Scaled Features Only
# (`sc` is bound to the scaler class; an instance is created on use)
sc = StandardScaler
X_scaled_nor = sc().fit_transform(X_nor)

# Dimensionality Reduction
## PCA n_components = 2
pca_nor = PCA(n_components=2)
X_principal2_nor = pd.DataFrame(
    pca_nor.fit_transform(X_scaled_nor),
    columns=['P1', 'P2'],
)

## PCA n_components optimised: plot cumulative explained variance.
## `pca_nor` is rebound here to the unrestricted fit.
pca_nor = PCA().fit(X_scaled_nor)
plt.plot(np.cumsum(pca_nor.explained_variance_ratio_))
plt.title('Nortryptaline Sample (n=204)')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
# Grab the figure handle before show() so it can still be written to disk
fig2 = plt.gcf()
plt.show()
fig2.savefig('nor_PCA_n_selection')

### n_components = 90 ###
# Fit the nortriptyline model itself. The previous code called
# pca90_esc.fit_transform here, which left pca90_nor unfitted and refitted
# the escitalopram PCA object on the nortriptyline data.
pca90_nor = PCA(n_components=90)
X_principal90_nor = pd.DataFrame(
    pca90_nor.fit_transform(X_scaled_nor),
    columns=['P%d' % i for i in range(1, 91)],
)
X_principal90_nor.head(2)

# Saving plot (uncomment to download from Colab)
# from google.colab import files
# files.download("nor_PCA_n_selection.png")
 



# V) Clustering

In [None]:
# K-Means estimator and the validation metrics used by the clustering cells
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import metrics
# Shared seed passed to the clustering and scoring calls below
random_state = 123

## A. K-Means

### i. All Sample

In [None]:
## K-Means iterated over cluster size = 2 through cluster size ##
## = 7 for each of the feature sets for the whole sample.      ##


def _kmeans_validation(features, outcome, k_values, seed,
                       label_frame=None, label_prefix=None):
    """Run K-Means for each k and collect the validation scores.

    Parameters:
        features: feature matrix to cluster.
        outcome: reference class labels, one per row of `features`.
        k_values: iterable of cluster counts to try.
        seed: random_state passed to KMeans and silhouette_score.
        label_frame: optional DataFrame; when given, the cluster labels for
            each k are stored in it as column `label_prefix + str(k)`.
        label_prefix: column-name prefix used with `label_frame`.

    Returns:
        (silhouette_scores, homogeneity_scores): two lists with one value
        per k, each rounded to 3 decimals.
    """
    silhouettes, homogeneities = [], []
    for k in k_values:
        # Fit once and reuse the same labels for storage and for both scores
        labels = KMeans(n_clusters=k, random_state=seed).fit_predict(features)
        if label_frame is not None:
            label_frame[label_prefix + str(k)] = labels
        silhouettes.append(round(
            silhouette_score(features, labels, metric='euclidean',
                             random_state=seed), 3))
        # homogeneity_score takes (labels_true, labels_pred); passing the
        # cluster labels first would return completeness instead.
        homogeneities.append(round(
            metrics.homogeneity_score(outcome, labels), 3))
    return silhouettes, homogeneities


K = range(2, 8, 1)
outcome_all = df_all['hdremit.all']

# Raw Data
SH_Km_all_raw, HM_Km_all_raw = _kmeans_validation(
    X_all, outcome_all, K, random_state)

# Scaled Data
SH_Km_all_scaled, HM_Km_all_scaled = _kmeans_validation(
    X_scaled_all, outcome_all, K, random_state)

# PCA (n = 2): labels are stored in df_all because this feature set was
# selected as the optimal one
SH_Km_all_pca2, HM_Km_all_pca2 = _kmeans_validation(
    X_principal2_all, outcome_all, K, random_state,
    label_frame=df_all, label_prefix='km_cl_all_')

# PCA (n = 90)
SH_Km_all_pca90, HM_Km_all_pca90 = _kmeans_validation(
    X_principal90_all, outcome_all, K, random_state)

In [None]:
# Summary of K-Means homogeneity and silhouette scores across feature sets
# in the whole sample (one row per cluster size)
km_all_summary = {
    'ClusterSize': list(range(2, 8)),
    'AvgSH_raw': SH_Km_all_raw,
    'Homogen_raw': HM_Km_all_raw,
    'AvgSH_scaled': SH_Km_all_scaled,
    'Homogen_scaled': HM_Km_all_scaled,
    'AvgSH_pca2': SH_Km_all_pca2,
    'Homogen_pca2': HM_Km_all_pca2,
    'AvgSH_pca90': SH_Km_all_pca90,
    'Homogen_pca90': HM_Km_all_pca90,
}
pd.DataFrame(km_all_summary)

PCA n=2 generally showed best performance.

### ii. Escitalopram

In [None]:
## K-Means iterated over cluster size = 2 through cluster size    ##
## = 7 for each of the feature sets for the escitalopram sample.  ##


def _kmeans_validation(features, outcome, k_values, seed,
                       label_frame=None, label_prefix=None):
    """Run K-Means for each k and collect the validation scores.

    Parameters:
        features: feature matrix to cluster.
        outcome: reference class labels, one per row of `features`.
        k_values: iterable of cluster counts to try.
        seed: random_state passed to KMeans and silhouette_score.
        label_frame: optional DataFrame; when given, the cluster labels for
            each k are stored in it as column `label_prefix + str(k)`.
        label_prefix: column-name prefix used with `label_frame`.

    Returns:
        (silhouette_scores, homogeneity_scores): two lists with one value
        per k, each rounded to 3 decimals.
    """
    silhouettes, homogeneities = [], []
    for k in k_values:
        # Fit once and reuse the same labels for storage and for both scores
        labels = KMeans(n_clusters=k, random_state=seed).fit_predict(features)
        if label_frame is not None:
            label_frame[label_prefix + str(k)] = labels
        silhouettes.append(round(
            silhouette_score(features, labels, metric='euclidean',
                             random_state=seed), 3))
        # homogeneity_score takes (labels_true, labels_pred); passing the
        # cluster labels first would return completeness instead.
        homogeneities.append(round(
            metrics.homogeneity_score(outcome, labels), 3))
    return silhouettes, homogeneities


K = range(2, 8, 1)
outcome_esc = df_esc['hdremit.all']

# Raw Data
SH_Km_esc_raw, HM_Km_esc_raw = _kmeans_validation(
    X_esc, outcome_esc, K, random_state)

# Scaled Data
SH_Km_esc_scaled, HM_Km_esc_scaled = _kmeans_validation(
    X_scaled_esc, outcome_esc, K, random_state)

# PCA n = 2: labels are stored in df_esc because this feature set showed
# the best performance across feature sets
SH_Km_esc_pca2, HM_Km_esc_pca2 = _kmeans_validation(
    X_principal2_esc, outcome_esc, K, random_state,
    label_frame=df_esc, label_prefix='km_cl_esc_')

# PCA n = 90
SH_Km_esc_pca90, HM_Km_esc_pca90 = _kmeans_validation(
    X_principal90_esc, outcome_esc, K, random_state)



In [None]:
# Summary of K-Means homogeneity and silhouette scores across feature sets
# in the escitalopram sample (one row per cluster size)
km_esc_summary = {
    'ClusterSize': list(range(2, 8)),
    'AvgSH_raw': SH_Km_esc_raw,
    'Homogen_raw': HM_Km_esc_raw,
    'AvgSH_scaled': SH_Km_esc_scaled,
    'Homogen_scaled': HM_Km_esc_scaled,
    'AvgSH_pca2': SH_Km_esc_pca2,
    'Homogen_pca2': HM_Km_esc_pca2,
    'AvgSH_pca90': SH_Km_esc_pca90,
    'Homogen_pca90': HM_Km_esc_pca90,
}
pd.DataFrame(km_esc_summary)

### iii. Nortriptyline

In [None]:
## K-Means iterated over cluster size = 2 through cluster size     ##
## = 7 for each of the feature sets for the nortriptyline sample.  ##


def _kmeans_validation(features, outcome, k_values, seed,
                       label_frame=None, label_prefix=None):
    """Run K-Means for each k and collect the validation scores.

    Parameters:
        features: feature matrix to cluster.
        outcome: reference class labels, one per row of `features`.
        k_values: iterable of cluster counts to try.
        seed: random_state passed to KMeans and silhouette_score.
        label_frame: optional DataFrame; when given, the cluster labels for
            each k are stored in it as column `label_prefix + str(k)`.
        label_prefix: column-name prefix used with `label_frame`.

    Returns:
        (silhouette_scores, homogeneity_scores): two lists with one value
        per k, each rounded to 3 decimals.
    """
    silhouettes, homogeneities = [], []
    for k in k_values:
        # Fit once and reuse the same labels for storage and for both scores
        labels = KMeans(n_clusters=k, random_state=seed).fit_predict(features)
        if label_frame is not None:
            label_frame[label_prefix + str(k)] = labels
        silhouettes.append(round(
            silhouette_score(features, labels, metric='euclidean',
                             random_state=seed), 3))
        # homogeneity_score takes (labels_true, labels_pred); passing the
        # cluster labels first would return completeness instead.
        homogeneities.append(round(
            metrics.homogeneity_score(outcome, labels), 3))
    return silhouettes, homogeneities


K = range(2, 8, 1)
outcome_nor = df_nor['hdremit.all']

# Raw Data
SH_Km_nor_raw, HM_Km_nor_raw = _kmeans_validation(
    X_nor, outcome_nor, K, random_state)

# Scaled Data
SH_Km_nor_scaled, HM_Km_nor_scaled = _kmeans_validation(
    X_scaled_nor, outcome_nor, K, random_state)

# PCA n = 2: labels are stored in df_nor because this feature set showed
# the best performance across feature sets
SH_Km_nor_pca2, HM_Km_nor_pca2 = _kmeans_validation(
    X_principal2_nor, outcome_nor, K, random_state,
    label_frame=df_nor, label_prefix='km_cl_nor_')

# PCA n = 90
SH_Km_nor_pca90, HM_Km_nor_pca90 = _kmeans_validation(
    X_principal90_nor, outcome_nor, K, random_state)


In [None]:
# Summary of K-Means homogeneity and silhouette scores across feature sets
# in the nortriptyline sample (one row per cluster size)
km_nor_summary = {
    'ClusterSize': list(range(2, 8)),
    'AvgSH_raw': SH_Km_nor_raw,
    'Homogen_raw': HM_Km_nor_raw,
    'AvgSH_scaled': SH_Km_nor_scaled,
    'Homogen_scaled': HM_Km_nor_scaled,
    'AvgSH_pca2': SH_Km_nor_pca2,
    'Homogen_pca2': HM_Km_nor_pca2,
    'AvgSH_pca90': SH_Km_nor_pca90,
    'Homogen_pca90': HM_Km_nor_pca90,
}
pd.DataFrame(km_nor_summary)

## B. Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering

#### i. All Sample

In [None]:
## Spectral iterated over cluster size = 2 through cluster size ##
## = 7 for each of the feature sets for the whole sample.      ##


def _spectral_validation(features, outcome, k_values, seed,
                         label_frame=None, label_prefix=None):
    """Run spectral clustering for each k and collect the validation scores.

    The model is fitted exactly once per k with a fixed random_state, so the
    stored labels and both scores all describe the same clustering.
    Previously the model was refitted for every use without a seed, so each
    value could come from a different label assignment.

    Parameters:
        features: feature matrix to cluster.
        outcome: reference class labels, one per row of `features`.
        k_values: iterable of cluster counts to try.
        seed: random_state passed to SpectralClustering and silhouette_score.
        label_frame: optional DataFrame; when given, the cluster labels for
            each k are stored in it as column `label_prefix + str(k)`.
        label_prefix: column-name prefix used with `label_frame`.

    Returns:
        (silhouette_scores, homogeneity_scores): two lists with one value
        per k, each rounded to 3 decimals.
    """
    silhouettes, homogeneities = [], []
    for k in k_values:
        model = SpectralClustering(n_clusters=k,
                                   affinity='nearest_neighbors',
                                   random_state=seed)
        labels = model.fit_predict(features)
        if label_frame is not None:
            label_frame[label_prefix + str(k)] = labels
        silhouettes.append(round(
            silhouette_score(features, labels, metric='euclidean',
                             random_state=seed), 3))
        # homogeneity_score takes (labels_true, labels_pred); passing the
        # cluster labels first would return completeness instead.
        homogeneities.append(round(
            metrics.homogeneity_score(outcome, labels), 3))
    return silhouettes, homogeneities


K = range(2, 8, 1)
outcome_all = df_all['hdremit.all']

# Raw Data
SH_SC_all_raw, HM_SC_all_raw = _spectral_validation(
    X_all, outcome_all, K, random_state)

# Scaled Data
SH_SC_all_scaled, HM_SC_all_scaled = _spectral_validation(
    X_scaled_all, outcome_all, K, random_state)

# PCA (n = 2): labels are stored in df_all because this feature set was
# selected as the optimal one
SH_SC_all_pca2, HM_SC_all_pca2 = _spectral_validation(
    X_principal2_all, outcome_all, K, random_state,
    label_frame=df_all, label_prefix='sc_cl_all_')

# PCA (n = 90)
SH_SC_all_pca90, HM_SC_all_pca90 = _spectral_validation(
    X_principal90_all, outcome_all, K, random_state)

In [None]:
# Summary of spectral-clustering homogeneity and silhouette scores across
# feature sets in the whole sample (one row per cluster size)
sc_all_summary = {
    'ClusterSize': list(range(2, 8)),
    'AvgSH_raw': SH_SC_all_raw,
    'Homogen_raw': HM_SC_all_raw,
    'AvgSH_scaled': SH_SC_all_scaled,
    'Homogen_scaled': HM_SC_all_scaled,
    'AvgSH_pca2': SH_SC_all_pca2,
    'Homogen_pca2': HM_SC_all_pca2,
    'AvgSH_pca90': SH_SC_all_pca90,
    'Homogen_pca90': HM_SC_all_pca90,
}
pd.DataFrame(sc_all_summary)

#### ii. Escitalopram

In [None]:
## Spectral iterated over cluster size = 2 through cluster size   ##
## = 7 for each of the feature sets for the escitalopram sample.  ##


def _spectral_validation(features, outcome, k_values, seed,
                         label_frame=None, label_prefix=None):
    """Run spectral clustering for each k and collect the validation scores.

    The model is fitted exactly once per k with a fixed random_state, so the
    stored labels and both scores all describe the same clustering.
    Previously the model was refitted for every use without a seed, so each
    value could come from a different label assignment.

    Parameters:
        features: feature matrix to cluster.
        outcome: reference class labels, one per row of `features`.
        k_values: iterable of cluster counts to try.
        seed: random_state passed to SpectralClustering and silhouette_score.
        label_frame: optional DataFrame; when given, the cluster labels for
            each k are stored in it as column `label_prefix + str(k)`.
        label_prefix: column-name prefix used with `label_frame`.

    Returns:
        (silhouette_scores, homogeneity_scores): two lists with one value
        per k, each rounded to 3 decimals.
    """
    silhouettes, homogeneities = [], []
    for k in k_values:
        model = SpectralClustering(n_clusters=k,
                                   affinity='nearest_neighbors',
                                   random_state=seed)
        labels = model.fit_predict(features)
        if label_frame is not None:
            label_frame[label_prefix + str(k)] = labels
        silhouettes.append(round(
            silhouette_score(features, labels, metric='euclidean',
                             random_state=seed), 3))
        # homogeneity_score takes (labels_true, labels_pred); passing the
        # cluster labels first would return completeness instead.
        homogeneities.append(round(
            metrics.homogeneity_score(outcome, labels), 3))
    return silhouettes, homogeneities


K = range(2, 8, 1)
outcome_esc = df_esc['hdremit.all']

# Raw Data
SH_SC_esc_raw, HM_SC_esc_raw = _spectral_validation(
    X_esc, outcome_esc, K, random_state)

# Scaled Data
SH_SC_esc_scaled, HM_SC_esc_scaled = _spectral_validation(
    X_scaled_esc, outcome_esc, K, random_state)

# PCA (n = 2): labels are stored in df_esc because this feature set was
# selected as the optimal one
SH_SC_esc_pca2, HM_SC_esc_pca2 = _spectral_validation(
    X_principal2_esc, outcome_esc, K, random_state,
    label_frame=df_esc, label_prefix='sc_cl_esc_')

# PCA (n = 90)
SH_SC_esc_pca90, HM_SC_esc_pca90 = _spectral_validation(
    X_principal90_esc, outcome_esc, K, random_state)

In [None]:
# Summary of spectral-clustering homogeneity and silhouette scores across
# feature sets in the escitalopram sample (one row per cluster size).
# The raw-data homogeneity column is named 'Homogen_raw' (previously the
# typo 'Homogen_ra'), matching the other summary tables.
pd.DataFrame({'ClusterSize' : [2,3,4,5,6,7],
              'AvgSH_raw': SH_SC_esc_raw,
              'Homogen_raw': HM_SC_esc_raw,
              'AvgSH_scaled': SH_SC_esc_scaled,
              'Homogen_scaled': HM_SC_esc_scaled,
              'AvgSH_pca2': SH_SC_esc_pca2,
              'Homogen_pca2': HM_SC_esc_pca2,
              'AvgSH_pca90': SH_SC_esc_pca90,
              'Homogen_pca90': HM_SC_esc_pca90
})

### iii. Nortriptyline

In [None]:
## Spectral iterated over cluster size = 2 through cluster size    ##
## = 7 for each of the feature sets for the nortriptyline sample.  ##


def _spectral_validation(features, outcome, k_values, seed,
                         label_frame=None, label_prefix=None):
    """Run spectral clustering for each k and collect the validation scores.

    The model is fitted exactly once per k with a fixed random_state, so the
    stored labels and both scores all describe the same clustering.
    Previously the model was refitted for every use without a seed, so each
    value could come from a different label assignment.

    Parameters:
        features: feature matrix to cluster.
        outcome: reference class labels, one per row of `features`.
        k_values: iterable of cluster counts to try.
        seed: random_state passed to SpectralClustering and silhouette_score.
        label_frame: optional DataFrame; when given, the cluster labels for
            each k are stored in it as column `label_prefix + str(k)`.
        label_prefix: column-name prefix used with `label_frame`.

    Returns:
        (silhouette_scores, homogeneity_scores): two lists with one value
        per k, each rounded to 3 decimals.
    """
    silhouettes, homogeneities = [], []
    for k in k_values:
        model = SpectralClustering(n_clusters=k,
                                   affinity='nearest_neighbors',
                                   random_state=seed)
        labels = model.fit_predict(features)
        if label_frame is not None:
            label_frame[label_prefix + str(k)] = labels
        silhouettes.append(round(
            silhouette_score(features, labels, metric='euclidean',
                             random_state=seed), 3))
        # homogeneity_score takes (labels_true, labels_pred); passing the
        # cluster labels first would return completeness instead.
        homogeneities.append(round(
            metrics.homogeneity_score(outcome, labels), 3))
    return silhouettes, homogeneities


K = range(2, 8, 1)
outcome_nor = df_nor['hdremit.all']

# Raw Data
SH_SC_nor_raw, HM_SC_nor_raw = _spectral_validation(
    X_nor, outcome_nor, K, random_state)

# Scaled Data
SH_SC_nor_scaled, HM_SC_nor_scaled = _spectral_validation(
    X_scaled_nor, outcome_nor, K, random_state)

# PCA (n = 2): labels are stored in df_nor because this feature set was
# selected as the optimal one
SH_SC_nor_pca2, HM_SC_nor_pca2 = _spectral_validation(
    X_principal2_nor, outcome_nor, K, random_state,
    label_frame=df_nor, label_prefix='sc_cl_nor_')

# PCA (n = 90)
SH_SC_nor_pca90, HM_SC_nor_pca90 = _spectral_validation(
    X_principal90_nor, outcome_nor, K, random_state)

In [None]:
# Summary of spectral-clustering homogeneity and silhouette scores across
# feature sets in the nortriptyline sample (one row per cluster size).
# The raw-data homogeneity column is named 'Homogen_raw' (previously the
# typo 'Homogen_ra'), matching the other summary tables.
pd.DataFrame({'ClusterSize' : [2,3,4,5,6,7],
              'AvgSH_raw': SH_SC_nor_raw,
              'Homogen_raw': HM_SC_nor_raw,
              'AvgSH_scaled': SH_SC_nor_scaled,
              'Homogen_scaled': HM_SC_nor_scaled,
              'AvgSH_pca2': SH_SC_nor_pca2,
              'Homogen_pca2': HM_SC_nor_pca2,
              'AvgSH_pca90': SH_SC_nor_pca90,
              'Homogen_pca90': HM_SC_nor_pca90
})

# VI) Plotting Validation Scores by K-size

Below are plots of how the homogeneity and silhouette scores vary with the number of clusters in each of the samples. They are based on the validation scores for the best-performing feature set (i.e., PCA with n_components = 2).


## i. All Sample

In [None]:
## AVG SILHOUETTE SCORE (whole sample, PCA n=2 feature set) ##
import numpy as np
import matplotlib.pyplot as plt

# set width of bars
barWidth = 0.25

# set heights of bars: K-Means vs spectral clustering, one value per k
bars1 = SH_Km_all_pca2
bars2 = SH_SC_all_pca2

# Set position of bar on X axis (second series shifted by one bar width)
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

# Make the plot. Each series carries a label so that plt.legend() has
# entries to show (without labels the legend is empty).
plt.bar(r1, bars1, color='green', width=barWidth, edgecolor='white', label='K-means Clustering')
plt.bar(r2, bars2, color='blue', width=barWidth, edgecolor='white', label='Spectral Clustering')

# Add xticks on the middle of the group bars: with two bars centred at r and
# r + barWidth, the middle of the pair is r + barWidth / 2.
plt.xlabel('Cluster Size', fontweight='bold')
plt.xticks([r + barWidth / 2 for r in range(len(bars1))], ['k=2', 'k=3', 'k=4', 'k=5', 'k=6','k=7'])

# Create legend & Show graphic (save before show so the file has content)
plt.legend()
plt.title("Average Silhoutte Score")
plt.savefig("SH_all.png")
plt.show()

# save plot (uncomment to download from Colab)
#from google.colab import files
#files.download("SH_all.png")

In [None]:
## HOMOGENEITY (whole sample, PCA n=2 feature set) ##
import numpy as np
import matplotlib.pyplot as plt

# set width of bars
barWidth = 0.25

# set heights of bars: K-Means vs spectral clustering, one value per k
bars1 = HM_Km_all_pca2
bars2 = HM_SC_all_pca2

# Set position of bar on X axis (second series shifted by one bar width)
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

# Make the plot
plt.bar(r1, bars1, color='green', width=barWidth, edgecolor='white', label='K-means Clustering')
plt.bar(r2, bars2, color='blue', width=barWidth, edgecolor='white', label = 'Spectral Clustering')

# Add xticks on the middle of the group bars: with two bars centred at r and
# r + barWidth, the middle of the pair is r + barWidth / 2.
plt.xlabel('Cluster Size', fontweight='bold')
plt.xticks([r + barWidth / 2 for r in range(len(bars1))], ['k=2', 'k=3', 'k=4', 'k=5', 'k=6','k=7'])
plt.title("Homogeneity")

# Create legend & Show graphic. A single legend call is enough; the earlier
# bare plt.legend() was immediately replaced by this one.
plt.legend(loc= 'upper right', ncol = 2)
plt.savefig("HM_all.png")
plt.show()

# save plot (uncomment to download from Colab)
# from google.colab import files
# files.download("HM_all.png")

## ii. Escitalopram

In [None]:
## AVG SILHOUETTE SCORE (escitalopram sample, PCA n=2 feature set) ##
import numpy as np
import matplotlib.pyplot as plt

# set width of bars
barWidth = 0.25

# set heights of bars: K-Means vs spectral clustering, one value per k
bars1 = SH_Km_esc_pca2
bars2 = SH_SC_esc_pca2

# Set position of bar on X axis (second series shifted by one bar width)
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

# Make the plot. Each series carries a label so that plt.legend() has
# entries to show (without labels the legend is empty).
plt.bar(r1, bars1, color='green', width=barWidth, edgecolor='white', label='K-means Clustering')
plt.bar(r2, bars2, color='blue', width=barWidth, edgecolor='white', label='Spectral Clustering')

# Add xticks on the middle of the group bars: with two bars centred at r and
# r + barWidth, the middle of the pair is r + barWidth / 2.
plt.xlabel('Cluster Size', fontweight='bold')
plt.xticks([r + barWidth / 2 for r in range(len(bars1))], ['k=2', 'k=3', 'k=4', 'k=5', 'k=6','k=7'])

# Create legend & Show graphic (save before show so the file has content)
plt.legend()
plt.savefig("SH_esc.png")
plt.show()

# save plot: download the saved figure from the Colab runtime
from google.colab import files
files.download("SH_esc.png")

In [None]:
## HOMOGENEITY (escitalopram sample, PCA n=2 feature set) ##
import numpy as np
import matplotlib.pyplot as plt

# set width of bars
barWidth = 0.25

# set heights of bars: K-Means vs spectral clustering, one value per k
bars1 = HM_Km_esc_pca2
bars2 = HM_SC_esc_pca2

# Set position of bar on X axis (second series shifted by one bar width)
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

# Make the plot. Each series carries a label so that plt.legend() has
# entries to show (without labels the legend is empty).
plt.bar(r1, bars1, color='green', width=barWidth, edgecolor='white', label='K-means Clustering')
plt.bar(r2, bars2, color='blue', width=barWidth, edgecolor='white', label='Spectral Clustering')

# Add xticks on the middle of the group bars: with two bars centred at r and
# r + barWidth, the middle of the pair is r + barWidth / 2.
plt.xlabel('Cluster Size', fontweight='bold')
plt.xticks([r + barWidth / 2 for r in range(len(bars1))], ['k=2', 'k=3', 'k=4', 'k=5', 'k=6','k=7'])

# Create legend & Show graphic (save before show so the file has content)
plt.legend()
plt.savefig("HM_ESC.png")
plt.show()

# save plot: download the saved figure from the Colab runtime
from google.colab import files
files.download("HM_ESC.png")

## iii. Nortriptyline

In [None]:
## AVG SILHOUETTE SCORE (nortriptyline sample, PCA n=2 feature set) ##
import numpy as np
import matplotlib.pyplot as plt

# set width of bars
barWidth = 0.25

# set heights of bars: K-Means vs spectral clustering, one value per k
bars1 = SH_Km_nor_pca2
bars2 = SH_SC_nor_pca2

# Set position of bar on X axis (second series shifted by one bar width)
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

# Make the plot. Each series carries a label so that plt.legend() has
# entries to show (without labels the legend is empty).
plt.bar(r1, bars1, color='green', width=barWidth, edgecolor='white', label='K-means Clustering')
plt.bar(r2, bars2, color='blue', width=barWidth, edgecolor='white', label='Spectral Clustering')

# Add xticks on the middle of the group bars: with two bars centred at r and
# r + barWidth, the middle of the pair is r + barWidth / 2.
plt.xlabel('Cluster Size', fontweight='bold')
plt.xticks([r + barWidth / 2 for r in range(len(bars1))], ['k=2', 'k=3', 'k=4', 'k=5', 'k=6','k=7'])

# Create legend & Show graphic (save before show so the file has content)
plt.legend()
plt.savefig("SH_nor.png")
plt.show()

# save plot: download the saved figure from the Colab runtime
from google.colab import files
files.download("SH_nor.png")

In [None]:
## HOMOGENEITY (nortriptyline sample, PCA n=2 feature set) ##
import numpy as np
import matplotlib.pyplot as plt

# set width of bars
barWidth = 0.25

# set heights of bars: K-Means vs spectral clustering, one value per k
bars1 = HM_Km_nor_pca2
bars2 = HM_SC_nor_pca2

# Set position of bar on X axis (second series shifted by one bar width)
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

# Make the plot. Each series carries a label so that plt.legend() has
# entries to show (without labels the legend is empty).
plt.bar(r1, bars1, color='green', width=barWidth, edgecolor='white', label='K-means Clustering')
plt.bar(r2, bars2, color='blue', width=barWidth, edgecolor='white', label='Spectral Clustering')

# Add xticks on the middle of the group bars: with two bars centred at r and
# r + barWidth, the middle of the pair is r + barWidth / 2.
plt.xlabel('Cluster Size', fontweight='bold')
plt.xticks([r + barWidth / 2 for r in range(len(bars1))], ['k=2', 'k=3', 'k=4', 'k=5', 'k=6','k=7'])

# Create legend & Show graphic (save before show so the file has content)
plt.legend()
plt.savefig("HM_nor.png")
plt.show()

# save plot (uncomment to download from Colab)
# from google.colab import files
# files.download("HM_nor.png")

# VII) Prediction Modelling

In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_validate
from numpy import mean

## i. K-Means

### a. All Sample




#### ALL SAMPLE w/o cluster var

In [None]:
# Show the first five rows of df_all before building the supervised inputs
df_all.head(n=5)

In [None]:
# Supervised inputs for the whole sample without any cluster variable.
# Dropped from the features: every stored K-Means ("km") and spectral ("sc")
# cluster assignment for k = 2..7, plus the two outcome columns.
cluster_cols = [f"{prefix}_cl_all_{k}" for prefix in ("km", "sc") for k in range(2, 8)]
X_sup_all = df_all.drop(columns=cluster_cols + ["mdpercadj", "hdremit.all"])
# Target: the "hdremit.all" outcome column
Y_sup_all = df_all["hdremit.all"]

In [None]:
from collections import Counter
# Ratio of label-0 to label-1 counts in the target. It is used as the lower
# bound of the scale_pos_weight values in the hyperparameter search grid.
counter = Counter(Y_sup_all)
n_label0, n_label1 = counter[0], counter[1]
estimate = n_label0 / n_label1
print(f'Estimate: {estimate:.3f}')

In [None]:
import numpy as np
# Candidate values for each XGBoost hyperparameter sampled by the random search.

## scale_pos_weight (class imbalance): 10 evenly spaced values from `estimate`
## up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## Number of trees: 200, 400, ..., 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## Maximum tree depth: 2, 4, ..., 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

# Objective and evaluation metric are fixed (single-element candidate lists).
xgb_objective = ['reg:logistic']
xgb_eval_metric = ['aucpr']

# Search space handed to RandomizedSearchCV.
xgb_grid = dict(
    n_estimators=xgb_n_estimators,
    max_depth=xgb_max_depth,
    min_child_weight=xgb_min_child_weight,
    scale_pos_weight=xgb_scale_pos_weight,
    eta=xgb_eta,
    objective=xgb_objective,
    eval_metric=xgb_eval_metric,
)


In [None]:
# Base classifier with a fixed seed.
# NOTE(review): this rebinds `xgb`, hiding the `import xgboost as xgb` alias.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_all = RandomizedSearchCV(
    xgb,
    xgb_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=random_state,
    return_train_score=True,
    n_jobs=-1,
)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cv_result = cross_validate(
    xgbrndm_all,
    X_sup_all,
    Y_sup_all,
    cv=5,
    scoring=('accuracy', 'precision', 'recall', 'roc_auc'),
)

In [None]:
cv_result

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 3 decimals.
for _label, _key in (
    ("Mean Accuracy = ", "test_accuracy"),
    ("Mean AUC = ", "test_roc_auc"),
    ("Mean Precision =  ", "test_precision"),
    ("Mean Recall =  ", "test_recall"),
):
    print(f"{_label}{round(mean(cv_result[_key]),3)}")

#### K = 2 ; w/cluster var

In [None]:
df_all.head()

In [None]:
# Predictors for the whole sample, keeping only the k=2 K-means label
# (`km_cl_all_2`): drop the outcomes and every other cluster-label column.
_other_cluster_cols = [f"km_cl_all_{k}" for k in range(3, 8)] + [f"sc_cl_all_{k}" for k in range(2, 8)]
X_sup_all_k2 = df_all.drop(columns=_other_cluster_cols + ["mdpercadj", "hdremit.all"])
# Target: the `hdremit.all` column.
Y_sup_all_k2 = df_all.loc[:, "hdremit.all"]

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 in the target; used as the
# lower bound of the scale_pos_weight values searched later.
counter = Counter(Y_sup_all_k2)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Candidate values for each XGBoost hyperparameter sampled by the random search.

## scale_pos_weight (class imbalance): 10 evenly spaced values from `estimate`
## up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## Number of trees: 200, 400, ..., 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## Maximum tree depth: 2, 4, ..., 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

# Objective and evaluation metric are fixed (single-element candidate lists).
xgb_objective = ['reg:logistic']
xgb_eval_metric = ['aucpr']

# Search space handed to RandomizedSearchCV.
xgb_grid = dict(
    n_estimators=xgb_n_estimators,
    max_depth=xgb_max_depth,
    min_child_weight=xgb_min_child_weight,
    scale_pos_weight=xgb_scale_pos_weight,
    eta=xgb_eta,
    objective=xgb_objective,
    eval_metric=xgb_eval_metric,
)


In [None]:
# Base classifier with a fixed seed.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_all_k2 = RandomizedSearchCV(
    xgb,
    xgb_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=random_state,
    return_train_score=True,
    n_jobs=-1,
)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cv_results3 = cross_validate(
    xgbrndm_all_k2,
    X_sup_all_k2,
    Y_sup_all_k2,
    cv=5,
    scoring=('accuracy', 'precision', 'recall', 'roc_auc'),
)

In [None]:
cv_results3

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 3 decimals.
for _label, _key in (
    ("Mean Accuracy = ", "test_accuracy"),
    ("Mean AUC = ", "test_roc_auc"),
    ("Mean Precision =  ", "test_precision"),
    ("Mean Recall =  ", "test_recall"),
):
    print(f"{_label}{round(mean(cv_results3[_key]),3)}")

#### K = 2 w/o cluster var

##### Cluster 1

In [None]:
df_all.head()

In [None]:
# Build X / Y for cluster 1 of the k=2 K-means solution (whole sample).
# Rows whose k=2 label equals 1:
_subset = df_all.loc[df_all['km_cl_all_2'] == 1]
# Outcome for those rows:
Y_km2_all_cl1 = _subset.loc[:, "hdremit.all"]
# Predictors: drop both outcome columns and every cluster-label column.
X_km2_all_cl1 = _subset.drop(
    columns=["hdremit.all", "mdpercadj"]
    + [f"km_cl_all_{k}" for k in range(2, 8)]
    + [f"sc_cl_all_{k}" for k in range(2, 8)]
)


In [None]:
# Sanity check: outcome and predictors must have the same number of rows.
print("Same Length" if len(Y_km2_all_cl1) == len(X_km2_all_cl1) else "Not Mathcing")

# Cluster size (displayed as the cell output).
len(X_km2_all_cl1)

In [None]:
X_km2_all_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_km2_all_cl1.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 within this cluster; a
# starting point for scale_pos_weight.
counter = Counter(Y_km2_all_cl1)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance): 10 evenly spaced values from the
## class ratio `estimate` up to 20, truncated to integers. The lower bound uses
## the computed `estimate` rather than a hand-typed rounded constant, so it
## cannot go stale if the cluster membership changes.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used: 200, 400, ..., 2000
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree: 2, 4, ..., 20
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used (fixed; single-element candidate list)
xgb_objective = ['reg:logistic']

# Evaluation metric used (fixed; single-element candidate list)
xgb_eval_metric = ['aucpr']

# Create the grid handed to RandomizedSearchCV
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base classifier. The seed has to be passed as `random_state`; the previous
# spelling `randome_state` is not a recognised argument, so no seed was applied.
xgb = XGBClassifier(random_state=random_state)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_all_k2_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cvresults_all_k2_cl1 = cross_validate(xgbrndm_all_k2_cl1,X_km2_all_cl1,Y_km2_all_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_k2_cl1

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 3 decimals.
for _label, _key in (
    ("Mean Accuracy = ", "test_accuracy"),
    ("Mean AUC = ", "test_roc_auc"),
    ("Mean Precision = ", "test_precision"),
    ("Mean Recall = ", "test_recall"),
):
    print(f"{_label}{round(mean(cvresults_all_k2_cl1[_key]),3)}")

In [None]:
m_all_k2_cl1 = xgbrndm_all_k2_cl1.fit(X_km2_all_cl1,Y_km2_all_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: pair each predictor name with the importance score of
# the best estimator found by the search.
# NOTE(review): the column is called 'fscore', but which importance measure
# `feature_importances_` reports depends on the XGBoost importance_type — confirm.
_best_model = m_all_k2_cl1.best_estimator_
importances_all_k2_cl1 = pd.DataFrame({
    'fscore': _best_model.feature_importances_,
    'varname': X_km2_all_cl1.columns.tolist(),
})
importances_all_k2_cl1

##### Cluster 0

In [None]:
df_all.head()

In [None]:
# Build X / Y for cluster 0 of the k=2 K-means solution (whole sample).
# Rows whose k=2 label equals 0:
_subset = df_all.loc[df_all['km_cl_all_2'] == 0]
# Outcome for those rows:
Y_km2_all_cl0 = _subset.loc[:, "hdremit.all"]
# Predictors: drop both outcome columns and every cluster-label column.
X_km2_all_cl0 = _subset.drop(
    columns=["hdremit.all", "mdpercadj"]
    + [f"km_cl_all_{k}" for k in range(2, 8)]
    + [f"sc_cl_all_{k}" for k in range(2, 8)]
)


In [None]:
# Sanity check: outcome and predictors must have the same number of rows.
_same_length = len(Y_km2_all_cl0) == len(X_km2_all_cl0)
print(f"Same Length (n={len(X_km2_all_cl0)})" if _same_length else "Not Mathcing")


In [None]:
X_km2_all_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_km2_all_cl0.value_counts(normalize=True)
##Only slightly imbalanced

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 within this cluster; a
# starting point for scale_pos_weight.
counter = Counter(Y_km2_all_cl0)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Candidate values for each XGBoost hyperparameter sampled by the random search.

## scale_pos_weight (class imbalance): 10 evenly spaced values from `estimate`
## up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## Number of trees: 200, 400, ..., 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## Maximum tree depth: 2, 4, ..., 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

# Objective and evaluation metric are fixed (single-element candidate lists).
xgb_objective = ['reg:logistic']
xgb_eval_metric = ['aucpr']

# Search space handed to RandomizedSearchCV.
xgb_grid = dict(
    n_estimators=xgb_n_estimators,
    max_depth=xgb_max_depth,
    min_child_weight=xgb_min_child_weight,
    scale_pos_weight=xgb_scale_pos_weight,
    eta=xgb_eta,
    objective=xgb_objective,
    eval_metric=xgb_eval_metric,
)


In [None]:
# Base classifier. The seed has to be passed as `random_state`; the previous
# spelling `randome_state` is not a recognised argument, so no seed was applied.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_all_k2_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cvresults_all_k2_cl0 = cross_validate(xgbrndm_all_k2_cl0, X_km2_all_cl0, Y_km2_all_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_k2_cl0

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 3 decimals.
for _label, _key in (
    ("Mean Accuracy = ", "test_accuracy"),
    ("Mean AUC = ", "test_roc_auc"),
    ("Mean Precision = ", "test_precision"),
    ("Mean Recall = ", "test_recall"),
):
    print(f"{_label}{round(mean(cvresults_all_k2_cl0[_key]),3)}")

In [None]:
m_all_k2_cl0 = xgbrndm_all_k2_cl0.fit(X_km2_all_cl0,Y_km2_all_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: pair each predictor name with the importance score of
# the best estimator found by the search.
_best_model = m_all_k2_cl0.best_estimator_
importances_all_k2_cl0 = pd.DataFrame({
    'fscore': _best_model.feature_importances_,
    'varname': X_km2_all_cl0.columns.tolist(),
})
importances_all_k2_cl0

#### K = 3 ; w/cluster var

In [None]:
df_all.head()

In [None]:
# Predictors for the whole sample, keeping only the k=3 K-means label
# (`km_cl_all_3`): drop the outcomes and every other cluster-label column.
_other_cluster_cols = [f"km_cl_all_{k}" for k in range(2, 8) if k != 3] + [f"sc_cl_all_{k}" for k in range(2, 8)]
X_sup_all_k3 = df_all.drop(columns=_other_cluster_cols + ["mdpercadj", "hdremit.all"])
# Target: the `hdremit.all` column.
Y_sup_all_k3 = df_all.loc[:, "hdremit.all"]

In [None]:
# Sanity check: outcome and predictors must have the same number of rows.
_same_length = len(Y_sup_all_k3) == len(X_sup_all_k3)
print(f"Same Length (n={len(X_sup_all_k3)})" if _same_length else "Not Mathcing")

In [None]:
X_sup_all_k3.head()

In [None]:
# Checking for Imbalance: 
Y_sup_all_k3.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 in the target; used as the
# lower bound of the scale_pos_weight values searched later.
counter = Counter(Y_sup_all_k3)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Candidate values for each XGBoost hyperparameter sampled by the random search.

## scale_pos_weight (class imbalance): 10 evenly spaced values from `estimate`
## up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## Number of trees: 200, 400, ..., 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## Maximum tree depth: 2, 4, ..., 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

# Objective and evaluation metric are fixed (single-element candidate lists).
xgb_objective = ['reg:logistic']
xgb_eval_metric = ['aucpr']

# Search space handed to RandomizedSearchCV.
xgb_grid = dict(
    n_estimators=xgb_n_estimators,
    max_depth=xgb_max_depth,
    min_child_weight=xgb_min_child_weight,
    scale_pos_weight=xgb_scale_pos_weight,
    eta=xgb_eta,
    objective=xgb_objective,
    eval_metric=xgb_eval_metric,
)


In [None]:
# Base classifier. The seed has to be passed as `random_state`; the previous
# spelling `randome_state` is not a recognised argument, so no seed was applied.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_all_k3 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cv_results_all_k3 = cross_validate(xgbrndm_all_k3,X_sup_all_k3,Y_sup_all_k3,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_all_k3

In [None]:
from numpy import mean
# Mean of each test metric across the outer CV folds. AUC is shown with
# 4 decimals, the other metrics with 3.
for _label, _key, _digits in (
    ("Mean Accuracy = ", "test_accuracy", 3),
    ("Mean AUC = ", "test_roc_auc", 4),
    ("Mean Precision = ", "test_precision", 3),
    ("Mean Recall = ", "test_recall", 3),
):
    print(f"{_label}{round(mean(cv_results_all_k3[_key]),_digits)}")

In [None]:
m_all_k3 = xgbrndm_all_k3.fit(X_sup_all_k3,Y_sup_all_k3)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: pair each predictor name with the importance score of
# the best estimator found by the search.
_best_model = m_all_k3.best_estimator_
importances_all_k3 = pd.DataFrame({
    'fscore': _best_model.feature_importances_,
    'varname': X_sup_all_k3.columns.tolist(),
})
importances_all_k3

#### K = 3; w/o cluster var

##### Cluster 2

In [None]:
df_all.head()

In [None]:
# Build X / Y for cluster 2 of the k=3 K-means solution (whole sample).
# Rows whose k=3 label equals 2:
_subset = df_all.loc[df_all['km_cl_all_3'] == 2]
# Outcome for those rows:
Y_km3_all_cl2 = _subset.loc[:, "hdremit.all"]
# Predictors: drop both outcome columns and every cluster-label column.
X_km3_all_cl2 = _subset.drop(
    columns=["hdremit.all", "mdpercadj"]
    + [f"km_cl_all_{k}" for k in range(2, 8)]
    + [f"sc_cl_all_{k}" for k in range(2, 8)]
)


In [None]:
# Sanity check: outcome and predictors must have the same number of rows.
print("Same Length" if len(Y_km3_all_cl2) == len(X_km3_all_cl2) else "Not Mathcing")

# Cluster size.
print(len(Y_km3_all_cl2))

In [None]:
X_km3_all_cl2.head()

In [None]:
# Checking for Imbalance: 
Y_km3_all_cl2.value_counts(normalize=True)
## Slightly Imbalanced

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 within this cluster; a
# starting point for scale_pos_weight.
counter = Counter(Y_km3_all_cl2)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance): 10 evenly spaced values from the
## class ratio `estimate` up to 20, truncated to integers. The lower bound uses
## the computed `estimate` rather than a hand-typed rounded constant, so it
## cannot go stale if the cluster membership changes.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used: 200, 400, ..., 2000
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree: 2, 4, ..., 20
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used (fixed; single-element candidate list)
xgb_objective = ['reg:logistic']

# Evaluation metric used (fixed; single-element candidate list)
xgb_eval_metric = ['aucpr']

# Create the grid handed to RandomizedSearchCV
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base classifier. The seed has to be passed as `random_state`; the previous
# spelling `randome_state` is not a recognised argument, so no seed was applied.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_all_k3_cl2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cvresults_all_k3_cl2 = cross_validate(xgbrndm_all_k3_cl2,X_km3_all_cl2,Y_km3_all_cl2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_k3_cl2

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 4 decimals.
for _label, _key in (
    ("Mean Accuracy = ", "test_accuracy"),
    ("Mean AUC = ", "test_roc_auc"),
    ("Mean Precision = ", "test_precision"),
    ("Mean Recall = ", "test_recall"),
):
    print(f"{_label}{round(mean(cvresults_all_k3_cl2[_key]),4)}")

In [None]:
m_all_k3_cl2 = xgbrndm_all_k3_cl2.fit(X_km3_all_cl2,Y_km3_all_cl2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: pair each predictor name with the importance score of
# the best estimator found by the search.
_best_model = m_all_k3_cl2.best_estimator_
importances_all_k3_cl2 = pd.DataFrame({
    'fscore': _best_model.feature_importances_,
    'varname': X_km3_all_cl2.columns.tolist(),
})
importances_all_k3_cl2

##### Cluster 1

In [None]:
df_all.head()

In [None]:
# Build X / Y for cluster 1 of the k=3 K-means solution (whole sample).
# Rows whose k=3 label equals 1:
_subset = df_all.loc[df_all['km_cl_all_3'] == 1]
# Outcome for those rows:
Y_km3_all_cl1 = _subset.loc[:, "hdremit.all"]
# Predictors: drop both outcome columns and every cluster-label column.
X_km3_all_cl1 = _subset.drop(
    columns=["hdremit.all", "mdpercadj"]
    + [f"km_cl_all_{k}" for k in range(2, 8)]
    + [f"sc_cl_all_{k}" for k in range(2, 8)]
)


In [None]:
# Sanity check: outcome and predictors must have the same number of rows.
_same_length = len(Y_km3_all_cl1) == len(X_km3_all_cl1)
print(f"Same Length {len(Y_km3_all_cl1)}" if _same_length else "Not Mathcing")

# Cluster size.
print(f"The cluster contains {len(X_km3_all_cl1)} patients ")

In [None]:
X_km3_all_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_km3_all_cl1.value_counts(normalize=True)
## Roughly Balanced

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 within this cluster; a
# starting point for scale_pos_weight.
counter = Counter(Y_km3_all_cl1)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Candidate values for each XGBoost hyperparameter sampled by the random search.

## scale_pos_weight (class imbalance): 10 evenly spaced values from `estimate`
## up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## Number of trees: 200, 400, ..., 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## Maximum tree depth: 2, 4, ..., 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

# Objective and evaluation metric are fixed (single-element candidate lists).
xgb_objective = ['reg:logistic']
xgb_eval_metric = ['aucpr']

# Search space handed to RandomizedSearchCV.
xgb_grid = dict(
    n_estimators=xgb_n_estimators,
    max_depth=xgb_max_depth,
    min_child_weight=xgb_min_child_weight,
    scale_pos_weight=xgb_scale_pos_weight,
    eta=xgb_eta,
    objective=xgb_objective,
    eval_metric=xgb_eval_metric,
)


In [None]:
from xgboost import XGBClassifier
# Base classifier. The seed has to be passed as `random_state`; the previous
# spelling `randome_state` is not a recognised argument, so no seed was applied.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_all_k3_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cvresults_all_k3_cl1 = cross_validate(xgbrndm_all_k3_cl1,X_km3_all_cl1,Y_km3_all_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_k3_cl1

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 3 decimals.
for _label, _key in (
    ("Mean Accuracy = ", "test_accuracy"),
    ("Mean AUC = ", "test_roc_auc"),
    ("Mean Precision = ", "test_precision"),
    ("Mean Recall = ", "test_recall"),
):
    print(f"{_label}{round(mean(cvresults_all_k3_cl1[_key]),3)}")

In [None]:
m_all_k3_cl1 = xgbrndm_all_k3_cl1.fit(X_km3_all_cl1,Y_km3_all_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: pair each predictor name with the importance score of
# the best estimator found by the search.
_best_model = m_all_k3_cl1.best_estimator_
importances_all_k3_cl1 = pd.DataFrame({
    'fscore': _best_model.feature_importances_,
    'varname': X_km3_all_cl1.columns.tolist(),
})
importances_all_k3_cl1

##### Cluster 0

In [None]:
df_all.head()

In [None]:
# Build X / Y for cluster 0 of the k=3 K-means solution (whole sample).
# Rows whose k=3 label equals 0:
_subset = df_all.loc[df_all['km_cl_all_3'] == 0]
# Outcome for those rows:
Y_km3_all_cl0 = _subset.loc[:, "hdremit.all"]
# Predictors: drop both outcome columns and every cluster-label column.
X_km3_all_cl0 = _subset.drop(
    columns=["hdremit.all", "mdpercadj"]
    + [f"km_cl_all_{k}" for k in range(2, 8)]
    + [f"sc_cl_all_{k}" for k in range(2, 8)]
)


In [None]:
# Sanity check: outcome and predictors must have the same number of rows.
_same_length = len(Y_km3_all_cl0) == len(X_km3_all_cl0)
print(f"Same Length {len(Y_km3_all_cl0)}" if _same_length else "Not Mathcing")

# Cluster size.
print(f"The cluster contains {len(X_km3_all_cl0)} patients ")

In [None]:
X_km3_all_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_km3_all_cl0.value_counts(normalize=True)
##Very Imbalanced

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 within this cluster; a
# starting point for scale_pos_weight.
counter = Counter(Y_km3_all_cl0)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance): 10 evenly spaced values from the
## class ratio `estimate` up to 20, truncated to integers. The lower bound uses
## the computed `estimate` rather than a hand-typed rounded constant, so it
## cannot go stale if the cluster membership changes.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used: 200, 400, ..., 2000
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree: 2, 4, ..., 20
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used (fixed; single-element candidate list)
xgb_objective = ['reg:logistic']

# Evaluation metric used (fixed; single-element candidate list)
xgb_eval_metric = ['aucpr']

# Create the grid handed to RandomizedSearchCV
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base classifier. The seed has to be passed as `random_state`; the previous
# spelling `randome_state` is not a recognised argument, so no seed was applied.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_all_k3_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cvresults_all_k3_cl0 = cross_validate(xgbrndm_all_k3_cl0,X_km3_all_cl0,Y_km3_all_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_k3_cl0

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 3 decimals.
for _label, _key in (
    ("Mean Accuracy = ", "test_accuracy"),
    ("Mean AUC = ", "test_roc_auc"),
    ("Mean Precision = ", "test_precision"),
    ("Mean Recall = ", "test_recall"),
):
    print(f"{_label}{round(mean(cvresults_all_k3_cl0[_key]),3)}")

In [None]:
m_all_k3_cl0 = xgbrndm_all_k3_cl0.fit(X_km3_all_cl0,Y_km3_all_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: pair each predictor name with the importance score of
# the best estimator found by the search.
_best_model = m_all_k3_cl0.best_estimator_
importances_all_k3_cl0 = pd.DataFrame({
    'fscore': _best_model.feature_importances_,
    'varname': X_km3_all_cl0.columns.tolist(),
})
importances_all_k3_cl0

### b. Escitalopram

#### ESCITALOPRAM ; w/o cluster var

In [None]:
df_esc.head()

In [None]:
# Predictors for the escitalopram sample: remove both outcome columns and
# every K-means / spectral cluster-label column (k = 2..7).
_cluster_cols = [f"km_cl_esc_{k}" for k in range(2, 8)] + [f"sc_cl_esc_{k}" for k in range(2, 8)]
X_sup_esc = df_esc.drop(columns=_cluster_cols + ["mdpercadj", "hdremit.all"])
# Target: the `hdremit.all` column.
Y_sup_esc = df_esc.loc[:, "hdremit.all"]

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 in the target; used as the
# lower bound of the scale_pos_weight values searched later.
counter = Counter(Y_sup_esc)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Candidate values for each XGBoost hyperparameter sampled by the random search.

## scale_pos_weight (class imbalance): 10 evenly spaced values from `estimate`
## up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## Number of trees: 200, 400, ..., 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## Maximum tree depth: 2, 4, ..., 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

# Objective and evaluation metric are fixed (single-element candidate lists).
xgb_objective = ['reg:logistic']
xgb_eval_metric = ['aucpr']

# Search space handed to RandomizedSearchCV.
xgb_grid = dict(
    n_estimators=xgb_n_estimators,
    max_depth=xgb_max_depth,
    min_child_weight=xgb_min_child_weight,
    scale_pos_weight=xgb_scale_pos_weight,
    eta=xgb_eta,
    objective=xgb_objective,
    eval_metric=xgb_eval_metric,
)


In [None]:
# Base classifier with a fixed seed.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_esc = RandomizedSearchCV(
    xgb,
    xgb_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=random_state,
    return_train_score=True,
    n_jobs=-1,
)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cv_result_esc = cross_validate(
    xgbrndm_esc,
    X_sup_esc,
    Y_sup_esc,
    cv=5,
    scoring=('accuracy', 'precision', 'recall', 'roc_auc'),
)

In [None]:
cv_result_esc

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 3 decimals.
for _label, _key in (
    ("Mean Accuracy = ", "test_accuracy"),
    ("Mean AUC = ", "test_roc_auc"),
    ("Mean Precision =  ", "test_precision"),
    ("Mean Recall =  ", "test_recall"),
):
    print(f"{_label}{round(mean(cv_result_esc[_key]),3)}")

#### K=2; w/cluster var

In [None]:
df_esc.head()

In [None]:
# Predictors for the escitalopram sample, keeping only the k=2 K-means label
# (`km_cl_esc_2`): drop the outcomes and every other cluster-label column.
_other_cluster_cols = [f"km_cl_esc_{k}" for k in range(3, 8)] + [f"sc_cl_esc_{k}" for k in range(2, 8)]
X_sup_esc_k2 = df_esc.drop(columns=_other_cluster_cols + ["mdpercadj", "hdremit.all"])
# Target: the `hdremit.all` column.
Y_sup_esc_k2 = df_esc.loc[:, "hdremit.all"]

In [None]:
print(len(X_sup_esc_k2))
print(len(Y_sup_esc_k2))

In [None]:
Y_sup_esc_k2.value_counts(normalize=True)

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 in the target; used as the
# lower bound of the scale_pos_weight values searched later.
counter = Counter(Y_sup_esc_k2)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Candidate values for each XGBoost hyperparameter sampled by the random search.

## scale_pos_weight (class imbalance): 10 evenly spaced values from `estimate`
## up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## Number of trees: 200, 400, ..., 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## Maximum tree depth: 2, 4, ..., 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

# Objective and evaluation metric are fixed (single-element candidate lists).
xgb_objective = ['reg:logistic']
xgb_eval_metric = ['aucpr']

# Search space handed to RandomizedSearchCV.
xgb_grid = dict(
    n_estimators=xgb_n_estimators,
    max_depth=xgb_max_depth,
    min_child_weight=xgb_min_child_weight,
    scale_pos_weight=xgb_scale_pos_weight,
    eta=xgb_eta,
    objective=xgb_objective,
    eval_metric=xgb_eval_metric,
)


In [None]:
# Base classifier with a fixed seed.
xgb = XGBClassifier(random_state=123)

# Randomized search: 100 configurations sampled from xgb_grid, each scored
# with 5-fold CV, running on all cores.
xgbrndm_esc_k2 = RandomizedSearchCV(
    xgb,
    xgb_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=random_state,
    return_train_score=True,
    n_jobs=-1,
)

# Outer 5-fold CV wrapped around the whole search, scored on four metrics.
cv_results_esc_k2 = cross_validate(
    xgbrndm_esc_k2,
    X_sup_esc_k2,
    Y_sup_esc_k2,
    cv=5,
    scoring=('accuracy', 'precision', 'recall', 'roc_auc'),
)

In [None]:
cv_results_esc_k2

In [None]:
# Mean of each test metric across the outer CV folds, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cv_results_esc_k2['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_esc_k2['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_esc_k2['test_precision']),3)}")
# This line reports recall, so it must be labelled as recall (it was
# previously printed under a second "Mean Precision" label).
print(f"Mean Recall = {round(mean(cv_results_esc_k2['test_recall']),3)}")

In [None]:
m_esc_k2 = xgbrndm_esc_k2.fit(X_sup_esc_k2,Y_sup_esc_k2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: pair each predictor name with the importance score of
# the best estimator found by the search.
_best_model = m_esc_k2.best_estimator_
importance_esc_k2 = pd.DataFrame({
    'fscore': _best_model.feature_importances_,
    'varname': X_sup_esc_k2.columns.tolist(),
})
importance_esc_k2

#### K=3; w/ cluster var

In [None]:
df_esc.head()

In [None]:
# Predictors for the escitalopram sample, keeping only the k=3 K-means label
# (`km_cl_esc_3`): drop the outcomes and every other cluster-label column.
_other_cluster_cols = [f"km_cl_esc_{k}" for k in range(2, 8) if k != 3] + [f"sc_cl_esc_{k}" for k in range(2, 8)]
X_sup_esc_k3 = df_esc.drop(columns=_other_cluster_cols + ["mdpercadj", "hdremit.all"])
# Target: the `hdremit.all` column.
Y_sup_esc_k3 = df_esc.loc[:, "hdremit.all"]

In [None]:
print(len(X_sup_esc_k3))
print(len(Y_sup_esc_k3))

In [None]:
Y_sup_esc_k3.value_counts(normalize=True)

In [None]:
from collections import Counter
# Count of class 0 divided by count of class 1 in the target; a starting
# point for scale_pos_weight.
counter = Counter(Y_sup_esc_k3)
_n_class0, _n_class1 = counter[0], counter[1]
estimate = _n_class0 / _n_class1
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance): 10 evenly spaced values from the
## class ratio `estimate` up to 20, truncated to integers. The lower bound uses
## the computed `estimate` rather than a hand-typed rounded constant, so it
## cannot go stale if the sample changes.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used: 200, 400, ..., 2000
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree: 2, 4, ..., 20
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## min_child_weight: 1, 2, ..., 10
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate: six evenly spaced values from 0.1 to 0.6
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used (fixed; single-element candidate list)
xgb_objective = ['reg:logistic']

# Evaluation metric used (fixed; single-element candidate list)
xgb_eval_metric = ['aucpr']

# Create the grid handed to RandomizedSearchCV
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner with a fixed seed.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
# NOTE(review): `random_state` is a variable defined outside this cell — confirm it is set before running.
xgbrndm_esc_k3 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = random_state, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV),
# scored on accuracy, precision, recall and ROC AUC.
cv_results_esc_k3 = cross_validate(xgbrndm_esc_k3,X_sup_esc_k3,Y_sup_esc_k3,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_esc_k3

In [None]:
# Mean of each test metric across the outer folds, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cv_results_esc_k3['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_esc_k3['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_esc_k3['test_precision']),3)}")
# Recall was previously printed under the label "Mean Precision".
print(f"Mean Recall = {round(mean(cv_results_esc_k3['test_recall']),3)}")

In [None]:
m_esc_k3 = xgbrndm_esc_k3.fit(X_sup_esc_k3,Y_sup_esc_k3)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importance_esc_k3 = pd.DataFrame({'fscore':m_esc_k3.best_estimator_.feature_importances_,
                           'varname': list(X_sup_esc_k3)})
importance_esc_k3

#### K = 2; w/o cluster var

##### Cluster 1

In [None]:
df_esc.head()

In [None]:
# Build X / Y for K-means (K=2) cluster 1 of the escitalopram sample.
# Rows assigned to cluster 1:
X_km2_esc_cl1 = df_esc[df_esc['km_cl_esc_2'] == 1]
# Outcome vector for those rows:
Y_km2_esc_cl1 = X_km2_esc_cl1["hdremit.all"]
# Predictors: remove both outcome columns and every cluster-label column in one pass.
X_km2_esc_cl1 = X_km2_esc_cl1.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6",
    "sc_cl_esc_7", "km_cl_esc_2",
])


In [None]:
if len(Y_km2_esc_cl1) == len(X_km2_esc_cl1):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(Y_km2_esc_cl1))  

In [None]:
X_km2_esc_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_km2_esc_cl1.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km2_esc_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Search space for the randomized XGBoost hyperparameter search.

## Positive-class weight (class imbalance): 10 points from 1 to 20, truncated to integers
xgb_scale_pos_weight = np.linspace(1, 20, 10).astype(int).tolist()

## Number of trees
xgb_n_estimators = np.linspace(200, 2000, 10).astype(int).tolist()

## Maximum tree depth
xgb_max_depth = np.linspace(2, 20, 10).astype(int).tolist()

## Minimum child weight per node
xgb_min_child_weight = np.linspace(1, 10, 10).astype(int).tolist()

## Learning rate
xgb_eta = list(np.linspace(0.1, 0.6, 6))

# Learning objective and evaluation metric (single-option lists)
xgb_objective = ['reg:logistic']
xgb_eval_metric = ['aucpr']

# Assemble the grid, keyed by estimator parameter name
xgb_grid = dict(n_estimators=xgb_n_estimators,
                max_depth=xgb_max_depth,
                min_child_weight=xgb_min_child_weight,
                scale_pos_weight=xgb_scale_pos_weight,
                eta=xgb_eta,
                objective=xgb_objective,
                eval_metric=xgb_eval_metric)


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was misspelled `randome_state`, which is not the
# seed parameter, so the intended seed of 123 was never applied.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
xgbrndm_esc_k2_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV).
cvresults_esc_k2_cl1 = cross_validate(xgbrndm_esc_k2_cl1,X_km2_esc_cl1,Y_km2_esc_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_k2_cl1

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_esc_k2_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_k2_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_k2_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_k2_cl1['test_recall']),3)}")

In [None]:
m_esc_k2_cl1 = xgbrndm_esc_k2_cl1.fit(X_km2_esc_cl1,Y_km2_esc_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_esc_k2_cl1 = pd.DataFrame({'fscore':m_esc_k2_cl1.best_estimator_.feature_importances_,
                                     'varname': list(X_km2_esc_cl1)})
importances_esc_k2_cl1

##### Cluster 0

In [None]:
df_esc.head()

In [None]:
# Build X / Y for K-means (K=2) cluster 0 of the escitalopram sample.
# Rows assigned to cluster 0:
X_km2_esc_cl0 = df_esc[df_esc['km_cl_esc_2'] == 0]
# Outcome vector for those rows:
Y_km2_esc_cl0 = X_km2_esc_cl0["hdremit.all"]
# Predictors: remove both outcome columns and every cluster-label column in one pass.
X_km2_esc_cl0 = X_km2_esc_cl0.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6",
    "sc_cl_esc_7", "km_cl_esc_2",
])


In [None]:
if len(Y_km2_esc_cl0) == len(X_km2_esc_cl0):
  print("Same Length")
else:
  print("Not Mathcing")

In [None]:
X_km2_esc_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_km2_esc_cl0.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km2_esc_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Hyperparameter search space sampled by the randomized XGBoost search.

## Positive-class weight (for class imbalance): 10 evenly spaced values from
## 2.19 (presumably the negative/positive ratio printed by the previous cell)
## up to 20. Kept as floats rounded to 2 d.p.; truncating with int() turned
## the lower bound into 2, so the estimated ratio itself was never searched.
xgb_scale_pos_weight = [round(float(x), 2) for x in np.linspace(2.19, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum child weight required in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid (keys are the estimator parameter names to sample)
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was misspelled `randome_state`, which is not the
# seed parameter, so the intended seed of 123 was never applied.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
xgbrndm_esc_k2_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV).
cvresults_esc_k2_cl0 = cross_validate(xgbrndm_esc_k2_cl0,X_km2_esc_cl0,Y_km2_esc_cl0,cv=5,
                                      scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_k2_cl0

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_esc_k2_cl0['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_k2_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_k2_cl0['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_k2_cl0['test_recall']),3)}")

In [None]:
m_esc_k2_cl0 = xgbrndm_esc_k2_cl0.fit(X_km2_esc_cl0,Y_km2_esc_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_esc_k2_cl0 = pd.DataFrame({'fscore':m_esc_k2_cl0.best_estimator_.feature_importances_,
                                      'varname': list(X_km2_esc_cl0)})
importances_esc_k2_cl0

#### K = 3; w/o cluster var

##### Cluster 2

In [None]:
df_esc.head()

In [None]:
# Build X / Y for K-means (K=3) cluster 2 of the escitalopram sample.
# Rows assigned to cluster 2:
X_km3_esc_cl2 = df_esc[df_esc['km_cl_esc_3'] == 2]
# Outcome vector for those rows:
Y_km3_esc_cl2 = X_km3_esc_cl2["hdremit.all"]
# Predictors: remove both outcome columns and every cluster-label column in one pass.
X_km3_esc_cl2 = X_km3_esc_cl2.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6",
    "sc_cl_esc_7", "km_cl_esc_2",
])


In [None]:
if len(Y_km3_esc_cl2) == len(X_km3_esc_cl2):
  print("Same Length")
else:
  print("Not Matching")


print(len(Y_km3_esc_cl2))  

In [None]:
X_km3_esc_cl2.head()

In [None]:
# Checking for Imbalance: 
Y_km3_esc_cl2.value_counts(normalize=True)
## roughly balanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km3_esc_cl2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Hyperparameter search space sampled by the randomized XGBoost search.

## Positive-class weight (for class imbalance): 10 evenly spaced values from
## `estimate` (the negative/positive ratio computed in the previous cell) up
## to 20. Kept as floats rounded to 2 d.p.; truncating with int() would turn
## a ratio below 1 into 0, i.e. zero weight on the positive class.
xgb_scale_pos_weight = [round(float(x), 2) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum child weight required in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid (keys are the estimator parameter names to sample)
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was misspelled `randome_state`, which is not the
# seed parameter, so the intended seed of 123 was never applied.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
xgbrndm_esc_k3_cl2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV).
cvresults_esc_k3_cl2 = cross_validate(xgbrndm_esc_k3_cl2,X_km3_esc_cl2,Y_km3_esc_cl2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_k3_cl2

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_esc_k3_cl2['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_k3_cl2['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_k3_cl2['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_k3_cl2['test_recall']),3)}")

In [None]:
m_esc_k3_cl2 = xgbrndm_esc_k3_cl2.fit(X_km3_esc_cl2,Y_km3_esc_cl2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_esc_k3_cl2 = pd.DataFrame({'fscore':m_esc_k3_cl2.best_estimator_.feature_importances_,
                                       'varname': list(X_km3_esc_cl2)})
importances_esc_k3_cl2

##### Cluster 2

In [None]:
df_esc.head()

In [None]:
# Build X / Y for K-means (K=3) cluster 2 of the escitalopram sample.
# Rows assigned to cluster 2:
X_km3_esc_cl2 = df_esc[df_esc['km_cl_esc_3'] == 2]
# Outcome vector for cluster 2:
Y_km3_esc_cl2 = X_km3_esc_cl2["hdremit.all"]
# Predictors: remove both outcome columns and every cluster-label column in one pass.
X_km3_esc_cl2 = X_km3_esc_cl2.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6",
    "sc_cl_esc_7", "km_cl_esc_2",
])


In [None]:
if len(Y_km3_esc_cl2) == len(X_km3_esc_cl2):
  print("Same Length")
else:
  print("Not Mathcing")


print(len(Y_km3_esc_cl2))  

In [None]:
X_km3_esc_cl2.head()

In [None]:
# Checking for Imbalance: 
Y_km3_esc_cl2.value_counts(normalize=True)
## roughly balanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km3_esc_cl2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Hyperparameter search space sampled by the randomized XGBoost search.

## Positive-class weight (for class imbalance): 10 evenly spaced values from
## 0.72 (presumably the negative/positive ratio printed by the previous cell)
## up to 20. Kept as floats rounded to 2 d.p.; truncating with int() turned
## the lower bound into 0, which put scale_pos_weight=0 (zero weight on the
## positive class) into the search space.
xgb_scale_pos_weight = [round(float(x), 2) for x in np.linspace(0.72, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum child weight required in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid (keys are the estimator parameter names to sample)
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was misspelled `randome_state`, which is not the
# seed parameter, so the intended seed of 123 was never applied.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
xgbrndm_esc_k3_cl2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV).
cvresults_esc_k3_cl2 = cross_validate(xgbrndm_esc_k3_cl2,X_km3_esc_cl2,Y_km3_esc_cl2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_k3_cl2

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_esc_k3_cl2['test_accuracy']),2)}")
print(f"Mean AUC = {round(mean(cvresults_esc_k3_cl2['test_roc_auc']),2)}")
print(f"Mean Precision = {round(mean(cvresults_esc_k3_cl2['test_precision']),2)}")
print(f"Mean Recall = {round(mean(cvresults_esc_k3_cl2['test_recall']),2)}")

In [None]:
m_esc_k3_cl2 = xgbrndm_esc_k3_cl2.fit(X_km3_esc_cl2,Y_km3_esc_cl2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_esc_k3_cl2 = pd.DataFrame({'fscore':m_esc_k3_cl2.best_estimator_.feature_importances_,
                                     'varname': list(X_km3_esc_cl2)})
importances_esc_k3_cl2

##### Cluster 1

In [None]:
df_esc.head()

In [None]:
# Build X / Y for K-means (K=3) cluster 1 of the escitalopram sample.
# Rows assigned to cluster 1:
X_km3_esc_cl1 = df_esc[df_esc['km_cl_esc_3'] == 1]
# Outcome vector for those rows:
Y_km3_esc_cl1 = X_km3_esc_cl1["hdremit.all"]
# Predictors: remove both outcome columns and every cluster-label column in one pass.
X_km3_esc_cl1 = X_km3_esc_cl1.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6",
    "sc_cl_esc_7", "km_cl_esc_2",
])


In [None]:
if len(Y_km3_esc_cl1) == len(X_km3_esc_cl1):
  print("Same Length")
else:
  print("Not Matching")


print(len(Y_km3_esc_cl1))  

In [None]:
X_km3_esc_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_km3_esc_cl1.value_counts(normalize=True)
## roughly balanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km3_esc_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Hyperparameter search space sampled by the randomized XGBoost search.

## Positive-class weight (for class imbalance): 10 evenly spaced values from
## 1.23 (presumably the negative/positive ratio printed by the previous cell)
## up to 20. Kept as floats rounded to 2 d.p.; truncating with int() turned
## the lower bound into 1, so the estimated ratio itself was never searched.
xgb_scale_pos_weight = [round(float(x), 2) for x in np.linspace(1.23, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum child weight required in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid (keys are the estimator parameter names to sample)
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was misspelled `randome_state`, which is not the
# seed parameter, so the intended seed of 123 was never applied.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
xgbrndm_esc_k3_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV).
cvresults_esc_k3_cl1 = cross_validate(xgbrndm_esc_k3_cl1,X_km3_esc_cl1,Y_km3_esc_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_k3_cl1

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_esc_k3_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_k3_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_k3_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_k3_cl1['test_recall']),3)}")

In [None]:
m_esc_k3_cl1 = xgbrndm_esc_k3_cl1.fit(X_km3_esc_cl1,Y_km3_esc_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_esc_k3_cl1 = pd.DataFrame({'fscore':m_esc_k3_cl1.best_estimator_.feature_importances_,
                                       'varname': list(X_km3_esc_cl1)})
importances_esc_k3_cl1

##### Cluster 0

In [None]:
df_esc.head()

In [None]:
# Build X / Y for K-means (K=3) cluster 0 of the escitalopram sample.
# Rows assigned to cluster 0:
X_km3_esc_cl0 = df_esc[df_esc['km_cl_esc_3'] == 0]
# Outcome vector for those rows:
Y_km3_esc_cl0 = X_km3_esc_cl0["hdremit.all"]
# Predictors: remove both outcome columns and every cluster-label column in one pass.
X_km3_esc_cl0 = X_km3_esc_cl0.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6",
    "sc_cl_esc_7", "km_cl_esc_2",
])


In [None]:
if len(Y_km3_esc_cl0) == len(X_km3_esc_cl0):
  print("Same Length")
else:
  print("Not Mathcing")


print(len(Y_km3_esc_cl0))  

In [None]:
X_km3_esc_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_km3_esc_cl0.value_counts(normalize=True)
##Very  imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km3_esc_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Hyperparameter search space sampled by the randomized XGBoost search.

## Positive-class weight (for class imbalance): 10 evenly spaced values from
## 3.13 (presumably the negative/positive ratio printed by the previous cell)
## up to 20. Kept as floats rounded to 2 d.p.; truncating with int() turned
## the lower bound into 3, so the estimated ratio itself was never searched.
xgb_scale_pos_weight = [round(float(x), 2) for x in np.linspace(3.13, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum child weight required in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid (keys are the estimator parameter names to sample)
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was misspelled `randome_state`, which is not the
# seed parameter, so the intended seed of 123 was never applied.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
xgbrndm_esc_k3_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV).
cvresults_esc_k3_cl0 = cross_validate(xgbrndm_esc_k3_cl0,X_km3_esc_cl0,Y_km3_esc_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_k3_cl0

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_esc_k3_cl0['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_k3_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_k3_cl0['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_k3_cl0['test_recall']),3)}")

In [None]:
m_esc_k3_cl0 = xgbrndm_esc_k3_cl0.fit(X_km3_esc_cl0,Y_km3_esc_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_esc_k3_cl0 = pd.DataFrame({'fscore':m_esc_k3_cl0.best_estimator_.feature_importances_,
                                     'varname': list(X_km3_esc_cl0)})
importances_esc_k3_cl0

### c. Nortriptyline

##### NORTRIPTYLINE; w/o cluster var

In [None]:
df_nor.head()

In [None]:
# Outcome vector: the `hdremit.all` column of the nortriptyline sample.
Y_sup_nor = df_nor["hdremit.all"]
# Predictors: drop both outcome columns and every cluster-label column
# (this model is fitted without any cluster variable).
X_sup_nor = df_nor.drop(columns=[
    "mdpercadj", "hdremit.all",
    "km_cl_nor_2", "km_cl_nor_3", "km_cl_nor_4", "km_cl_nor_5", "km_cl_nor_6", "km_cl_nor_7",
    "sc_cl_nor_2", "sc_cl_nor_3", "sc_cl_nor_4", "sc_cl_nor_5", "sc_cl_nor_6", "sc_cl_nor_7",
])

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sup_nor)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Hyperparameter search space sampled by the randomized XGBoost search.

## Positive-class weight (for class imbalance): 10 evenly spaced values from
## `estimate` (the negative/positive ratio computed in the previous cell) up
## to 20. Kept as floats rounded to 2 d.p.; truncating with int() dropped the
## estimated ratio itself from the grid (and would turn a ratio below 1 into 0).
xgb_scale_pos_weight = [round(float(x), 2) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum child weight required in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid (keys are the estimator parameter names to sample)
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
# Base learner with a fixed seed.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
# NOTE(review): `random_state` is a variable defined outside this cell — confirm it is set before running.
xgbrndm_nor = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = random_state, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV),
# scored on accuracy, precision, recall and ROC AUC.
cv_result_nor = cross_validate(xgbrndm_nor,X_sup_nor,Y_sup_nor,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_result_nor

In [None]:
print(f"Mean Accuracy = {round(mean(cv_result_nor['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_result_nor['test_roc_auc']),3)}")
print(f"Mean Precision =  {round(mean(cv_result_nor['test_precision']),3)}")
print(f"Mean Recall =  {round(mean(cv_result_nor['test_recall']),3)}")

##### NORTRIPTYLINE: K-Means K=2; w/ cluster var

In [None]:
df_nor.head()

In [None]:
# Outcome vector: the `hdremit.all` column of the nortriptyline sample.
Y_sup_nor_k2 = df_nor["hdremit.all"]
# Predictors: drop both outcome columns and every cluster-label column except
# `km_cl_nor_2`, which stays in as a feature for this model.
X_sup_nor_k2 = df_nor.drop(columns=[
    "mdpercadj", "hdremit.all",
    "km_cl_nor_3", "km_cl_nor_4", "km_cl_nor_5", "km_cl_nor_6", "km_cl_nor_7",
    "sc_cl_nor_2", "sc_cl_nor_3", "sc_cl_nor_4", "sc_cl_nor_5", "sc_cl_nor_6", "sc_cl_nor_7",
])

In [None]:
print(len(X_sup_nor_k2))
print(len(Y_sup_nor_k2))

In [None]:
Y_sup_nor_k2.value_counts(normalize=True)

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sup_nor_k2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Hyperparameter search space sampled by the randomized XGBoost search.

## Positive-class weight (for class imbalance): 10 evenly spaced values from
## 1.9 (presumably the negative/positive ratio printed by the previous cell)
## up to 20. Kept as floats rounded to 2 d.p.; truncating with int() turned
## the lower bound into 1, so the estimated ratio itself was never searched.
xgb_scale_pos_weight = [round(float(x), 2) for x in np.linspace(1.9, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum child weight required in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid (keys are the estimator parameter names to sample)
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
# Base learner with a fixed seed.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
# NOTE(review): `random_state` is a variable defined outside this cell — confirm it is set before running.
xgbrndm_nor_k2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = random_state, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV),
# scored on accuracy, precision, recall and ROC AUC.
cv_results_nor_k2 = cross_validate(xgbrndm_nor_k2,X_sup_nor_k2,Y_sup_nor_k2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_nor_k2

In [None]:
# Mean of each test metric across the outer folds, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cv_results_nor_k2['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_nor_k2['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_nor_k2['test_precision']),3)}")
# Recall was previously printed under the label "Mean Precision".
print(f"Mean Recall = {round(mean(cv_results_nor_k2['test_recall']),3)}")

In [None]:
m_nor_k2 = xgbrndm_nor_k2.fit(X_sup_nor_k2,Y_sup_nor_k2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importance_nor_k2 = pd.DataFrame({'fscore':m_nor_k2.best_estimator_.feature_importances_,
                           'varname': list(X_sup_nor_k2)})
importance_nor_k2

##### NORTRIPTYLINE: K-Means K=3; w/ cluster var

In [None]:
df_nor.head()

In [None]:
# Outcome vector: the `hdremit.all` column of the nortriptyline sample.
Y_sup_nor_k3 = df_nor["hdremit.all"]
# Predictors: drop both outcome columns and every cluster-label column except
# `km_cl_nor_3`, which stays in as a feature for this model.
X_sup_nor_k3 = df_nor.drop(columns=[
    "mdpercadj", "hdremit.all",
    "km_cl_nor_2", "km_cl_nor_4", "km_cl_nor_5", "km_cl_nor_6", "km_cl_nor_7",
    "sc_cl_nor_2", "sc_cl_nor_3", "sc_cl_nor_4", "sc_cl_nor_5", "sc_cl_nor_6", "sc_cl_nor_7",
])

In [None]:
print(len(X_sup_nor_k3))
print(len(Y_sup_nor_k3))

In [None]:
Y_sup_nor_k3.value_counts(normalize=True)

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sup_nor_k3)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Hyperparameter search space sampled by the randomized XGBoost search.

## Positive-class weight (for class imbalance): 10 evenly spaced values from
## 1.92 (presumably the negative/positive ratio printed by the previous cell)
## up to 20. Kept as floats rounded to 2 d.p.; truncating with int() turned
## the lower bound into 1, so the estimated ratio itself was never searched.
xgb_scale_pos_weight = [round(float(x), 2) for x in np.linspace(1.92, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum child weight required in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid (keys are the estimator parameter names to sample)
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner with a fixed seed.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search over xgb_grid (100 sampled settings, 5-fold CV each).
# NOTE(review): `random_state` is a variable defined outside this cell — confirm it is set before running.
xgbrndm_nor_k3 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = random_state, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole search object (nested CV),
# scored on accuracy, precision, recall and ROC AUC.
cv_results_nor_k3 = cross_validate(xgbrndm_nor_k3,X_sup_nor_k3,Y_sup_nor_k3,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_nor_k3

In [None]:
# Mean of each test metric across the outer folds, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cv_results_nor_k3['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_nor_k3['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_nor_k3['test_precision']),3)}")
# Recall was previously printed under the label "Mean Precision".
print(f"Mean Recall = {round(mean(cv_results_nor_k3['test_recall']),3)}")

In [None]:
m_nor_k3 = xgbrndm_nor_k3.fit(X_sup_nor_k3,Y_sup_nor_k3)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importance_nor_k3 = pd.DataFrame({'fscore':m_nor_k3.best_estimator_.feature_importances_,
                           'varname': list(X_sup_nor_k3)})
importance_nor_k3

##### NORTRIPTYLINE: K-Means K = 2; w/o cluster var

###### Cluster 1

In [None]:
df_nor.head()

In [None]:
# Build X / Y for K-means (K=2) cluster 1 of the nortriptyline sample.
# Rows assigned to cluster 1:
X_km2_nor_cl1 = df_nor[df_nor['km_cl_nor_2'] == 1]
# Outcome vector for those rows:
Y_km2_nor_cl1 = X_km2_nor_cl1["hdremit.all"]
# Predictors: remove both outcome columns and every cluster-label column in one pass.
X_km2_nor_cl1 = X_km2_nor_cl1.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_nor_3", "km_cl_nor_4", "km_cl_nor_5", "km_cl_nor_6", "km_cl_nor_7",
    "sc_cl_nor_2", "sc_cl_nor_3", "sc_cl_nor_4", "sc_cl_nor_5", "sc_cl_nor_6",
    "sc_cl_nor_7", "km_cl_nor_2",
])


In [None]:
if len(Y_km2_nor_cl1) == len(X_km2_nor_cl1):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(Y_km2_nor_cl1))  

In [None]:
X_km2_nor_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_km2_nor_cl1.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km2_nor_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
# Start the range at `estimate` (negative/positive ratio computed in the
# previous cell) instead of a hand-copied constant, so the grid follows the
# data if the cluster membership changes.
# NOTE(review): int() truncates every candidate, so a fractional estimate is
# floored before it enters the grid.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was previously misspelled "randome_state", so the
# intended seed was never set as random_state.
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV.
xgbrndm_nor_k2_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cvresults_nor_k2_cl1 = cross_validate(xgbrndm_nor_k2_cl1,X_km2_nor_cl1,Y_km2_nor_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_k2_cl1

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_k2_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_k2_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_k2_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_k2_cl1['test_recall']),3)}")

In [None]:
m_nor_k2_cl1 = xgbrndm_nor_k2_cl1.fit(X_km2_nor_cl1,Y_km2_nor_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_k2_cl1 = pd.DataFrame({'fscore':m_nor_k2_cl1.best_estimator_.feature_importances_,
                                     'varname': list(X_km2_nor_cl1)})
importances_nor_k2_cl1

###### Cluster 0


In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 0:
X_km2_nor_cl0 = df_nor.loc[df_nor['km_cl_nor_2'] ==0]
# Subsetting Outcome for Cluster 0:
Y_km2_nor_cl0 = X_km2_nor_cl0.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_km2_nor_cl0 = X_km2_nor_cl0.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_km2_nor_cl0 = X_km2_nor_cl0.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_km2_nor_cl0) == len(X_km2_nor_cl0):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(X_km2_nor_cl0))  

In [None]:
X_km2_nor_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_km2_nor_cl0.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km2_nor_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
# Start the range at `estimate` (negative/positive ratio computed in the
# previous cell) instead of a hand-copied constant, so the grid follows the
# data if the cluster membership changes.
# NOTE(review): int() truncates every candidate, so a fractional estimate is
# floored before it enters the grid.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was previously misspelled "randome_state", so the
# intended seed was never set as random_state.
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV.
xgbrndm_nor_k2_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cvresults_nor_k2_cl0 = cross_validate(xgbrndm_nor_k2_cl0,X_km2_nor_cl0,Y_km2_nor_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_k2_cl0

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_k2_cl0['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_k2_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_k2_cl0['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_k2_cl0['test_recall']),3)}")

In [None]:
m_nor_k2_cl0 = xgbrndm_nor_k2_cl0.fit(X_km2_nor_cl0,Y_km2_nor_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_k2_cl0 = pd.DataFrame({'fscore':m_nor_k2_cl0.best_estimator_.feature_importances_,
                                     'varname': list(X_km2_nor_cl0)})
importances_nor_k2_cl0

##### NORTRIPTYLINE: K-Means K = 3 w/o cluster var

###### Cluster 2

In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 2:
X_km3_nor_cl2 = df_nor.loc[df_nor['km_cl_nor_3'] == 2]
# Subsetting Outcome for Cluster 2:
Y_km3_nor_cl2 = X_km3_nor_cl2.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_km3_nor_cl2 = X_km3_nor_cl2.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_km3_nor_cl2 = X_km3_nor_cl2.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_km3_nor_cl2) == len(X_km3_nor_cl2):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(Y_km3_nor_cl2))  

In [None]:
X_km3_nor_cl2.head()

In [None]:
# Checking for Imbalance: 
Y_km3_nor_cl2.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km3_nor_cl2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
# Start the range at `estimate` (negative/positive ratio computed in the
# previous cell) instead of a hand-copied constant, so the grid follows the
# data if the cluster membership changes.
# NOTE(review): int() truncates every candidate, so a fractional estimate is
# floored before it enters the grid.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was previously misspelled "randome_state", so the
# intended seed was never set as random_state.
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV.
xgbrndm_nor_k3_cl2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cvresults_nor_k3_cl2 = cross_validate(xgbrndm_nor_k3_cl2,X_km3_nor_cl2,Y_km3_nor_cl2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_k3_cl2

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_k3_cl2['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_k3_cl2['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_k3_cl2['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_k3_cl2['test_recall']),3)}")

In [None]:
m_nor_k3_cl2 = xgbrndm_nor_k3_cl2.fit(X_km3_nor_cl2,Y_km3_nor_cl2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_k3_cl2 = pd.DataFrame({'fscore':m_nor_k3_cl2.best_estimator_.feature_importances_,
                                     'varname': list(X_km3_nor_cl2)})
importances_nor_k3_cl2

###### Cluster 1

In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 1:
X_km3_nor_cl1 = df_nor.loc[df_nor['km_cl_nor_3'] == 1]
# Subsetting Outcome for Cluster 1:
Y_km3_nor_cl1 = X_km3_nor_cl1.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_km3_nor_cl1 = X_km3_nor_cl1.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_km3_nor_cl1 = X_km3_nor_cl1.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_km3_nor_cl1) == len(X_km3_nor_cl1):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(Y_km3_nor_cl1))  

In [None]:
# Preview the Cluster 1 feature matrix built above (this cell previously
# displayed Cluster 2's matrix, a copy-paste slip).
X_km3_nor_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_km3_nor_cl1.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km3_nor_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
# Start the range at `estimate` (negative/positive ratio computed in the
# previous cell) instead of a hand-copied constant, so the grid follows the
# data if the cluster membership changes.
# NOTE(review): int() truncates every candidate, so a fractional estimate is
# floored before it enters the grid.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was previously misspelled "randome_state", so the
# intended seed was never set as random_state.
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV.
xgbrndm_nor_k3_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cvresults_nor_k3_cl1 = cross_validate(xgbrndm_nor_k3_cl1,X_km3_nor_cl1,Y_km3_nor_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_k3_cl1

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_k3_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_k3_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_k3_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_k3_cl1['test_recall']),3)}")

In [None]:
m_nor_k3_cl1 = xgbrndm_nor_k3_cl1.fit(X_km3_nor_cl1,Y_km3_nor_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_k3_cl1 = pd.DataFrame({'fscore':m_nor_k3_cl1.best_estimator_.feature_importances_,
                                     'varname': list(X_km3_nor_cl1)})
importances_nor_k3_cl1

###### Cluster 0

In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 0:
X_km3_nor_cl0 = df_nor.loc[df_nor['km_cl_nor_3'] == 0]
# Subsetting Outcome for Cluster 0:
Y_km3_nor_cl0 = X_km3_nor_cl0.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_km3_nor_cl0 = X_km3_nor_cl0.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_km3_nor_cl0 = X_km3_nor_cl0.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_km3_nor_cl0) == len(X_km3_nor_cl0):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(Y_km3_nor_cl0))  

In [None]:
X_km3_nor_cl0.head()

In [None]:
# Checking for Imbalance in the Cluster 0 outcome (this cell previously
# inspected Y_km3_nor_cl1, the Cluster 1 outcome, by copy-paste).
Y_km3_nor_cl0.value_counts(normalize=True)
## NOTE(review): the earlier "Imbalanced" remark was written against Cluster 1's
## proportions; re-check it against the Cluster 0 output.

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_km3_nor_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
# Start the range at `estimate` (negative/positive ratio computed in the
# previous cell) instead of a hand-copied constant, so the grid follows the
# data if the cluster membership changes.
# NOTE(review): int() truncates every candidate, so a fractional estimate is
# floored before it enters the grid.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was previously misspelled "randome_state", so the
# intended seed was never set as random_state.
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV.
xgbrndm_nor_k3_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cvresults_nor_k3_cl0 = cross_validate(xgbrndm_nor_k3_cl0,X_km3_nor_cl0,Y_km3_nor_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_k3_cl0

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_k3_cl0['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_k3_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_k3_cl0['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_k3_cl0['test_recall']),3)}")

In [None]:
m_nor_k3_cl0 = xgbrndm_nor_k3_cl0.fit(X_km3_nor_cl0,Y_km3_nor_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_k3_cl0 = pd.DataFrame({'fscore':m_nor_k3_cl0.best_estimator_.feature_importances_,
                                     'varname': list(X_km3_nor_cl0)})
importances_nor_k3_cl0

## ii. Spectral Clustering

### a. All Sample




In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_validate
from numpy import mean

#### K = 2 ; w/cluster var

In [None]:
df_all.head()

In [None]:
X_sup_all_sc2 = df_all.drop(["km_cl_all_2","km_cl_all_3","km_cl_all_4", "km_cl_all_5","km_cl_all_6","km_cl_all_7","mdpercadj","hdremit.all","sc_cl_all_3","sc_cl_all_4","sc_cl_all_5","sc_cl_all_6","sc_cl_all_7"],axis=1)
Y_sup_all_sc2 = df_all["hdremit.all"]

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sup_all_sc2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
# Base learner with a fixed seed.
# NOTE: this rebinds `xgb`, which the import cell of this section bound to the
# xgboost module (`import xgboost as xgb`).
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV. The search seed is the literal 123 used by every
# other search in this notebook section, rather than a bare `random_state`
# name that this cell does not define.
xgbrndm_all_sc2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cv_results_sc2 = cross_validate(xgbrndm_all_sc2,X_sup_all_sc2,Y_sup_all_sc2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_sc2

In [None]:
print(f"Mean Accuracy = {round(mean(cv_results_sc2['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_sc2['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_sc2['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cv_results_sc2['test_recall']),3)}")

In [None]:
m = xgbrndm_all_sc2.fit(X_sup_all_sc2,Y_sup_all_sc2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importance_all_sc2 = pd.DataFrame({'fscore':m.best_estimator_.feature_importances_,
                           'varname': list(X_sup_all_sc2)})
importance_all_sc2

#### K = 2 w/o cluster var

##### Cluster 1

In [None]:
df_all.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 1:
X_sc2_all_cl1 = df_all.loc[df_all['sc_cl_all_2'] == 1]
# Subsetting Outcome for Cluster 1:
Y_sc2_all_cl1 = X_sc2_all_cl1.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_sc2_all_cl1 = X_sc2_all_cl1.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_sc2_all_cl1 = X_sc2_all_cl1.drop(["km_cl_all_3","km_cl_all_4","km_cl_all_5","km_cl_all_6","km_cl_all_7",
                                    "sc_cl_all_2","sc_cl_all_3","sc_cl_all_4","sc_cl_all_5","sc_cl_all_6",
                                    "sc_cl_all_7","km_cl_all_2"],axis=1)


In [None]:
if len(Y_sc2_all_cl1) == len(X_sc2_all_cl1):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(X_sc2_all_cl1))

In [None]:
X_sc2_all_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_sc2_all_cl1.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sc2_all_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner. The keyword was previously misspelled "randome_state" and its
# value came from a bare `random_state` name this cell does not define; use the
# correctly spelled keyword with the literal seed the sibling cells use.
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV.
xgbrndm_all_sc2_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cvresults_all_sc2_cl1 = cross_validate(xgbrndm_all_sc2_cl1,X_sc2_all_cl1,Y_sc2_all_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_sc2_cl1

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_all_sc2_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_all_sc2_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_all_sc2_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_all_sc2_cl1['test_recall']),3)}")

In [None]:
m_all_sc2_cl1 = xgbrndm_all_sc2_cl1.fit(X_sc2_all_cl1,Y_sc2_all_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_all_sc2_cl1 = pd.DataFrame({'fscore':m_all_sc2_cl1.best_estimator_.feature_importances_,
                                     'varname': list(X_sc2_all_cl1)})
importances_all_sc2_cl1

##### Cluster 0

In [None]:
df_all.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 0 (rows whose sc_cl_all_2 label equals 0):
X_sc2_all_cl0 = df_all.loc[df_all['sc_cl_all_2'] == 0]
# Subsetting Outcome for Cluster 0:
Y_sc2_all_cl0 = X_sc2_all_cl0.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_sc2_all_cl0 = X_sc2_all_cl0.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping all cluster-label columns from X, including sc_cl_all_2 itself
# (it is constant within this subset)
X_sc2_all_cl0 = X_sc2_all_cl0.drop(["km_cl_all_3","km_cl_all_4","km_cl_all_5","km_cl_all_6","km_cl_all_7",
                                    "sc_cl_all_2","sc_cl_all_3","sc_cl_all_4","sc_cl_all_5","sc_cl_all_6",
                                    "sc_cl_all_7","km_cl_all_2"],axis=1)


In [None]:
if len(Y_sc2_all_cl0) == len(X_sc2_all_cl0):
  print("Same Length")
else:
  print("Not Mathcing")

In [None]:
len(Y_sc2_all_cl0)

In [None]:
X_sc2_all_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_sc2_all_cl0.value_counts(normalize=True)
##Only slightly imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sc2_all_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
# Base learner. The keyword was previously misspelled "randome_state", so the
# intended seed was never set as random_state.
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV.
xgbrndm_all_sc2_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cvresults_all_sc2_cl0 = cross_validate(xgbrndm_all_sc2_cl0, X_sc2_all_cl0, Y_sc2_all_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_sc2_cl0

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_all_sc2_cl0['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_all_sc2_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_all_sc2_cl0['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_all_sc2_cl0['test_recall']),3)}")

In [None]:
m_all_sc2_cl1 = xgbrndm_all_sc2_cl0.fit(X_sc2_all_cl1,Y_sc2_all_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_all_sc2_cl1 = pd.DataFrame({'fscore':m_all_sc2_cl1.best_estimator_.feature_importances_,
                                       'varname': list(X_sc2_all_cl1)})
importances_all_sc2_cl1

#### K = 3 ; w/cluster var

In [None]:
df_all.head()

In [None]:
X_sup_all_sc3 = df_all.drop(["km_cl_all_2","km_cl_all_3","km_cl_all_4", "km_cl_all_5","km_cl_all_6","km_cl_all_7","mdpercadj","hdremit.all","sc_cl_all_2","sc_cl_all_4","sc_cl_all_5","sc_cl_all_6","sc_cl_all_7"],axis=1)
Y_sup_all_sc3 = df_all["hdremit.all"]

In [None]:
if len(Y_sup_all_sc3) == len(X_sup_all_sc3):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(Y_sup_all_sc3))  

In [None]:
X_sup_all_sc3.head()

In [None]:
# Checking for Imbalance: 
Y_sup_all_sc3.value_counts(normalize=True)
##Imbalanced

In [None]:
print(len(Y_sup_all_sc3))

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sup_all_sc3)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
# Base learner. The keyword was previously misspelled "randome_state", so the
# intended seed was never set as random_state.
xgb = XGBClassifier(random_state=123)
# Randomized search: 100 configurations sampled from xgb_grid, each evaluated
# with an inner 5-fold CV.
xgbrndm_all_sc3 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer 5-fold CV wrapped around the whole search (nested CV); the result dict
# holds per-fold test_accuracy / test_precision / test_recall / test_roc_auc.
cv_results_all_sc3 = cross_validate(xgbrndm_all_sc3,X_sup_all_sc3,Y_sup_all_sc3,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_all_sc3

In [None]:
from numpy import mean
print(f"Mean Accuracy = {round(mean(cv_results_all_sc3['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_all_sc3['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_all_sc3['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cv_results_all_sc3['test_recall']),3)}")

In [None]:
m_all_sc3 = xgbrndm_all_sc3.fit(X_sup_all_sc3,Y_sup_all_sc3)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_all_sc3 = pd.DataFrame({'fscore':m_all_sc3.best_estimator_.feature_importances_,
                                     'varname': list(X_sup_all_sc3)})
importances_all_sc3

#### K = 3; w/o cluster var

##### Cluster 2

In [None]:
df_all.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 2 (rows whose sc_cl_all_3 label equals 2):
X_sc3_all_cl2 = df_all.loc[df_all['sc_cl_all_3'] == 2]
# Subsetting Outcome for Cluster 2:
Y_sc3_all_cl2 = X_sc3_all_cl2.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_sc3_all_cl2 = X_sc3_all_cl2.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping all cluster-label columns from X, including sc_cl_all_3 itself
# (it is constant within this subset)
X_sc3_all_cl2 = X_sc3_all_cl2.drop(["km_cl_all_3","km_cl_all_4","km_cl_all_5","km_cl_all_6","km_cl_all_7",
                                    "sc_cl_all_2","sc_cl_all_3","sc_cl_all_4","sc_cl_all_5","sc_cl_all_6",
                                    "sc_cl_all_7","km_cl_all_2"],axis=1)


In [None]:
if len(Y_sc3_all_cl2) == len(X_sc3_all_cl2):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(Y_sc3_all_cl2))  

In [None]:
X_sc3_all_cl2.head()

In [None]:
# Checking for Imbalance: 
Y_sc3_all_cl2.value_counts(normalize=True)
## Slightly Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sc3_all_cl2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_all_sc3_cl2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cvresults_all_sc3_cl2 = cross_validate(xgbrndm_all_sc3_cl2,X_sc3_all_cl2,Y_sc3_all_cl2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_sc3_cl2

In [None]:
# Report the mean of each outer-fold test metric, rounded to 3 decimals.
for _prefix, _metric in (("Mean Accuracy = ", "test_accuracy"),
                         ("Mean AUC = ", "test_roc_auc"),
                         ("Mean Precision = ", "test_precision"),
                         ("Mean Recall = ", "test_recall")):
  print(f"{_prefix}{round(mean(cvresults_all_sc3_cl2[_metric]),3)}")

In [None]:
# Fit the randomized search on all rows of this cluster; fit() returns the search object,
# whose best_estimator_ is read by the feature-importance cell.
m_all_sc3_cl2 = xgbrndm_all_sc3_cl2.fit(X_sc3_all_cl2,Y_sc3_all_cl2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: one row per predictor column, scored by the tuned model
importances_all_sc3_cl2 = pd.DataFrame(
    dict(fscore=m_all_sc3_cl2.best_estimator_.feature_importances_,
         varname=X_sc3_all_cl2.columns.tolist()))
importances_all_sc3_cl2

##### Cluster 1

In [None]:
df_all.head()

In [None]:
# Build X and Y for the patients assigned to cluster 1 of sc_cl_all_3
X_sc3_all_cl1 = df_all[df_all['sc_cl_all_3'] == 1]
# Outcome to predict
Y_sc3_all_cl1 = X_sc3_all_cl1["hdremit.all"]
# Remove both outcome columns and every cluster-assignment column from the predictors
X_sc3_all_cl1 = X_sc3_all_cl1.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_all_2", "km_cl_all_3", "km_cl_all_4", "km_cl_all_5", "km_cl_all_6", "km_cl_all_7",
    "sc_cl_all_2", "sc_cl_all_3", "sc_cl_all_4", "sc_cl_all_5", "sc_cl_all_6", "sc_cl_all_7",
])


In [None]:
# Sanity check: outcome vector and feature matrix must have the same number of rows.
if len(Y_sc3_all_cl1) == len(X_sc3_all_cl1):
  print(f"Same Length {len(Y_sc3_all_cl1)}")
else:
  print("Not Matching")

print(f"The cluster contains {len(X_sc3_all_cl1)} patients ")

In [None]:
X_sc3_all_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_sc3_all_cl1.value_counts(normalize=True)
## Roughly Balanced

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sc3_all_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates start at this cluster's own negative/positive ratio (`estimate`,
## computed in the preceding cell) instead of a pasted constant, and run up to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_all_sc3_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cvresults_all_sc3_cl1 = cross_validate(xgbrndm_all_sc3_cl1,X_sc3_all_cl1,Y_sc3_all_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_sc3_cl1

In [None]:
# Report the mean of each outer-fold test metric, rounded to 3 decimals.
for _prefix, _metric in (("Mean Accuracy = ", "test_accuracy"),
                         ("Mean AUC = ", "test_roc_auc"),
                         ("Mean Precision = ", "test_precision"),
                         ("Mean Recall = ", "test_recall")):
  print(f"{_prefix}{round(mean(cvresults_all_sc3_cl1[_metric]),3)}")

In [None]:
# Fit the randomized search on all rows of this cluster; fit() returns the search object,
# whose best_estimator_ is read by the feature-importance cell.
m_all_sc3_cl1 = xgbrndm_all_sc3_cl1.fit(X_sc3_all_cl1,Y_sc3_all_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: one row per predictor column, scored by the tuned model
importances_all_sc3_cl1 = pd.DataFrame(
    dict(fscore=m_all_sc3_cl1.best_estimator_.feature_importances_,
         varname=X_sc3_all_cl1.columns.tolist()))
importances_all_sc3_cl1

##### Cluster 0

In [None]:
df_all.head()

In [None]:
# Build X and Y for the patients assigned to cluster 0 of sc_cl_all_3
X_sc3_all_cl0 = df_all[df_all['sc_cl_all_3'] == 0]
# Outcome to predict
Y_sc3_all_cl0 = X_sc3_all_cl0["hdremit.all"]
# Remove both outcome columns and every cluster-assignment column from the predictors
X_sc3_all_cl0 = X_sc3_all_cl0.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_all_2", "km_cl_all_3", "km_cl_all_4", "km_cl_all_5", "km_cl_all_6", "km_cl_all_7",
    "sc_cl_all_2", "sc_cl_all_3", "sc_cl_all_4", "sc_cl_all_5", "sc_cl_all_6", "sc_cl_all_7",
])


In [None]:
# Sanity check: outcome vector and feature matrix must have the same number of rows.
if len(Y_sc3_all_cl0) == len(X_sc3_all_cl0):
  print(f"Same Length {len(Y_sc3_all_cl0)}")
else:
  print("Not Matching")

print(f"The cluster contains {len(X_sc3_all_cl0)} patients ")

In [None]:
X_sc3_all_cl0.head()

In [None]:
# Checking for Imbalance: class proportions of the outcome in this cluster
Y_sc3_all_cl0.value_counts(normalize=True)
## Roughly balanced

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sc3_all_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates span from the observed negative/positive ratio (`estimate`) to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_all_sc3_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cvresults_all_sc3_cl0 = cross_validate(xgbrndm_all_sc3_cl0,X_sc3_all_cl0,Y_sc3_all_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_all_sc3_cl0

In [None]:
# Report the mean of each outer-fold test metric, rounded to 3 decimals.
for _prefix, _metric in (("Mean Accuracy = ", "test_accuracy"),
                         ("Mean AUC = ", "test_roc_auc"),
                         ("Mean Precision = ", "test_precision"),
                         ("Mean Recall = ", "test_recall")):
  print(f"{_prefix}{round(mean(cvresults_all_sc3_cl0[_metric]),3)}")

In [None]:
# Fit this cluster's own search object (xgbrndm_all_sc3_cl0) on the Cluster 0 rows.
# fit() returns the search object itself, so fitting another cluster's search here
# would also overwrite that cluster's fitted model.
m_all_sc3_cl0 = xgbrndm_all_sc3_cl0.fit(X_sc3_all_cl0,Y_sc3_all_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance for Cluster 0 (K = 3): scores come from the Cluster 0 model and
# are paired with the column names of the Cluster 0 feature matrix.
importances_all_sc3_cl0 = pd.DataFrame({'fscore':m_all_sc3_cl0.best_estimator_.feature_importances_,
                                     'varname': list(X_sc3_all_cl0)})
importances_all_sc3_cl0

### b. Escitalopram



In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_validate
from numpy import mean

#### K = 2 ; w/cluster var

In [None]:
df_esc.head()

In [None]:
# Predictors: drop both outcome columns and every cluster assignment except sc_cl_esc_2,
# which stays in X as the cluster variable for this model.
X_sup_esc_sc2 = df_esc.drop(columns=[
    "mdpercadj", "hdremit.all",
    "km_cl_esc_2", "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6", "sc_cl_esc_7",
])
# Outcome to predict
Y_sup_esc_sc2 = df_esc.loc[:, "hdremit.all"]

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sup_esc_sc2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
Y_sup_esc_sc2.value_counts(normalize=True)

In [None]:
print(len(X_sup_esc_sc2))
print(len(Y_sup_esc_sc2))

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates span from the observed negative/positive ratio (`estimate`) to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
# Base learner with a fixed seed for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
# The sampler seed is the literal 123, as in the sibling search cells, so the cell does
# not depend on a `random_state` global being defined elsewhere in the notebook.
xgbrndm_esc_sc2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cv_results_sc2 = cross_validate(xgbrndm_esc_sc2,X_sup_esc_sc2,Y_sup_esc_sc2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_sc2

In [None]:
# Report the mean of each outer-fold test metric, rounded to 3 decimals.
for _prefix, _metric in (("Mean Accuracy = ", "test_accuracy"),
                         ("Mean AUC = ", "test_roc_auc"),
                         ("Mean Precision = ", "test_precision"),
                         ("Mean Recall =  ", "test_recall")):
  print(f"{_prefix}{round(mean(cv_results_sc2[_metric]),3)}")

In [None]:
# Fit the randomized search on the full escitalopram sample; fit() returns the search object,
# whose best_estimator_ is read by the feature-importance cell.
m = xgbrndm_esc_sc2.fit(X_sup_esc_sc2,Y_sup_esc_sc2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: one row per predictor column, scored by the tuned model
importance_esc_sc2 = pd.DataFrame(
    dict(fscore=m.best_estimator_.feature_importances_,
         varname=X_sup_esc_sc2.columns.tolist()))
importance_esc_sc2

#### K = 2 w/o cluster var

##### Cluster 1

In [None]:
df_esc.head()

In [None]:
# Build X and Y for the patients assigned to cluster 1 of sc_cl_esc_2
X_sc2_esc_cl1 = df_esc[df_esc['sc_cl_esc_2'] == 1]
# Outcome to predict
Y_sc2_esc_cl1 = X_sc2_esc_cl1["hdremit.all"]
# Remove both outcome columns and every cluster-assignment column from the predictors
X_sc2_esc_cl1 = X_sc2_esc_cl1.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_2", "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6", "sc_cl_esc_7",
])


In [None]:
# Sanity check: outcome vector and feature matrix must have the same number of rows.
if len(Y_sc2_esc_cl1) == len(X_sc2_esc_cl1):
  print("Same Length")
else:
  print("Not Matching")

# Number of rows (patients) in this cluster, shown as the cell output
len(X_sc2_esc_cl1)

In [None]:
X_sc2_esc_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_sc2_esc_cl1.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sc2_esc_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates span from the observed negative/positive ratio (`estimate`) to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
# The literal 123 matches the sibling search cells and avoids relying on a
# `random_state` global defined elsewhere.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_esc_sc2_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cvresults_esc_sc2_cl1 = cross_validate(xgbrndm_esc_sc2_cl1,X_sc2_esc_cl1,Y_sc2_esc_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_sc2_cl1

In [None]:
# Mean of each outer-fold test metric, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cvresults_esc_sc2_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_sc2_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_sc2_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_sc2_cl1['test_recall']),3)}")

In [None]:
# Fit the randomized search on all rows of this cluster; fit() returns the search object,
# whose best_estimator_ is read by the feature-importance cell.
m_esc_sc2_cl1 = xgbrndm_esc_sc2_cl1.fit(X_sc2_esc_cl1,Y_sc2_esc_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: one row per predictor column, scored by the tuned model
importances_esc_sc2_cl1 = pd.DataFrame(
    dict(fscore=m_esc_sc2_cl1.best_estimator_.feature_importances_,
         varname=X_sc2_esc_cl1.columns.tolist()))
importances_esc_sc2_cl1

##### Cluster 0

In [None]:
df_esc.head()

In [None]:
# Build X and Y for the patients assigned to cluster 0 of sc_cl_esc_2
X_sc2_esc_cl0 = df_esc[df_esc['sc_cl_esc_2'] == 0]
# Outcome to predict
Y_sc2_esc_cl0 = X_sc2_esc_cl0["hdremit.all"]
# Remove both outcome columns and every cluster-assignment column from the predictors
X_sc2_esc_cl0 = X_sc2_esc_cl0.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_2", "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6", "sc_cl_esc_7",
])


In [None]:
# Sanity check: outcome vector and feature matrix must have the same number of rows.
if len(Y_sc2_esc_cl0) == len(X_sc2_esc_cl0):
  print("Same Length")
else:
  print("Not Matching")

In [None]:
X_sc2_esc_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_sc2_esc_cl0.value_counts(normalize=True)
##Only slightly imbalanced

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sc2_esc_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates span from the observed negative/positive ratio (`estimate`) to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_esc_sc2_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cvresults_esc_sc2_cl0 = cross_validate(xgbrndm_esc_sc2_cl0, X_sc2_esc_cl0, Y_sc2_esc_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_sc2_cl0

In [None]:
# Mean of each outer-fold test metric, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cvresults_esc_sc2_cl0['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_sc2_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_sc2_cl0['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_sc2_cl0['test_recall']),3)}")

In [None]:
m_esc_sc2_cl1 = xgbrndm_esc_sc2_cl0.fit(X_sc2_esc_cl1,Y_sc2_esc_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_esc_sc2_cl1 = pd.DataFrame({'fscore':m_esc_sc2_cl1.best_estimator_.feature_importances_,
                                       'varname': list(X_sc2_esc_cl1)})
importances_esc_sc2_cl1

#### K = 3 ; w/cluster var

In [None]:
df_esc.head()

In [None]:
# Predictors: drop both outcome columns and every cluster assignment except sc_cl_esc_3,
# which stays in X as the cluster variable for this model.
X_sup_esc_sc3 = df_esc.drop(columns=[
    "mdpercadj", "hdremit.all",
    "km_cl_esc_2", "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6", "sc_cl_esc_7",
])
# Outcome to predict
Y_sup_esc_sc3 = df_esc.loc[:, "hdremit.all"]

In [None]:
# Sanity check: outcome vector and feature matrix must have the same number of rows.
if len(Y_sup_esc_sc3) == len(X_sup_esc_sc3):
  print("Same Length")
else:
  print("Not Matching")

In [None]:
X_sup_esc_sc3.head()

In [None]:
# Checking for Imbalance: 
Y_sup_esc_sc3.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sup_esc_sc3)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates span from the observed negative/positive ratio (`estimate`) to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_esc_sc3 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cv_results_esc_sc3 = cross_validate(xgbrndm_esc_sc3,X_sup_esc_sc3,Y_sup_esc_sc3,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_esc_sc3

In [None]:
from numpy import mean
# Mean of each outer-fold test metric, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cv_results_esc_sc3['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_esc_sc3['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_esc_sc3['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cv_results_esc_sc3['test_recall']),3)}")

In [None]:
# Fit the randomized search on the full escitalopram sample; fit() returns the search object,
# whose best_estimator_ is read by the feature-importance cell.
m_esc_sc3 = xgbrndm_esc_sc3.fit(X_sup_esc_sc3,Y_sup_esc_sc3)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: one row per predictor column, scored by the tuned model
importances_esc_sc3 = pd.DataFrame(
    dict(fscore=m_esc_sc3.best_estimator_.feature_importances_,
         varname=X_sup_esc_sc3.columns.tolist()))
importances_esc_sc3

#### K = 3; w/o cluster var

##### Cluster 2

In [None]:
df_esc.head()

In [None]:
# Build X and Y for the patients assigned to cluster 2 of sc_cl_esc_3
X_sc3_esc_cl2 = df_esc[df_esc['sc_cl_esc_3'] == 2]
# Outcome to predict
Y_sc3_esc_cl2 = X_sc3_esc_cl2["hdremit.all"]
# Remove both outcome columns and every cluster-assignment column from the predictors
X_sc3_esc_cl2 = X_sc3_esc_cl2.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_2", "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6", "sc_cl_esc_7",
])


In [None]:
# Sanity check: outcome vector and feature matrix must have the same number of rows.
if len(Y_sc3_esc_cl2) == len(X_sc3_esc_cl2):
  print("Same Length")
else:
  print("Not Matching")

# Number of rows (patients) in this cluster
print(len(Y_sc3_esc_cl2))

In [None]:
X_sc3_esc_cl2.head()

In [None]:
# Checking for Imbalance: 
Y_sc3_esc_cl2.value_counts(normalize=True)
## Slightly Imbalanced

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sc3_esc_cl2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates start at this cluster's own negative/positive ratio (`estimate`,
## computed in the preceding cell) instead of a pasted constant, and run up to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_esc_sc3_cl2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cvresults_esc_sc3_cl2 = cross_validate(xgbrndm_esc_sc3_cl2,X_sc3_esc_cl2,Y_sc3_esc_cl2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_sc3_cl2

In [None]:
# Mean of each outer-fold test metric, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cvresults_esc_sc3_cl2['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_sc3_cl2['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_sc3_cl2['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_sc3_cl2['test_recall']),3)}")

In [None]:
# Fit the randomized search on all rows of this cluster; fit() returns the search object,
# whose best_estimator_ is read by the feature-importance cell.
m_esc_sc3_cl2 = xgbrndm_esc_sc3_cl2.fit(X_sc3_esc_cl2,Y_sc3_esc_cl2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: one row per predictor column, scored by the tuned model
importances_esc_sc3_cl2 = pd.DataFrame(
    dict(fscore=m_esc_sc3_cl2.best_estimator_.feature_importances_,
         varname=X_sc3_esc_cl2.columns.tolist()))
importances_esc_sc3_cl2

##### Cluster 1

In [None]:
df_esc.head()

In [None]:
# Build X and Y for the patients assigned to cluster 1 of sc_cl_esc_3
X_sc3_esc_cl1 = df_esc[df_esc['sc_cl_esc_3'] == 1]
# Outcome to predict
Y_sc3_esc_cl1 = X_sc3_esc_cl1["hdremit.all"]
# Remove both outcome columns and every cluster-assignment column from the predictors
X_sc3_esc_cl1 = X_sc3_esc_cl1.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_2", "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6", "sc_cl_esc_7",
])


In [None]:
# Sanity check: outcome vector and feature matrix must have the same number of rows.
if len(Y_sc3_esc_cl1) == len(X_sc3_esc_cl1):
  print(f"Same Length {len(Y_sc3_esc_cl1)}")
else:
  print("Not Matching")

print(f"The cluster contains {len(X_sc3_esc_cl1)} patients ")

In [None]:
X_sc3_esc_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_sc3_esc_cl1.value_counts(normalize=True)
## Roughly Balanced

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sc3_esc_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates start at this cluster's own negative/positive ratio (`estimate`,
## computed in the preceding cell) instead of a pasted constant, and run up to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_esc_sc3_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cvresults_esc_sc3_cl1 = cross_validate(xgbrndm_esc_sc3_cl1,X_sc3_esc_cl1,Y_sc3_esc_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_sc3_cl1

In [None]:
# Mean of each outer-fold test metric, rounded to 3 decimals.
print(f"Mean Accuracy = {round(mean(cvresults_esc_sc3_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_esc_sc3_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_sc3_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_esc_sc3_cl1['test_recall']),3)}")

In [None]:
# Fit the randomized search on all rows of this cluster; fit() returns the search object,
# whose best_estimator_ is read by the feature-importance cell.
m_esc_sc3_cl1 = xgbrndm_esc_sc3_cl1.fit(X_sc3_esc_cl1,Y_sc3_esc_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: one row per predictor column, scored by the tuned model
importances_esc_sc3_cl1 = pd.DataFrame(
    dict(fscore=m_esc_sc3_cl1.best_estimator_.feature_importances_,
         varname=X_sc3_esc_cl1.columns.tolist()))
importances_esc_sc3_cl1

##### Cluster 0

In [None]:
df_esc.head()

In [None]:
# Build X and Y for the patients assigned to cluster 0 of sc_cl_esc_3
X_sc3_esc_cl0 = df_esc[df_esc['sc_cl_esc_3'] == 0]
# Outcome to predict
Y_sc3_esc_cl0 = X_sc3_esc_cl0["hdremit.all"]
# Remove both outcome columns and every cluster-assignment column from the predictors
X_sc3_esc_cl0 = X_sc3_esc_cl0.drop(columns=[
    "hdremit.all", "mdpercadj",
    "km_cl_esc_2", "km_cl_esc_3", "km_cl_esc_4", "km_cl_esc_5", "km_cl_esc_6", "km_cl_esc_7",
    "sc_cl_esc_2", "sc_cl_esc_3", "sc_cl_esc_4", "sc_cl_esc_5", "sc_cl_esc_6", "sc_cl_esc_7",
])


In [None]:
# Sanity check: outcome vector and feature matrix must have the same number of rows.
if len(Y_sc3_esc_cl0) == len(X_sc3_esc_cl0):
  print(f"Same Length {len(Y_sc3_esc_cl0)}")
else:
  print("Not Matching")

print(f"The cluster contains {len(X_sc3_esc_cl0)} patients ")

In [None]:
X_sc3_esc_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_sc3_esc_cl0.value_counts(normalize=True)
##Very Imbalanced

In [None]:
from collections import Counter
# Class-imbalance heuristic: number of 0 outcomes divided by number of 1 outcomes,
# intended as the starting value of the scale_pos_weight grid.
counter = Counter(Y_sc3_esc_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Candidates span from the observed negative/positive ratio (`estimate`) to 20.
## Values stay floats: int() would drop the estimate itself from the grid and
## would turn a ratio below 1 into a weight of 0.
xgb_scale_pos_weight = [float(x) for x in np.linspace(estimate, 20, 10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum sum of instance weight needed in each child node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
# NOTE(review): XGBoost lists 'binary:logistic' for binary classification; confirm 'reg:logistic' is intended.
xgb_objective = ['reg:logistic']

# Evaluation metric used
xgb_eval_metric = ['aucpr']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight,
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base learner; random_state (previously misspelled) seeds XGBoost for reproducibility.
xgb = XGBClassifier(random_state=123)
# Inner loop: randomized search, 100 sampled settings from xgb_grid, 5-fold CV each.
xgbrndm_esc_sc3_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid,
                                       n_iter = 100, cv = 5, verbose = 2,
                                       random_state = 123,
                                       return_train_score = True,
                                       n_jobs = -1)

# Outer loop: 5-fold cross-validation of the whole tuning procedure (nested CV).
cvresults_esc_sc3_cl0 = cross_validate(xgbrndm_esc_sc3_cl0,X_sc3_esc_cl0,Y_sc3_esc_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_esc_sc3_cl0

In [None]:
# Mean of each outer-fold test metric (AUC to 3 decimals, the others to 2).
print(f"Mean Accuracy = {round(mean(cvresults_esc_sc3_cl0['test_accuracy']),2)}")
print(f"Mean AUC = {round(mean(cvresults_esc_sc3_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_esc_sc3_cl0['test_precision']),2)}")
print(f"Mean Recall = {round(mean(cvresults_esc_sc3_cl0['test_recall']),2)}")

In [None]:
# Fit this cluster's own search object (xgbrndm_esc_sc3_cl0) on the Cluster 0 rows.
# fit() returns the search object itself, so fitting another cluster's search here
# would also overwrite that cluster's fitted model.
m_esc_sc3_cl0 = xgbrndm_esc_sc3_cl0.fit(X_sc3_esc_cl0,Y_sc3_esc_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance: one row per predictor column, scored by the tuned model
importances_esc_sc3_cl0 = pd.DataFrame(
    dict(fscore=m_esc_sc3_cl0.best_estimator_.feature_importances_,
         varname=X_sc3_esc_cl0.columns.tolist()))
importances_esc_sc3_cl0

### c. Nortriptyline



In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_validate
from numpy import mean

#### K = 2 ; w/cluster var

In [None]:
df_nor.head()

In [None]:
X_sup_nor_sc2 = df_nor.drop(["km_cl_nor_2","km_cl_nor_3","km_cl_nor_4", "km_cl_nor_5","km_cl_nor_6","km_cl_nor_7","mdpercadj","hdremit.all","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6","sc_cl_nor_7"],axis=1)
Y_sup_nor_sc2 = df_nor["hdremit.all"]

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sup_nor_sc2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
Y_sup_nor_sc2.value_counts(normalize=True)

In [None]:
print(len(X_sup_nor_sc2))
print(len(Y_sup_nor_sc2))

In [None]:
import numpy as np
# Search space for the randomised XGBoost hyperparameter search.
# Each list holds the candidate values for one parameter.

## scale_pos_weight (class imbalance): 10 evenly spaced values from the
## class-ratio estimate up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## n_estimators (number of trees): 10 evenly spaced values from 200 to 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## max_depth (maximum number of levels in a tree): 10 values from 2 to 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 10 values from 1 to 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## eta (learning rate): 6 values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

## Learning objective (single fixed choice)
xgb_objective = ['reg:logistic']

## Evaluation metric (single fixed choice)
xgb_eval_metric = ['aucpr']

# Assemble the grid passed to RandomizedSearchCV
xgb_grid = {
    'n_estimators': xgb_n_estimators,
    'max_depth': xgb_max_depth,
    'min_child_weight': xgb_min_child_weight,
    'scale_pos_weight': xgb_scale_pos_weight,
    'eta': xgb_eta,
    'objective': xgb_objective,
    'eval_metric': xgb_eval_metric,
}


In [None]:
# Base XGBoost classifier with a fixed seed
xgb = XGBClassifier(random_state=123)
# Randomised hyperparameter search over xgb_grid: 100 sampled candidates,
# each scored with 5-fold CV. The seed is given as the literal 123, matching
# the other searches in this section, rather than through a separate
# `random_state` variable that this cell does not define.
xgbrndm_nor_sc2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = 123, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Nested cross-validation: the search is repeated inside each of the 5 outer
# folds; four metrics are scored per fold.
cv_results_nor_sc2 = cross_validate(xgbrndm_nor_sc2,X_sup_nor_sc2,Y_sup_nor_sc2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_nor_sc2

In [None]:
print(f"Mean Accuracy = {round(mean(cv_results_nor_sc2['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_nor_sc2['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_nor_sc2['test_precision']),3)}")
print(f"Mean Recall =  {round(mean(cv_results_nor_sc2['test_recall']),3)}")

In [None]:
m_nor_sc2 = xgbrndm_nor_sc2.fit(X_sup_nor_sc2,Y_sup_nor_sc2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importance_nor_sc2 = pd.DataFrame({'fscore':m_nor_sc2.best_estimator_.feature_importances_,
                           'varname': list(X_sup_nor_sc2)})
importance_nor_sc2

#### K = 2 w/o cluster var

##### Cluster 1

In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 1:
X_sc2_nor_cl1 = df_nor.loc[df_nor['sc_cl_nor_2'] == 1]
# Subsetting Outcome for Cluster 1:
Y_sc2_nor_cl1 = X_sc2_nor_cl1.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_sc2_nor_cl1 = X_sc2_nor_cl1.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_sc2_nor_cl1 = X_sc2_nor_cl1.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_sc2_nor_cl1) == len(X_sc2_nor_cl1):
  print("Same Length")
else:
  print("Not Mathcing")

len(X_sc2_nor_cl1)

In [None]:
X_sc2_nor_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_sc2_nor_cl1.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sc2_nor_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Search space for the randomised XGBoost hyperparameter search.
# Each list holds the candidate values for one parameter.

## scale_pos_weight (class imbalance): 10 evenly spaced values from the
## class-ratio estimate up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## n_estimators (number of trees): 10 evenly spaced values from 200 to 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## max_depth (maximum number of levels in a tree): 10 values from 2 to 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 10 values from 1 to 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## eta (learning rate): 6 values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

## Learning objective (single fixed choice)
xgb_objective = ['reg:logistic']

## Evaluation metric (single fixed choice)
xgb_eval_metric = ['aucpr']

# Assemble the grid passed to RandomizedSearchCV
xgb_grid = {
    'n_estimators': xgb_n_estimators,
    'max_depth': xgb_max_depth,
    'min_child_weight': xgb_min_child_weight,
    'scale_pos_weight': xgb_scale_pos_weight,
    'eta': xgb_eta,
    'objective': xgb_objective,
    'eval_metric': xgb_eval_metric,
}


In [None]:
from xgboost import XGBClassifier
# Base XGBoost classifier with a fixed seed. The keyword has to be spelled
# `random_state`; the seed is the literal 123 used by the other cells in this
# section rather than a variable this cell does not define.
xgb = XGBClassifier(random_state=123)
# Randomised hyperparameter search over xgb_grid: 100 sampled candidates,
# each scored with 5-fold CV, using all available cores (n_jobs=-1).
xgbrndm_nor_sc2_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = 123, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Nested cross-validation: the search is repeated inside each of the 5 outer
# folds; four metrics are scored per fold.
cvresults_nor_sc2_cl1 = cross_validate(xgbrndm_nor_sc2_cl1,X_sc2_nor_cl1,Y_sc2_nor_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_sc2_cl1

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_sc2_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_sc2_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_sc2_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_sc2_cl1['test_recall']),3)}")

In [None]:
m_nor_sc2_cl1 = xgbrndm_nor_sc2_cl1.fit(X_sc2_nor_cl1,Y_sc2_nor_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_sc2_cl1 = pd.DataFrame({'fscore':m_nor_sc2_cl1.best_estimator_.feature_importances_,
                                     'varname': list(X_sc2_nor_cl1)})
importances_nor_sc2_cl1

##### Cluster 0

In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 2:
X_sc2_nor_cl0 = df_nor.loc[df_nor['sc_cl_nor_2'] == 0]
# Subsetting Outcome for Cluster 1:
Y_sc2_nor_cl0 = X_sc2_nor_cl0.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_sc2_nor_cl0 = X_sc2_nor_cl0.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_sc2_nor_cl0 = X_sc2_nor_cl0.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_sc2_nor_cl0) == len(X_sc2_nor_cl0):
  print("Same Length")
else:
  print("Not Mathcing")

In [None]:
X_sc2_nor_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_sc2_nor_cl0.value_counts(normalize=True)
##Only slightly imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sc2_nor_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Search space for the randomised XGBoost hyperparameter search.
# Each list holds the candidate values for one parameter.

## scale_pos_weight (class imbalance): 10 evenly spaced values from the
## class-ratio estimate up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## n_estimators (number of trees): 10 evenly spaced values from 200 to 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## max_depth (maximum number of levels in a tree): 10 values from 2 to 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 10 values from 1 to 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## eta (learning rate): 6 values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

## Learning objective (single fixed choice)
xgb_objective = ['reg:logistic']

## Evaluation metric (single fixed choice)
xgb_eval_metric = ['aucpr']

# Assemble the grid passed to RandomizedSearchCV
xgb_grid = {
    'n_estimators': xgb_n_estimators,
    'max_depth': xgb_max_depth,
    'min_child_weight': xgb_min_child_weight,
    'scale_pos_weight': xgb_scale_pos_weight,
    'eta': xgb_eta,
    'objective': xgb_objective,
    'eval_metric': xgb_eval_metric,
}


In [None]:
# Base XGBoost classifier with a fixed seed. The keyword has to be spelled
# `random_state`; a misspelled keyword is not picked up as the seed.
xgb = XGBClassifier(random_state=123)
# Randomised hyperparameter search over xgb_grid: 100 sampled candidates,
# each scored with 5-fold CV, using all available cores (n_jobs=-1).
xgbrndm_nor_sc2_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = 123, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Nested cross-validation: the search is repeated inside each of the 5 outer
# folds; four metrics are scored per fold.
cvresults_nor_sc2_cl0 = cross_validate(xgbrndm_nor_sc2_cl0, X_sc2_nor_cl0, Y_sc2_nor_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_sc2_cl0

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_sc2_cl0['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_sc2_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_sc2_cl0['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_sc2_cl0['test_recall']),3)}")

In [None]:
m_nor_sc2_cl1 = xgbrndm_nor_sc2_cl0.fit(X_sc2_nor_cl1,Y_sc2_nor_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_sc2_cl1 = pd.DataFrame({'fscore':m_nor_sc2_cl1.best_estimator_.feature_importances_,
                                       'varname': list(X_sc2_nor_cl1)})
importances_nor_sc2_cl1

#### K = 3 ; w/cluster var

In [None]:
df_nor.head()

In [None]:
X_sup_nor_sc3 = df_nor.drop(["km_cl_nor_2","km_cl_nor_3","km_cl_nor_4", "km_cl_nor_5","km_cl_nor_6","km_cl_nor_7","mdpercadj","hdremit.all","sc_cl_nor_2","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6","sc_cl_nor_7"],axis=1)
Y_sup_nor_sc3 = df_nor["hdremit.all"]

In [None]:
if len(Y_sup_nor_sc3) == len(X_sup_nor_sc3):
  print("Same Length")
else:
  print("Not Mathcing")

In [None]:
X_sup_nor_sc3.head()

In [None]:
# Checking for Imbalance: 
Y_sup_nor_sc3.value_counts(normalize=True)
##Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sup_nor_sc3)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Search space for the randomised XGBoost hyperparameter search.
# Each list holds the candidate values for one parameter.

## scale_pos_weight (class imbalance): 10 evenly spaced values from the
## class-ratio estimate up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## n_estimators (number of trees): 10 evenly spaced values from 200 to 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## max_depth (maximum number of levels in a tree): 10 values from 2 to 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 10 values from 1 to 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## eta (learning rate): 6 values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

## Learning objective (single fixed choice)
xgb_objective = ['reg:logistic']

## Evaluation metric (single fixed choice)
xgb_eval_metric = ['aucpr']

# Assemble the grid passed to RandomizedSearchCV
xgb_grid = {
    'n_estimators': xgb_n_estimators,
    'max_depth': xgb_max_depth,
    'min_child_weight': xgb_min_child_weight,
    'scale_pos_weight': xgb_scale_pos_weight,
    'eta': xgb_eta,
    'objective': xgb_objective,
    'eval_metric': xgb_eval_metric,
}


In [None]:
# Base XGBoost classifier with a fixed seed. The keyword has to be spelled
# `random_state`; a misspelled keyword is not picked up as the seed.
xgb = XGBClassifier(random_state=123)
# Randomised hyperparameter search over xgb_grid: 100 sampled candidates,
# each scored with 5-fold CV, using all available cores (n_jobs=-1).
xgbrndm_nor_sc3 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = 123, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Nested cross-validation: the search is repeated inside each of the 5 outer
# folds; four metrics are scored per fold.
cv_results_nor_sc3 = cross_validate(xgbrndm_nor_sc3,X_sup_nor_sc3,Y_sup_nor_sc3,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cv_results_nor_sc3

In [None]:
from numpy import mean
print(f"Mean Accuracy = {round(mean(cv_results_nor_sc3['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cv_results_nor_sc3['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cv_results_nor_sc3['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cv_results_nor_sc3['test_recall']),3)}")

In [None]:
m_nor_sc3 = xgbrndm_nor_sc3.fit(X_sup_nor_sc3,Y_sup_nor_sc3)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_sc3 = pd.DataFrame({'fscore':m_nor_sc3.best_estimator_.feature_importances_,
                                     'varname': list(X_sup_nor_sc3)})
importances_nor_sc3

#### K = 3; w/o cluster var

##### Cluster 2

In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 1:
X_sc3_nor_cl2 = df_nor.loc[df_nor['sc_cl_nor_3'] == 2]
# Subsetting Outcome for Cluster 1:
Y_sc3_nor_cl2 = X_sc3_nor_cl2.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_sc3_nor_cl2 = X_sc3_nor_cl2.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_sc3_nor_cl2 = X_sc3_nor_cl2.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_sc3_nor_cl2) == len(X_sc3_nor_cl2):
  print("Same Length")
else:
  print("Not Mathcing")

print(len(Y_sc3_nor_cl2))  

In [None]:
X_sc3_nor_cl2.head()

In [None]:
# Checking for Imbalance: 
Y_sc3_nor_cl2.value_counts(normalize=True)
## Slightly Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sc3_nor_cl2)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Lower bound is the class-ratio `estimate` computed in the preceding cell
## (as in the other grids) instead of a hand-copied constant, so the grid
## follows the data if the cluster membership changes.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base XGBoost classifier with a fixed seed. The keyword has to be spelled
# `random_state`; a misspelled keyword is not picked up as the seed.
xgb = XGBClassifier(random_state=123)
# Randomised hyperparameter search over xgb_grid: 100 sampled candidates,
# each scored with 5-fold CV, using all available cores (n_jobs=-1).
xgbrndm_nor_sc3_cl2 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = 123, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Nested cross-validation: the search is repeated inside each of the 5 outer
# folds; four metrics are scored per fold.
cvresults_nor_sc3_cl2 = cross_validate(xgbrndm_nor_sc3_cl2,X_sc3_nor_cl2,Y_sc3_nor_cl2,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_sc3_cl2

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_sc3_cl2['test_accuracy']),2)}")
print(f"Mean AUC = {round(mean(cvresults_nor_sc3_cl2['test_roc_auc']),2)}")
print(f"Mean Precision = {round(mean(cvresults_nor_sc3_cl2['test_precision']),2)}")
print(f"Mean Recall = {round(mean(cvresults_nor_sc3_cl2['test_recall']),2)}")

In [None]:
m_nor_sc3_cl2 = xgbrndm_nor_sc3_cl2.fit(X_sc3_nor_cl2,Y_sc3_nor_cl2)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_sc3_cl2 = pd.DataFrame({'fscore':m_nor_sc3_cl2.best_estimator_.feature_importances_,
                                     'varname': list(X_sc3_nor_cl2)})
importances_nor_sc3_cl2

##### Cluster 1

In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 1:
X_sc3_nor_cl1 = df_nor.loc[df_nor['sc_cl_nor_3'] == 1]
# Subsetting Outcome for Cluster 1:
Y_sc3_nor_cl1 = X_sc3_nor_cl1.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_sc3_nor_cl1 = X_sc3_nor_cl1.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_sc3_nor_cl1 = X_sc3_nor_cl1.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_sc3_nor_cl1) == len(X_sc3_nor_cl1):
  print(f"Same Length {len(Y_sc3_nor_cl1)}")
else:
  print("Not Mathcing")

print(f"The cluster contains {len(X_sc3_nor_cl1)} patients ")  

In [None]:
X_sc3_nor_cl1.head()

In [None]:
# Checking for Imbalance: 
Y_sc3_nor_cl1.value_counts(normalize=True)
## Roughly Balanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sc3_nor_cl1)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Defining hyperparameter search grids

## Positive Scaling (for class imbalance)
## Lower bound is the class-ratio `estimate` computed in the preceding cell
## (as in the other grids) instead of a hand-copied constant, so the grid
## follows the data if the cluster membership changes.
xgb_scale_pos_weight = [int(x) for x in np.linspace(estimate,20,10)]

## Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

## Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

## Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

## Learning rate
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Learning objective used
xgb_objective = ['reg:logistic']

# Evaluation metric used 
xgb_eval_metric = ['aucpr'] 

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'scale_pos_weight': xgb_scale_pos_weight, 
            'eta': xgb_eta,
            'objective': xgb_objective,
            'eval_metric': xgb_eval_metric}


In [None]:
from xgboost import XGBClassifier
# Base XGBoost classifier with a fixed seed. The keyword has to be spelled
# `random_state`; a misspelled keyword is not picked up as the seed.
xgb = XGBClassifier(random_state=123)
# Randomised hyperparameter search over xgb_grid: 100 sampled candidates,
# each scored with 5-fold CV, using all available cores (n_jobs=-1).
xgbrndm_nor_sc3_cl1 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = 123, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Nested cross-validation: the search is repeated inside each of the 5 outer
# folds; four metrics are scored per fold.
cvresults_nor_sc3_cl1 = cross_validate(xgbrndm_nor_sc3_cl1,X_sc3_nor_cl1,Y_sc3_nor_cl1,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_sc3_cl1

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_sc3_cl1['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_sc3_cl1['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_sc3_cl1['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_sc3_cl1['test_recall']),3)}")

In [None]:
m_nor_sc3_cl1 = xgbrndm_nor_sc3_cl1.fit(X_sc3_nor_cl1,Y_sc3_nor_cl1)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_sc3_cl1 = pd.DataFrame({'fscore':m_nor_sc3_cl1.best_estimator_.feature_importances_,
                                     'varname': list(X_sc3_nor_cl1)})
importances_nor_sc3_cl1

##### Cluster 0

In [None]:
df_nor.head()

In [None]:
# Splitting data into X and Y
# Subsetting Cluster 0:
X_sc3_nor_cl0 = df_nor.loc[df_nor['sc_cl_nor_3'] == 0]
# Subsetting Outcome for Cluster 0:
Y_sc3_nor_cl0 = X_sc3_nor_cl0.loc[:,"hdremit.all"]
# Dropping Outcome Variables from X
X_sc3_nor_cl0 = X_sc3_nor_cl0.drop(["hdremit.all","mdpercadj"],axis=1)
# Dropping Other Cluster Vars from X
X_sc3_nor_cl0 = X_sc3_nor_cl0.drop(["km_cl_nor_3","km_cl_nor_4","km_cl_nor_5","km_cl_nor_6","km_cl_nor_7",
                                    "sc_cl_nor_2","sc_cl_nor_3","sc_cl_nor_4","sc_cl_nor_5","sc_cl_nor_6",
                                    "sc_cl_nor_7","km_cl_nor_2"],axis=1)


In [None]:
if len(Y_sc3_nor_cl0) == len(X_sc3_nor_cl0):
  print(f"Same Length {len(Y_sc3_nor_cl0)}")
else:
  print("Not Mathcing")

print(f"The cluster contains {len(X_sc3_nor_cl0)} patients ")  

In [None]:
X_sc3_nor_cl0.head()

In [None]:
# Checking for Imbalance: 
Y_sc3_nor_cl0.value_counts(normalize=True)
##Very Imbalanced

In [None]:
from collections import Counter
# estimating the scale_pos_weight value to include in grid search.
counter = Counter(Y_sc3_nor_cl0)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
import numpy as np
# Search space for the randomised XGBoost hyperparameter search.
# Each list holds the candidate values for one parameter.

## scale_pos_weight (class imbalance): 10 evenly spaced values from the
## class-ratio estimate up to 20, truncated to integers
xgb_scale_pos_weight = list(map(int, np.linspace(estimate, 20, 10)))

## n_estimators (number of trees): 10 evenly spaced values from 200 to 2000
xgb_n_estimators = list(map(int, np.linspace(200, 2000, 10)))

## max_depth (maximum number of levels in a tree): 10 values from 2 to 20
xgb_max_depth = list(map(int, np.linspace(2, 20, 10)))

## min_child_weight: 10 values from 1 to 10
xgb_min_child_weight = list(map(int, np.linspace(1, 10, 10)))

## eta (learning rate): 6 values from 0.1 to 0.6
xgb_eta = list(np.linspace(0.1, 0.6, 6))

## Learning objective (single fixed choice)
xgb_objective = ['reg:logistic']

## Evaluation metric (single fixed choice)
xgb_eval_metric = ['aucpr']

# Assemble the grid passed to RandomizedSearchCV
xgb_grid = {
    'n_estimators': xgb_n_estimators,
    'max_depth': xgb_max_depth,
    'min_child_weight': xgb_min_child_weight,
    'scale_pos_weight': xgb_scale_pos_weight,
    'eta': xgb_eta,
    'objective': xgb_objective,
    'eval_metric': xgb_eval_metric,
}


In [None]:
from xgboost import XGBClassifier
# Base XGBoost classifier with a fixed seed. The keyword has to be spelled
# `random_state`; a misspelled keyword is not picked up as the seed.
xgb = XGBClassifier(random_state=123)
# Randomised hyperparameter search over xgb_grid: 100 sampled candidates,
# each scored with 5-fold CV, using all available cores (n_jobs=-1).
xgbrndm_nor_sc3_cl0 = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, 
                                       n_iter = 100, cv = 5, verbose = 2, 
                                       random_state = 123, 
                                       return_train_score = True,
                                       n_jobs = -1)

# Nested cross-validation: the search is repeated inside each of the 5 outer
# folds; four metrics are scored per fold.
cvresults_nor_sc3_cl0 = cross_validate(xgbrndm_nor_sc3_cl0,X_sc3_nor_cl0,Y_sc3_nor_cl0,cv=5,
                            scoring = ('accuracy','precision','recall','roc_auc'))

In [None]:
cvresults_nor_sc3_cl0

In [None]:
print(f"Mean Accuracy = {round(mean(cvresults_nor_sc3_cl0['test_accuracy']),3)}")
print(f"Mean AUC = {round(mean(cvresults_nor_sc3_cl0['test_roc_auc']),3)}")
print(f"Mean Precision = {round(mean(cvresults_nor_sc3_cl0['test_precision']),3)}")
print(f"Mean Recall = {round(mean(cvresults_nor_sc3_cl0['test_recall']),3)}")

In [None]:
m_nor_sc3_cl0 = xgbrndm_nor_sc3_cl0.fit(X_sc3_nor_cl0,Y_sc3_nor_cl0)

In [None]:
%load_ext google.colab.data_table

In [None]:
# Feature importance
importances_nor_sc3_cl0 = pd.DataFrame({'fscore':m_nor_sc3_cl0.best_estimator_.feature_importances_,
                                     'varname': list(X_sc3_nor_cl0)})
importances_nor_sc3_cl0