In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans

# **Part 1 - Data Preprocessing**

**Importing Data**

In [2]:
#https://www.kaggle.com/bachrr/covid-chest-xray
df = pd.read_csv('/metadata.csv', index_col=0, usecols=[0,1,2,3,4,5,17,18,20])
df

Unnamed: 0_level_0,offset,sex,age,finding,survival,view,modality,location
patientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,0.0,M,65.0,COVID-19,Y,PA,X-ray,"Cho Ray Hospital, Ho Chi Minh City, Vietnam"
2,3.0,M,65.0,COVID-19,Y,PA,X-ray,"Cho Ray Hospital, Ho Chi Minh City, Vietnam"
2,5.0,M,65.0,COVID-19,Y,PA,X-ray,"Cho Ray Hospital, Ho Chi Minh City, Vietnam"
2,6.0,M,65.0,COVID-19,Y,PA,X-ray,"Cho Ray Hospital, Ho Chi Minh City, Vietnam"
4,0.0,F,52.0,COVID-19,,PA,X-ray,"Changhua Christian Hospital, Changhua City, Ta..."
...,...,...,...,...,...,...,...,...
205,11.0,M,55.0,COVID-19,Y,AP Supine,X-ray,"North Derbyshire, UK"
205,13.0,M,55.0,COVID-19,Y,AP Supine,X-ray,"North Derbyshire, UK"
205,20.0,M,55.0,COVID-19,Y,AP Supine,X-ray,"North Derbyshire, UK"
205,24.0,M,55.0,COVID-19,Y,AP Supine,X-ray,"North Derbyshire, UK"


**Data Cleanup**

In [3]:
df.isnull().sum() #we can see majority of the survival rows (256) are null, so lets use k-means clustering to cluster survivors and non-survivors

offset       96
sex          43
age          54
finding       0
survival    256
view          0
modality      0
location    118
dtype: int64

In [4]:
#First let's remove our mostly null label column
df = df.drop('survival',axis=1)
df.head()

Unnamed: 0_level_0,offset,sex,age,finding,view,modality,location
patientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,0.0,M,65.0,COVID-19,PA,X-ray,"Cho Ray Hospital, Ho Chi Minh City, Vietnam"
2,3.0,M,65.0,COVID-19,PA,X-ray,"Cho Ray Hospital, Ho Chi Minh City, Vietnam"
2,5.0,M,65.0,COVID-19,PA,X-ray,"Cho Ray Hospital, Ho Chi Minh City, Vietnam"
2,6.0,M,65.0,COVID-19,PA,X-ray,"Cho Ray Hospital, Ho Chi Minh City, Vietnam"
4,0.0,F,52.0,COVID-19,PA,X-ray,"Changhua Christian Hospital, Changhua City, Ta..."


In [5]:
#Now let's remove the null rows for the remaining columns
df = df.dropna() 
df.count() #got 185 rows left to cluster

offset      185
sex         185
age         185
finding     185
view        185
modality    185
location    185
dtype: int64

In [6]:
X = df.iloc[:, :].values #take all rows and columns
X #into an array now

array([[0.0, 'M', 65.0, ..., 'PA', 'X-ray',
        'Cho Ray Hospital, Ho Chi Minh City, Vietnam'],
       [3.0, 'M', 65.0, ..., 'PA', 'X-ray',
        'Cho Ray Hospital, Ho Chi Minh City, Vietnam'],
       [5.0, 'M', 65.0, ..., 'PA', 'X-ray',
        'Cho Ray Hospital, Ho Chi Minh City, Vietnam'],
       ...,
       [20.0, 'M', 55.0, ..., 'AP Supine', 'X-ray',
        'North Derbyshire, UK'],
       [24.0, 'M', 55.0, ..., 'AP Supine', 'X-ray',
        'North Derbyshire, UK'],
       [28.0, 'M', 55.0, ..., 'AP', 'X-ray', 'North Derbyshire, UK']],
      dtype=object)

**Encoding Categorical Data**

Gender

In [7]:
#Encoding the Gender
le = LabelEncoder() #creating an instance of the label encoder imported above
X[:, 1] = le.fit_transform(X[:, 1]) #we are encoding column index 1 from array X, which is the gender. 

print(X[0:3,:]) #0 is female, 1 is male

[[0.0 1 65.0 'COVID-19' 'PA' 'X-ray'
  'Cho Ray Hospital, Ho Chi Minh City, Vietnam']
 [3.0 1 65.0 'COVID-19' 'PA' 'X-ray'
  'Cho Ray Hospital, Ho Chi Minh City, Vietnam']
 [5.0 1 65.0 'COVID-19' 'PA' 'X-ray'
  'Cho Ray Hospital, Ho Chi Minh City, Vietnam']]


In [8]:
print(df["finding"].value_counts(), '\n') #so we have 6 different categories for finding
print(df["view"].value_counts(), '\n') #so we have 6 different categories for view
print(df["modality"].value_counts(), '\n') #so we have 6 different categories for finding
#need to encode 14 different datatypes

COVID-19          155
SARS               14
COVID-19, ARDS      9
Streptococcus       4
No Finding          2
ARDS                1
Name: finding, dtype: int64 

PA               94
AP               32
AP Supine        27
Axial            16
L                15
AP semi erect     1
Name: view, dtype: int64 

X-ray    169
CT        16
Name: modality, dtype: int64 



Modality, View, Finding

In [9]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3,4,5])], remainder='passthrough') #creating an instance of the ColumnTransformer and One Hot Encoding 
#this will then split up Modality, View, and Finding into 14 columns, one for each category

X = np.array(ct.fit_transform(X))

print(X[0:3, :]) #encoded the modality, view, findings column into 14 different columns

[[0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1 65.0
  'Cho Ray Hospital, Ho Chi Minh City, Vietnam']
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 3.0 1 65.0
  'Cho Ray Hospital, Ho Chi Minh City, Vietnam']
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 5.0 1 65.0
  'Cho Ray Hospital, Ho Chi Minh City, Vietnam']]


Location

In [10]:
df['location'].nunique() #41 different locations to categorize

41

In [13]:
ct2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-1])], remainder='passthrough') #creating an instance of the ColumnTransformer and One Hot Encoding
#this will then split up location into 41 columns, one for each category

X = np.array(ct2.fit_transform(X))

print(X[0, :]) #encoded the location column into 41 different columns

[0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0]


# **Part 2 - Creating and Training the Model**

In [15]:
km = KMeans(n_clusters=2) #2 clusters, one for survival, other for not

In [16]:
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [18]:
km.cluster_centers_ #so we got the two custer centroids, but there's no real way to evaluate due to the mostly unlabelled data

array([[ 4.23312883e-01,  5.76687117e-01,  1.84049080e-02,
         3.68098160e-02,  2.45398773e-02,  1.84049080e-02,
         6.13496933e-03,  6.13496933e-02,  1.22699387e-02,
         1.84049080e-02,  6.13496933e-03,  1.22699387e-02,
         1.84049080e-02,  3.06748466e-02,  1.84049080e-02,
         2.45398773e-02,  9.20245399e-02,  1.84049080e-02,
         1.22699387e-02,  1.84049080e-02,  4.90797546e-02,
         2.45398773e-02,  1.22699387e-02,  6.13496933e-03,
         4.29447853e-02,  3.06748466e-02,  1.84049080e-02,
         6.13496933e-03,  6.13496933e-03,  4.90797546e-02,
         2.45398773e-02,  6.13496933e-03,  6.13496933e-03,
         1.04294479e-01,  6.13496933e-03,  4.90797546e-02,
         6.13496933e-03,  3.68098160e-02,  1.84049080e-02,
         6.13496933e-03,  1.22699387e-02,  6.13496933e-03,
         1.22699387e-02,  6.13496933e-03,  6.13496933e-03,
         2.45398773e-02,  1.84049080e-02,  6.13496933e-03,
         1.22699387e-02,  1.84049080e-02,  2.45398773e-0