In [5]:
import pandas as pd
import math
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [6]:
def fetch_diabetes_ct_data_one_hot(oppScrData):    
    ct_data= oppScrData[["L1_HU_BMD", "TAT Area (cm2)", 'Total Body                Area EA (cm2)',
       'VAT Area (cm2)', 'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)', 'Otherabnormalglucose', 'Impairedfastingglucose', 
       'TypeIIorunspecifiedtypediabetesmellituswithoutmentionofcomplication,uncontrolled', 
       'Type2diabetesmellituswithoutcomplications']];
    n = ct_data.shape[0]
    preprocessed_ct_data = []
    for i in range(n):
        row = ct_data.iloc[i]
        ignore = False
        for j in row[:]:
          if pd.isna(j) or j == ' ': # There is an empty string somewhere in Liver column
            ignore = True
            break
        if not ignore:
          preprocessed_ct_data.append(row)
    return np.array(preprocessed_ct_data, dtype=np.float32)


In [7]:
def fetch_diabetes_ct_data(oppScrData):    
    ct_data= oppScrData[["L1_HU_BMD", "TAT Area (cm2)", 'Total Body                Area EA (cm2)',
       'VAT Area (cm2)', 'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)', 'Type 2 Diabetes DX']];
    n = ct_data.shape[0]
    preprocessed_ct_data = []
    for i in range(n):
        row = ct_data.iloc[i]
        ignore = False
        for j in row[:]:
          if pd.isna(j) or j == ' ': # There is an empty string somewhere in Liver column
            ignore = True
            break
        if not ignore:
          preprocessed_ct_data.append(row)
    return np.array(preprocessed_ct_data, dtype=np.float32)


**ONE HOT**

In [7]:
df = pd.read_excel(r'sample_data/OppScrData.xlsx')  

In [5]:
# Fetch rows with the below specified diabetes values and convert it to one-hot representation
p = ['Otherabnormalglucose', 'Impairedfastingglucose', 'TypeIIorunspecifiedtypediabetesmellituswithoutmentionofcomplication,uncontrolled', 'Type2diabetesmellituswithoutcomplications'];
df = df.loc[df["Type 2 Diabetes DX"].isin(p)]
diabetes_one_hot = pd.get_dummies(df['Type 2 Diabetes DX'])
df = pd.concat([df, diabetes_one_hot], axis=1)
df = df.drop('Type 2 Diabetes DX', axis=1)

In [6]:
# Fetch corresponding ct data values
diabetes_ct_data = fetch_diabetes_ct_data(df)

KeyError: ignored

In [7]:
diabetes_ct_data[0]

array([106.       , 315.53076  , 588.89233  , 202.31795  , 113.21282  ,
         1.7870586,  -3.181874 , 168.92395  ,  47.814583 , 431.519    ,
        53.       ,   1.       ,   0.       ,   0.       ,   0.       ],
      dtype=float32)

In [12]:
X = diabetes_ct_data[:, :-4]
y = diabetes_ct_data[:, -4:]

In [13]:
y

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]], dtype=float32)

**NORMAL REPRESENTATION**

In [8]:
df = pd.read_excel(r'sample_data/OppScrData.xlsx')  
p = ['Otherabnormalglucose', 'Impairedfastingglucose', 'TypeIIorunspecifiedtypediabetesmellituswithoutmentionofcomplication,uncontrolled', 'Type2diabetesmellituswithoutcomplications'];
df = df.loc[df["Type 2 Diabetes DX"].isin(p)]
r = {"Otherabnormalglucose":1, "Impairedfastingglucose":2, "Type2diabetesmellituswithoutcomplications":3, "TypeIIorunspecifiedtypediabetesmellituswithoutmentionofcomplication,uncontrolled":4}
df["Type 2 Diabetes DX"].replace(r, inplace=True)
diabetes_ct_data = fetch_diabetes_ct_data(df)

In [9]:
X = diabetes_ct_data[:, :-1]
y = diabetes_ct_data[:, -1]

In [10]:
X[0]

array([106.       , 315.53076  , 588.89233  , 202.31795  , 113.21282  ,
         1.7870586,  -3.181874 , 168.92395  ,  47.814583 , 431.519    ,
        53.       ], dtype=float32)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)


In [43]:
gnb = GaussianNB().fit(X_train, y_train)
gnb_predictions = gnb.predict(X_test)
 
# accuracy on X_test
accuracy = gnb.score(X_test, y_test)
print(accuracy)
 
# creating a confusion matrix
cm = confusion_matrix(y_test, gnb_predictions)
print(cm)


0.3745247148288973
[[ 29 150  29  12]
 [ 25 144  13   7]
 [  8  23  10   8]
 [ 15  27  12  14]]


In [19]:
# TODO - tSNE

In [45]:
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(X_train, y_train)
accuracy = neigh.score(X_test, y_test)
print(accuracy)

# creating a confusion matrix
neigh_predictions = neigh.predict(X_test)
cm = confusion_matrix(y_test, neigh_predictions)
print(cm)

0.3935361216730038
[[143  65   4   8]
 [126  55   3   5]
 [ 35  10   1   3]
 [ 44  15   1   8]]


check

In [74]:
knn_pipe = Pipeline([('mms', StandardScaler()),
                     ('knn', KNeighborsClassifier())])
params = [{'knn__n_neighbors': [ 11, 12, 13],
         'knn__weights': ['uniform', 'distance'],
         'knn__leaf_size': [15, 20]}]
gs_knn = GridSearchCV(knn_pipe,
                      param_grid=params,
                      scoring='accuracy',
                      cv=3)
gs_knn.fit(X, y)
print(gs_knn.best_params_)
print(gs_knn.score(X, y))

{'knn__leaf_size': 15, 'knn__n_neighbors': 11, 'knn__weights': 'uniform'}
0.5135520684736091


0.5135520684736091


In [None]:

SVC_pipe = Pipeline([('mms', StandardScaler()),
                     ('svc', SVC())])

params = {'C':[0.1,1,100,1000],
          'kernel':['rbf','poly','sigmoid','linear'],
          'degree':[1,2,3,4,5,6],
          'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}

gs_SVC = GridSearchCV(SVC(),
                      param_grid=params,
                      scoring='accuracy',
                      cv=3)
gs_SVC.fit(X, y)
print(gs_SVC.best_params_)
print(gs_SVC.score(X, y))

In [18]:
from sklearn.metrics.pairwise import rbf_kernel
s = SVC(kernel='rbf')
s.fit(X,y)
print(s.score(X, y))

0.43128863528292916


0.43128863528292916
