In [None]:
# save files in drive
# Mount Google Drive at /content/gdrive; the path definitions further down
# point into this mount.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [None]:
# Define paths
# Anchor every path on the mount point passed to drive.mount() instead of the
# current working directory, so the paths stay valid after an os.chdir / %cd.
DRIVE_ROOT = '/content/gdrive'
base_path = os.path.join(DRIVE_ROOT, "My Drive", "pharmahacks")
dataset_path = os.path.join(base_path, "dataset")
data = os.path.join(dataset_path, 'PSP_Data_V3.xlsx')

# Read the metadata
#xls = pd.ExcelFile(data)
#xls.sheet_names[:5]

In [None]:
# Helpers

import pickle

def _pickle_path(name, directory=None):
    """Return '<directory>/<name>.pkl'; directory defaults to dataset_path."""
    folder = dataset_path if directory is None else directory
    return os.path.join(folder, name + '.pkl')


def save_obj(obj, name, directory=None):
    """Pickle `obj` to '<directory>/<name>.pkl'.

    Args:
        obj: any picklable object.
        name: file name without the '.pkl' extension.
        directory: target folder; defaults to the notebook-level dataset_path.
    """
    with open(_pickle_path(name, directory), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name, directory=None):
    """Load the object pickled under `name` by save_obj.

    Args:
        name: file name without the '.pkl' extension.
        directory: folder to read from; defaults to the notebook-level dataset_path.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if neither the current nor the legacy file exists.
    """
    path = _pickle_path(name, directory)
    if not os.path.exists(path):
        # Earlier versions concatenated folder and name without a separator
        # ('<folder><name>.pkl'); keep reading files written that way.
        folder = dataset_path if directory is None else directory
        legacy_path = '{}'.format(folder) + name + '.pkl'
        if os.path.exists(legacy_path):
            path = legacy_path
    # NOTE: pickle.load can execute arbitrary code; only load files written by
    # this notebook.
    with open(path, 'rb') as f:
        return pickle.load(f)

def get_unique(arr):
  '''
  Collapse an iterable into the set of its distinct elements
  '''
  return set(arr)


def read_column_vals(col_name, frame=None):
  '''
  Return the number of values, all values, and the distinct values of a column.

  col_name: name of the column to read.
  frame: DataFrame to read from; when omitted the notebook-level `df` is used,
         so existing calls keep working.

  Returns a tuple (num, vals, unique_vals): the row count, the column values
  as returned by `.values`, and a set of the distinct values.
  '''
  source = df if frame is None else frame
  # read column values
  vals = source[col_name].values
  # get unique values
  unique_vals = set(vals)
  num = len(vals)

  return num, vals, unique_vals

In [None]:
# Go through all worksheets and get column values

# Key used in raw_features -> worksheet column its values are read from.
# Insertion order matters: it becomes the column order of the encoded feature
# matrix, and "Patient ID" must stay last because the last column is sliced
# off before training.
feature_columns = {
    "Gender ID": "Gender ID",
    "Age": "Age Range",
    "Diagnosis ID": "Diagnosis ID",
    "Bio Therapy": "Biologic Line of Therapy",
    "Patient Rec Drug": "Patient Receiving Free Drug",
    "Frequency": "Frequency",
    "Status Group": "Status Group",
    "Case State": "Case State",
    "Patient ID": "Patient ID",
}
target_column = "Dosage"

# Open the workbook here so this cell does not depend on an `xls` object left
# over from another cell or kernel session.
with pd.ExcelFile(data) as workbook:
  sheetnames = [workbook.sheet_names[-1]] # select latest sheet

# Features: one list of raw values per key of feature_columns
raw_features = {label: [] for label in feature_columns}

# Target
y = []

for name in sheetnames:
  print(name)

  # Get worksheet. `df` has to stay a notebook-level name: read_column_vals()
  # looks it up as a global.
  df = pd.read_excel(data, sheet_name=name, header=1)
  # Mark unknown values as missing (NaN)
  df = df.replace('Unknown', np.nan)
  # Drop rows with a missing value in any column
  df = df.dropna()

  # Read target column
  occurences, target_vals, uniq = read_column_vals(target_column)

  print(occurences)
  print(uniq)

  y.extend(target_vals)

  # Read feature columns; each label receives the values of its own column
  for label, column in feature_columns.items():
    _, column_vals, _ = read_column_vals(column)
    raw_features[label].extend(column_vals)

print(len(raw_features), len(raw_features["Age"]))

for feat in raw_features.values():
  print(len(feat))

print("Y:", len(y))

save_obj(raw_features, 'raw_features')
save_obj(y, 'target')

### Load data

In [None]:
# Reload the pickled feature dict and target list written by save_obj.
# NOTE: unpickling runs arbitrary code; only load files this notebook produced.
raw_features = load_obj('raw_features')
target = load_obj('target')

In [None]:
# Every feature list and the target should report the same length.
for feat in raw_features.values():
  print(len(feat))

print("Y:", len(target))

In [None]:
# worksheet
#df = pd.read_excel(data, sheet_name="09-2019", header=1)
#df.head()
#df.describe()
# Number of unknowns in each column
#print((df.loc[:,] == "Unknown").sum())
# mark unknown values as missing or NaN
#df = df.loc[:,].replace('Unknown', np.NaN)

# count the number of NaN values in each column
#print(df.isnull().sum())
# print the first 20 rows of data
#df.head()
# drop rows with missing values
#df.dropna(inplace=True)
#df.describe()
# read dosage column
#freq, y_all, uniques = read_column_vals("Dosage")
#print(freq)
#print(uniques)
#help(encoder.fit_transform)


# Read gender id, diagnosis id

#freq, gender, g_u = read_column_vals("Gender ID")
#print(freq)
#print(g_u)

#freq, diagnosis, d_u = read_column_vals("Diagnosis ID")
#print(freq)
#print(d_u)
#set(gender)
#set(diagnosis)

#feature = np.stack((gender, diagnosis), axis=1)
#feature.shape

### Label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

def transform_to_numeric(arr):
  '''
  Label-encode `arr` with the shared `encoder`, print the classes it found,
  and return the encoded array.

  The encoder is refit on every call, so `encoder.classes_` only describes
  the most recently encoded array.
  '''
  encoded = encoder.fit_transform(arr)
  print(encoder.classes_)
  return encoded

In [None]:
# Label-encode the target values into integer class ids.
y_all = transform_to_numeric(target)
y_all.shape

In [None]:
# Label-encode each raw feature list, then stack the encoded columns side by
# side (one column per key of raw_features, in dict order).
encoded_columns = []

for vals in raw_features.values():
  vals = transform_to_numeric(vals)
  print(vals.shape)
  encoded_columns.append(vals)

features = np.stack(encoded_columns, axis=1)
features.shape

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# length of last column
#len(features[:, -1])

In [None]:
# remove patient ids
# The encoded matrix starts with one column per raw_features entry and the
# patient id is the last of them. Only slice while that column is still there,
# so re-running this cell does not silently drop a real feature.
if features.shape[1] == len(raw_features):
  features = features[:, :-1]
features.shape

In [None]:
# First encoded feature row
features[0]

In [None]:
# Sanity check: shapes and container types of the feature matrix and target
print(features.shape, y_all.shape)
print(type(features), type(y_all))

In [None]:
# split data into train and test sets
# Fixed seed so the 67/33 split is reproducible.
# NOTE(review): the split is not stratified although class weights are computed
# for imbalance later on - consider passing stratify=y_all; confirm intent.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(features, y_all, test_size=test_size, random_state=seed)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# First training sample and its encoded label
X_train[0], y_train[0]

In [None]:
# Define class weights for imbalanced data
from sklearn.utils import class_weight
# `classes` and `y` must be passed by keyword: recent scikit-learn releases
# reject them as positional arguments.
train_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=train_classes, y=y_train)
print(class_weights)

In [None]:
# Preprocessing

#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler() # standardize -> z = (x - u) / s
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

#X_train.shape, X_test.shape

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
#from keras.utils import np_utils
# one hot encoded vectors
#num_classes = 3
#y_c = np_utils.to_categorical(y_train,num_classes)
#print(y_c.shape)

In [None]:
# Fit an XGBoost classifier on the training data
model_1 = XGBClassifier(max_depth=3, learning_rate=0.01, n_estimators=500, objective='multi:softmax', num_class = 3)

# Fit
model_1.fit(X_train, y_train, verbose=True)

# Make predictions for test data
y_pred = model_1.predict(X_test)

# predict() returns one class label per test row (this is a 3-class problem,
# not a binary one), so no rounding or thresholding is needed. The labels are
# only converted to plain Python ints for printing and scoring.
predictions = [int(value) for value in y_pred]
print(predictions[:20])

# Evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Per-class precision / recall / F1 for the XGBoost predictions
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

In [None]:
# One-vs-rest confusion matrices for the XGBoost predictions held in y_pred
from sklearn.metrics import multilabel_confusion_matrix
cm = multilabel_confusion_matrix(y_test, y_pred)
cm

In [None]:
from sklearn.metrics import confusion_matrix

# Multi-class confusion matrix of the XGBoost predictions; rebinds `cm`, which
# the plotting cell below consumes.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
    """Draw a confusion matrix as a heatmap and save it to 'confusion-matrix.png'.

    Args:
        cm: square array of counts (rows are divided by their own sum when
            `normalized` is set).
        classes: tick labels, used for both axes.
        normalized: colour the cells by row-normalised values instead of raw
            counts. The cell annotations always show the raw counts.
        cmap: colormap passed to seaborn.
    """
    plt.figure(figsize=[7, 7])
    norm_cm = cm
    if normalized:
        norm_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    # Plot and save regardless of `normalized`; previously nothing was drawn
    # when normalized=False.
    sns.heatmap(norm_cm, annot=cm, fmt='g', xticklabels=classes, yticklabels=classes, cmap=cmap)
    plt.savefig('confusion-matrix.png')

plot_confusion_matrix(cm, ['Dosage 1', 'Dosage 2', 'Dosage 3'])

In [None]:
#https://www.tensorflow.org/tutorials/structured_data/feature_columns
#https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/
#https://machinelearningmastery.com/handle-missing-data-python/
#https://github.com/hasibzunair/coursera-tensorflow-specialization/blob/master/Introduction%20to%20TensorFlow%20for%20Artificial%20Intelligence%2C%20Machine%20Learning%2C%20and%20Deep%20Learning/Week%201/Exercise_1_House_Prices_Question.ipynb

In [None]:
# Shapes of the training inputs and labels
X_train.shape, y_train.shape

### Deep learning

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation

#model = tf.keras.Sequential([keras.layers.Dense(units=100, input_shape=[8])])
#model.compile(optimizer='sgd', loss='mean_squared_error', metrics=['acc'])
#print(model.summary())
#model.fit(X_train, y_train, epochs=500)
#scores = model.evaluate(X_test, y_test, verbose=1)
#print('Test loss:', scores[0])
#print('Test accuracy:', scores[1])

In [None]:
from keras.utils import np_utils
# one hot encoded vectors
# num_classes is hard-coded to 3 and sets the width of the one-hot vectors.
num_classes = 3
y_c = np_utils.to_categorical(y_train,num_classes)
y_test_c = np_utils.to_categorical(y_test,num_classes)
print(y_c.shape, y_test_c.shape)

In [None]:
from keras import backend as K
import tensorflow as tf

def focal_loss(gamma=2., alpha=.25):
    """Build a focal-loss function usable as a Keras `loss`.

    Args:
        gamma: exponent applied to the (1 - p) / p modulating factors.
        alpha: weight of the positive-entry term; the negative-entry term is
            weighted by (1 - alpha).

    Returns:
        A function (y_true, y_pred) -> scalar loss tensor.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep predictions strictly inside (0, 1): a saturated output would
        # otherwise feed log(0) and turn the loss into NaN/inf.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)
        # Probabilities at the entries labelled 1; other entries are filled
        # with 1 so their log term is 0.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # Probabilities at the entries labelled 0; other entries are filled
        # with 0 so their log(1 - p) term is 0.
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        return -K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
    return focal_loss_fixed

# Baseline model

def create_baseline(input_dim=8, num_classes=3):
  '''
  Build and compile the fully connected baseline classifier.

  input_dim: number of input features (default 8, the previous fixed value).
  num_classes: size of the softmax output layer (default 3, as before).

  Returns the compiled Sequential model (focal loss, 'sgd' optimizer,
  accuracy metric).
  '''
  model = Sequential()
  model.add(Dense(100, input_dim=input_dim, activation='relu'))
  for units in (256, 512, 1024, 512, 256, 50):
    model.add(Dense(units, activation='relu'))
  model.add(Dense(num_classes, activation='softmax'))

  # The model has always been compiled with plain 'sgd'; the Adadelta
  # optimizer that used to be constructed here was never passed to compile().
  model.compile(loss=[focal_loss(alpha=.25, gamma=2)], optimizer='sgd', metrics=['accuracy'])
  return model

# Size the network from the data actually being fed to it
model = create_baseline(input_dim=X_train.shape[1], num_classes=y_c.shape[1])
model.summary()

In [None]:
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, verbose=1, patience=8, mode='min', restore_best_weights=True)

# Keras expects class_weight as a {class index: weight} dict; the array from
# compute_class_weight is ordered like np.unique(y_train).
class_weight_map = {int(label): float(weight)
                    for label, weight in zip(np.unique(y_train), class_weights)}

# NOTE(review): the test split doubles as validation data, so early stopping
# and the restored best weights are selected on the same data that is later
# reported as test performance - consider a separate validation split.
model.fit(X_train, y_c, validation_data=(X_test, y_test_c), 
          batch_size=64, callbacks=[early_stopping], 
          epochs=1000, class_weight = class_weight_map, shuffle=True)

In [None]:
# Plot and save accuracy and loss graphs individually
def plot_loss_accu(history):
    """Plot the training/validation loss and accuracy curves and save both.

    Args:
        history: object with a `history` dict of per-epoch metric lists (here
            `model.history`). It must contain 'loss' and 'val_loss', plus the
            accuracy under either 'acc'/'val_acc' or 'accuracy'/'val_accuracy'.

    Side effects:
        Writes 'loss_loss.jpg' and 'acc_acc.jpg' under `base_path` and shows
        both figures.
    """
    logs = history.history

    loss = logs['loss']
    val_loss = logs['val_loss']
    epochs = range(len(loss))
    plt.plot(epochs, loss, 'g')
    plt.plot(epochs, val_loss, 'y')
    plt.title('Training and validation loss')
    plt.ylabel('Loss %')
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc='upper right')
    plt.grid(True)
    plt.savefig('{}/{}_loss.jpg'.format(base_path, "loss"), dpi=300)
    #plt.savefig('{}/{}_loss.pdf'.format(output_path, EXP_NAME), dpi=300)
    plt.show()

    # Depending on the Keras version the accuracy metric is logged as 'acc'
    # or as 'accuracy'; accept whichever is present.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    acc = logs[acc_key]
    val_acc = logs['val_' + acc_key]
    epochs = range(len(acc))
    plt.plot(epochs, acc, 'r')
    plt.plot(epochs, val_acc, 'b')
    plt.title('Training and validation accuracy')
    plt.ylabel('Accuracy %')
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc='lower right')
    plt.grid(True)
    plt.savefig('{}/{}_acc.jpg'.format(base_path, "acc"), dpi=300)
    #plt.savefig('{}/{}_acc.pdf'.format(output_path, EXP_NAME), dpi=300)
    plt.show()

plot_loss_accu(model.history)
print("Done!")

In [None]:
# Turn the one-hot test labels back into class indices
y_t = np.argmax(y_test_c, axis=1)
y_t.shape

In [None]:
# Class scores from the network, then the index of the highest score per row
y_pred = model.predict(X_test)
y_pred_flat = np.argmax(y_pred, axis=1)
y_pred_flat.shape

In [None]:
# One-vs-rest confusion matrices for the network's predicted class indices
from sklearn.metrics import multilabel_confusion_matrix
cm = multilabel_confusion_matrix(y_t, y_pred_flat)
cm

In [None]:
import itertools

classes=['Dosage 1', 'Dosage 2', 'Dosage 3']

def pretty_print_conf_matrix(y_true, y_pred, 
                             classes,
                             normalize=False,
                             title='Confusion matrix',
                             cmap=plt.cm.Blues):
    """
    Plot a confusion matrix with the classification report printed below it.

    Mostly stolen from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py

    Normalization changed, classification_report stats added below plot

    Args:
        y_true: true class labels.
        y_pred: predicted class labels.
        classes: tick labels for both axes.
        normalize: when set, the numbers written into the cells are fractions
            of the grand total (rounded to 2 decimals). The colours are always
            drawn from the raw counts.
        title: figure title.
        cmap: colormap for the image.
    """

    cm = confusion_matrix(y_true, y_pred)

    # Configure Confusion Matrix Plot Aesthetics (no text yet) 
    plt.figure(num=None, figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=18)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label', fontsize=14)
    plt.xlabel('Predicted label', fontsize=14)

    # Calculate normalized values (so all cells sum to 1) if desired
    if normalize:
        cm = np.round(cm.astype('float') / cm.sum(),2) #(axis=1)[:, np.newaxis]
    print('\n')

    # Place Numbers as Text on Confusion Matrix Plot.
    # Vertical nudge of the text per row: rows 0 and 1 have their own offset,
    # every further row uses -0.2.
    row_offsets = {0: 0.4, 1: 0.2}
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i + row_offsets.get(i, -0.2), cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=20)

    # Add Precision, Recall, F-1 Score as Captions Below Plot
    rpt = classification_report(y_true, y_pred)
    rpt = rpt.replace('avg / total', '      avg')
    rpt = rpt.replace('support', 'N Obs')

    plt.annotate(rpt, 
                 xy = (0,0), 
                 xytext = (-120, -260), 
                 xycoords='axes fraction', textcoords='offset points',
                 fontsize=20, ha='left')    

    # Plot
    plt.tight_layout()
    #plt.savefig('{}/{}.jpg'.format(base_path, "cf"), dpi=300)

# Evaluate the neural network: use its own decoded labels and predictions.
# This call used to pass `y_test, predictions`, i.e. the XGBoost outputs
# computed much earlier; pass those instead to plot the XGBoost matrix.
pretty_print_conf_matrix(y_t, y_pred_flat, 
                         classes=classes,
                         normalize=False, 
                         title='Confusion Matrix')



### Fancy graphs

In [None]:
# Column i of the encoded feature matrix gets the i-th name below; the encoded
# target is appended as 'Dosage'.
dataset_columns = ['Gender ID',
                   'Age',
                   'Diagnosis ID',
                   'Bio of Therapy',
                   'Patient Receiving Free Drug',
                   'Frequency',
                   'Status Group',
                   'Case State']
frame_data = {col: features[:, idx] for idx, col in enumerate(dataset_columns)}
frame_data['Dosage'] = y_all
dataset = pd.DataFrame(frame_data)
dataset.head()

In [None]:
# Bar chart of how often each encoded value of `feat_name` occurs, saved as
# '<feat_name>_hist.png' under base_path.
feat_name = "Dosage"

fig, ax = plt.subplots(1, 1, figsize= (10, 5))
dataset[feat_name].value_counts().plot(kind='bar', ax=ax, title="Histogram of {}".format(feat_name), color='g')

ax.set_xlabel(feat_name)
ax.set_ylabel("Number of occurences")
plt.savefig('{}/{}_hist.png'.format(base_path, feat_name), dpi=300)

In [None]:
# Normalize
kwargs = dict(alpha=0.5, bins=100, density=True, stacked=True)

# Plot
# (column, colour, extra hist options) - only the first series sets rwidth.
hist_specs = [
    ("Gender ID", 'g', dict(rwidth=0.2)),
    ("Age", 'b', {}),
    ("Diagnosis ID", 'r', {}),
    ("Bio of Therapy", 'pink', {}),
    ("Patient Receiving Free Drug", 'olive', {}),
    ("Frequency", 'dodgerblue', {}),
    ("Status Group", 'orange', {}),
    ("Case State", 'deeppink', {}),
    ("Dosage", 'gold', {}),
]
for column, colour, extra in hist_specs:
  plt.hist(dataset[column].values, **kwargs, **extra, color=colour, label=column)
plt.gca().set(title='Feature Histogram', ylabel='# of occurences')
plt.ylim(0,10)
plt.legend()

In [None]:
# Pairwise scatter plots of every column in `dataset`
sns.pairplot(dataset)

In [None]:
# Same pair plot, coloured by the encoded "Gender ID" column
g = sns.pairplot(dataset, hue="Gender ID")

In [None]:
# Pair plot restricted to the "Age" and "Dosage" columns
g = sns.pairplot(dataset, vars=["Age", "Dosage"])

In [None]:
# Pair plot with kernel density estimates on the diagonal
g = sns.pairplot(dataset, diag_kind="kde")

In [None]:
# Pair plot with a regression fit drawn in each off-diagonal panel
g = sns.pairplot(dataset, kind="reg")

In [None]:
# Annotated heatmap of the pairwise correlations between the dataset columns
plt.figure(figsize=(10,11))
sns.heatmap(dataset.corr(),annot=True)
# plt.show() renders the figure; the previous plt.plot() drew nothing and only
# echoed an empty list as the cell output.
plt.show()

In [None]:
# Violin plots of Dosage per Age value.
# The column names and the frame are passed under their own names instead of
# rebinding the notebook-level `data` (workbook path) and `y` (raw target
# list), which would break re-running the earlier loading cells.
x_col = "Age"
y_col = "Dosage"

plt.figure(figsize=(12,10))
# NOTE(review): all four panels draw the same plot; presumably each was meant
# to show a different feature - confirm.
for panel in range(1, 5):
  plt.subplot(2, 2, panel)
  sns.violinplot(x=x_col, y=y_col, data=dataset)

In [None]:
# Box plots of Dosage per Age value.
# The column names and the frame are passed under their own names instead of
# rebinding the notebook-level `data` (workbook path) and `y` (raw target
# list), which would break re-running the earlier loading cells.
x_col = "Age"
y_col = "Dosage"

plt.figure(figsize=(12,10))
# NOTE(review): all four panels draw the same plot; presumably each was meant
# to show a different feature - confirm.
for panel in range(1, 5):
  plt.subplot(2, 2, panel)
  sns.boxplot(x=x_col, y=y_col, data=dataset)

In [None]:
#https://www.kaggle.com/lnbalon/iris-dataset-eda-and-classification-analysis