In [56]:
import numpy as np 
import pandas as pd
import random
import keras
import torch
import tensorflow as tf
import optuna
from optuna import Trial
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from sklearn.metrics import log_loss
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn import metrics

In [30]:
data = pd.read_excel("/Users/ziyuewang/Desktop/Inference Analytics/complain data/User complain input_top11.xlsx")

In [31]:
data["common complains"] = data["common complains"].map(lambda x: x.split("\n"))

In [32]:
data = data.explode("common complains")

In [33]:
data.head()

Unnamed: 0,common complains,specialities
0,Chest pain or discomfort (angina),Cardiology
0,Shortness of breath,Cardiology
0,Rapid or irregular heartbeats (arrhythmias),Cardiology
0,Heart failure,Cardiology
0,High blood pressure (hypertension),Cardiology


In [34]:
#data.to_excel("11 specialities_explode.xlsx")

In [35]:
new_data_dummy = pd.get_dummies(data.specialities)

In [36]:
new_data_dummy

Unnamed: 0,Cardiology,Dermatology,Internal Medicine,Neurology,Obstetrics,Obstetrics and Gynecology (OBGYN),Ophthalmology,Optometry,Pediatric Cardiology,Pediatric Dermatology,Primary Care
0,1,0,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
10,0,0,0,0,0,0,0,1,0,0,0
10,0,0,0,0,0,0,0,1,0,0,0
10,0,0,0,0,0,0,0,1,0,0,0
10,0,0,0,0,0,0,0,1,0,0,0


In [37]:
data = pd.concat([data,new_data_dummy],axis=1)

In [38]:
data = data.drop("specialities",axis=1)

In [39]:
data.head()

Unnamed: 0,common complains,Cardiology,Dermatology,Internal Medicine,Neurology,Obstetrics,Obstetrics and Gynecology (OBGYN),Ophthalmology,Optometry,Pediatric Cardiology,Pediatric Dermatology,Primary Care
0,Chest pain or discomfort (angina),1,0,0,0,0,0,0,0,0,0,0
0,Shortness of breath,1,0,0,0,0,0,0,0,0,0,0
0,Rapid or irregular heartbeats (arrhythmias),1,0,0,0,0,0,0,0,0,0,0
0,Heart failure,1,0,0,0,0,0,0,0,0,0,0
0,High blood pressure (hypertension),1,0,0,0,0,0,0,0,0,0,0


In [41]:
data.columns[1:]

Index(['Cardiology', 'Dermatology', 'Internal Medicine', 'Neurology',
       'Obstetrics', 'Obstetrics and Gynecology (OBGYN)', 'Ophthalmology',
       'Optometry', 'Pediatric Cardiology', 'Pediatric Dermatology',
       'Primary Care'],
      dtype='object')

In [18]:
X = data["common complains"]

In [42]:
specialities_labels = data[data.columns[1:]]

In [19]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

In [24]:
sentence_embedding = model.encode(X.values)

In [82]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(sentence_embedding, specialities_labels, test_size=0.3, random_state=42)

In [75]:
def objective(trial):
    keras.backend.clear_session()
    train_x, valid_x, train_y, valid_y = train_test_split(x_train, y_train, train_size=0.8, test_size=0.2,
                                                                random_state=42)
    #optimum number of hidden layers
    n_layers = trial.suggest_int('n_layers', 1, 3)
    model = keras.Sequential()
    for i in range(n_layers):
        #optimum number of hidden nodes
        num_hidden = trial.suggest_int(f'n_units_l{i}', 48, len(sentence_embedding[0]), log=True)
        #optimum activation function
        model.add(keras.layers.Dense(num_hidden, input_shape=(len(sentence_embedding[0]),),
                               activation=trial.suggest_categorical(f'activation{i}', ['relu', 'linear','swish'])))
        #optimum dropout value
        model.add(keras.layers.Dropout(rate = trial.suggest_float(f'dropout{i}', 0.0, 0.6))) 
    model.add(keras.layers.Dense(11,activation=tf.keras.activations.sigmoid)) #output Layer
    val_ds = (valid_x,valid_y)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,patience=1,min_lr=1e-05,verbose=0)
    early_stoping = EarlyStopping(monitor="val_loss",min_delta=0,patience=5,verbose=0,mode="auto", baseline=None,restore_best_weights=True)
    model.compile(loss='binary_crossentropy',metrics='categorical_crossentropy', optimizer='Adam')
    #optimum batch size
    histroy = model.fit(train_x,train_y, validation_data=val_ds,epochs=30,callbacks=[reduce_lr,early_stoping],verbose=0,
                       batch_size=trial.suggest_int('size', 8, 128))
    return min(histroy.history['val_loss'])

In [76]:
if __name__ == "__main__":
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50, timeout=1200)
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))

[32m[I 2023-02-12 11:07:18,857][0m A new study created in memory with name: no-name-21f3a890-5aee-4c77-b5aa-7fdba477fb6c[0m
[32m[I 2023-02-12 11:07:22,078][0m Trial 0 finished with value: 0.18274544179439545 and parameters: {'n_layers': 1, 'n_units_l0': 503, 'activation0': 'relu', 'dropout0': 0.24719261550579022, 'size': 40}. Best is trial 0 with value: 0.18274544179439545.[0m
[32m[I 2023-02-12 11:07:24,019][0m Trial 1 finished with value: 0.29155340790748596 and parameters: {'n_layers': 2, 'n_units_l0': 568, 'activation0': 'swish', 'dropout0': 0.5171165229656651, 'n_units_l1': 546, 'activation1': 'relu', 'dropout1': 0.3191537230731771, 'size': 118}. Best is trial 0 with value: 0.18274544179439545.[0m
[32m[I 2023-02-12 11:07:27,042][0m Trial 2 finished with value: 0.4127403795719147 and parameters: {'n_layers': 1, 'n_units_l0': 89, 'activation0': 'relu', 'dropout0': 0.5462005388021194, 'size': 102}. Best is trial 0 with value: 0.18274544179439545.[0m
[32m[I 2023-02-12 11:0

[32m[I 2023-02-12 11:08:21,759][0m Trial 24 finished with value: 0.18753819167613983 and parameters: {'n_layers': 2, 'n_units_l0': 189, 'activation0': 'linear', 'dropout0': 0.080900163280017, 'n_units_l1': 48, 'activation1': 'linear', 'dropout1': 0.22546972203966864, 'size': 32}. Best is trial 12 with value: 0.16334299743175507.[0m
[32m[I 2023-02-12 11:08:24,093][0m Trial 25 finished with value: 0.15930889546871185 and parameters: {'n_layers': 3, 'n_units_l0': 404, 'activation0': 'linear', 'dropout0': 0.00027060300753695143, 'n_units_l1': 109, 'activation1': 'relu', 'dropout1': 0.13890294472591902, 'n_units_l2': 147, 'activation2': 'relu', 'dropout2': 0.014257357486020639, 'size': 17}. Best is trial 25 with value: 0.15930889546871185.[0m
[32m[I 2023-02-12 11:08:25,578][0m Trial 26 finished with value: 0.33610987663269043 and parameters: {'n_layers': 3, 'n_units_l0': 365, 'activation0': 'linear', 'dropout0': 0.05771787822058421, 'n_units_l1': 107, 'activation1': 'relu', 'dropout

[32m[I 2023-02-12 11:09:12,492][0m Trial 46 finished with value: 0.1596813052892685 and parameters: {'n_layers': 1, 'n_units_l0': 455, 'activation0': 'linear', 'dropout0': 0.06905476307744957, 'size': 22}. Best is trial 41 with value: 0.15399551391601562.[0m
[32m[I 2023-02-12 11:09:15,651][0m Trial 47 finished with value: 0.17181110382080078 and parameters: {'n_layers': 1, 'n_units_l0': 386, 'activation0': 'linear', 'dropout0': 0.025694584665453022, 'size': 31}. Best is trial 41 with value: 0.15399551391601562.[0m
[32m[I 2023-02-12 11:09:18,498][0m Trial 48 finished with value: 0.25041767954826355 and parameters: {'n_layers': 1, 'n_units_l0': 330, 'activation0': 'linear', 'dropout0': 0.21747759477218548, 'size': 97}. Best is trial 41 with value: 0.15399551391601562.[0m
[32m[I 2023-02-12 11:09:21,724][0m Trial 49 finished with value: 0.1679384559392929 and parameters: {'n_layers': 1, 'n_units_l0': 354, 'activation0': 'linear', 'dropout0': 0.0911320509881188, 'size': 23}. Best

Number of finished trials: 50
Best trial:
  Value: 0.15399551391601562


In [77]:
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

  Params: 
    n_layers: 2
    n_units_l0: 459
    activation0: linear
    dropout0: 0.09056134729351364
    n_units_l1: 78
    activation1: swish
    dropout1: 0.14154633177645481
    size: 8


In [83]:
def wider_model():
    model = keras.Sequential()
    model.add(keras.layers.Dense(459,input_shape=(len(sentence_embedding[0]),),activation=tf.keras.activations.relu))
    model.add(keras.layers.Dropout(0.09056134729351364))
    model.add(keras.layers.Dense(11,activation=tf.keras.activations.sigmoid))
    return model

In [85]:
skf = KFold(n_splits=5, shuffle=True, random_state=42)
Final_Subbmission = []
val_loss_print = []
i=1
for train_index, test_index in skf.split(x_train,y_train):
    keras.backend.clear_session()
    print('#################')
    print(i)
    print('#################')
    X_train, X_test = x_train[train_index], x_train[test_index]
    train_y, test_y = y_train.iloc[train_index], y_train.iloc[test_index]
    model = wider_model()
    val_ds = (X_test,test_y)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,patience=1,min_lr=1e-05,verbose=1)
    early_stoping = EarlyStopping(monitor="val_loss",min_delta=0,patience=5,verbose=1,mode="auto", baseline=None,restore_best_weights=True)
    model.compile(loss='binary_crossentropy',metrics='categorical_crossentropy', optimizer='Adam')
    histroy = model.fit(X_train,train_y, validation_data=val_ds,epochs=200,callbacks=[reduce_lr,early_stoping],verbose=1,batch_size=34)
    print(min(histroy.history['val_loss']))
    val_loss_print.append(min(histroy.history['val_loss']))
    Test_seq_pred = model.predict(x_test)
    Final_Subbmission.append(Test_seq_pred)
    i=i+1

#################
1
#################
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200

Epoch 00050: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 51/200

Epoch 00051: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 52/200

Epoch 00052: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 53/200
Epoch 54/200
Restoring model weights from the end of 


Epoch 00038: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 39/200
Epoch 40/200
Restoring model weights from the end of the best epoch.
Epoch 00040: early stopping
0.15940071642398834
#################
3
#################
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200

Epoch 00038: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 39/200

Epoch 00039: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 40/200

Epoch 00040: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 41/200
Epoch 42/200


Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Restoring model weights from the end of the best epoch.
Epoch 00051: early stopping
0.18224507570266724
#################
4
#################
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200

Epoch 00038: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 39/200

Epoch 00039: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 40/200

Epoch 00040: ReduceLROnPlateau reducing learning rate to 1e-05.
E

Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/2

Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
0.14692209661006927
#################
5
#################
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200


Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200

Epoch 00051: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 52/200

Epoch 00052: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 53/200

Epoch 00053: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 54/200
Epoch 55/200
Restoring model weights from the end of the best epoch.
Epoch 00055: early stopping
0.12115444988012314


In [87]:
test_prob =np.mean(Final_Subbmission,0)
test_prob = pd.DataFrame(test_prob)
test_prob.columns = specialities_labels.columns

In [88]:
y_test1= y_test.reset_index(drop=True)
print("precision: {:.2f} ".format( metrics.average_precision_score(y_test1, test_prob, average='weighted')))

precision: 0.77 


In [92]:
test_prob

Unnamed: 0,Cardiology,Dermatology,Internal Medicine,Neurology,Obstetrics,Obstetrics and Gynecology (OBGYN),Ophthalmology,Optometry,Pediatric Cardiology,Pediatric Dermatology,Primary Care
0,0.033538,0.058927,0.058335,0.716033,0.01364,0.018765,0.053051,0.047617,0.023775,0.007814,0.027823
1,0.001995,0.003762,0.005888,0.063716,0.003783,0.011201,0.163877,0.200203,0.007098,0.003783,0.003913
2,0.002918,0.013443,0.002398,0.006187,0.000301,0.003011,0.549651,0.013646,0.001975,0.018217,0.001097
3,0.001543,0.575407,0.011175,0.018798,0.005517,0.017264,0.004405,0.005444,0.002961,0.171417,0.019914
4,0.039048,0.009366,0.025215,0.561007,0.002206,0.003529,0.093352,0.009197,0.009129,0.008846,0.006568
5,0.034844,0.012249,0.008247,0.006757,0.170622,0.634945,0.027707,0.018917,0.019547,0.052981,0.01887
6,0.187277,0.002218,0.017119,0.272148,0.008269,0.021935,0.010136,0.050219,0.096232,0.002369,0.024557
7,0.005995,0.622884,0.006937,0.018469,0.00199,0.012093,0.028813,0.002994,0.002844,0.217218,0.006386
8,0.011525,0.015204,0.53793,0.005446,0.062644,0.005426,0.003542,0.015826,0.018893,0.019864,0.469686
9,0.856081,0.002387,0.010975,0.00909,0.004723,0.009906,0.005514,0.007411,0.111084,0.005296,0.013802
