<a href="https://colab.research.google.com/github/hasindu-s/surgery-duration-prediction/blob/main/Complication_Classification_%2B_Statistical_Duration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the knee-surgery dataset (path is relative to the working directory)
# and preview the first rows.
csv_path = 'Knee Data with ASA.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,ahrq_ccs,age,gender,asa_status,bmi,baseline_cancer,baseline_cvd,baseline_dementia,baseline_diabetes,baseline_digestive,baseline_osteoart,baseline_psych,baseline_pulmonary,hour,complication
0,Arthroplasty knee,65.8,F,I-II,33.99,No,Yes,No,No,No,Yes,No,No,7.62,No
1,Arthroplasty knee,65.1,F,I-II,30.12,No,Yes,No,No,Yes,Yes,No,No,10.02,Yes
2,Arthroplasty knee,65.8,M,III,44.81,No,Yes,No,No,No,Yes,No,No,6.7,No
3,Arthroplasty knee,62.2,F,I-II,40.03,No,No,No,No,No,Yes,No,No,16.97,No
4,Arthroplasty knee,70.1,M,IV-VI,26.79,No,Yes,No,Yes,No,Yes,No,Yes,6.9,No


## Preprocessing

In [None]:
# Count missing values per column (`isna` is the alias of `isnull`).
df.isna().sum()

ahrq_ccs              0
age                   0
gender                0
asa_status            0
bmi                   0
baseline_cancer       0
baseline_cvd          0
baseline_dementia     0
baseline_diabetes     0
baseline_digestive    0
baseline_osteoart     0
baseline_psych        0
baseline_pulmonary    0
hour                  0
complication          0
dtype: int64

In [None]:
# Remove the procedure-name column before encoding.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# makes this cell safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns='ahrq_ccs', errors='ignore')

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Yes/No comorbidity flags that are all encoded the same way.
labels = [
  'baseline_cancer',
  'baseline_cvd',
  'baseline_dementia',
  'baseline_diabetes',
  'baseline_digestive',
  'baseline_osteoart',
  'baseline_psych',
  'baseline_pulmonary',
]

le_baseline = LabelEncoder()
le_gender = LabelEncoder()
le_asa = LabelEncoder()
le_compl = LabelEncoder()

# `le_baseline` is re-fit on every flag column in turn, so after the loop it
# only holds the classes of the last column in `labels`.
for column in labels:
  df[column] = le_baseline.fit_transform(df[column])

# Each remaining categorical column gets its own dedicated encoder.
for column, encoder in (
  ('gender', le_gender),
  ('asa_status', le_asa),
  ('complication', le_compl),
):
  df[column] = encoder.fit_transform(df[column])

# Truncate fractional ages to whole years (e.g. 65.8 -> 65).
df['age'] = df['age'].astype(int)

df.head()

Unnamed: 0,age,gender,asa_status,bmi,baseline_cancer,baseline_cvd,baseline_dementia,baseline_diabetes,baseline_digestive,baseline_osteoart,baseline_psych,baseline_pulmonary,hour,complication
0,65,0,0,33.99,0,1,0,0,0,1,0,0,7.62,0
1,65,0,0,30.12,0,1,0,0,1,1,0,0,10.02,1
2,65,1,1,44.81,0,1,0,0,0,1,0,0,6.7,0
3,62,0,0,40.03,0,0,0,0,0,1,0,0,16.97,0
4,70,1,2,26.79,0,1,0,1,0,1,0,1,6.9,0


## Training Model

In [None]:
from sklearn.model_selection import train_test_split

# The classifier sees only the baseline comorbidity flags: the demographic
# columns, the duration (`hour`) and the target itself are removed from X.
excluded_columns = ['age', 'bmi', 'asa_status', 'gender', 'hour', 'complication']
features = df.drop(columns=excluded_columns)
target = df['complication']

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  features, target, test_size=0.2, random_state=4
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest as well as the split: without `random_state` every run trains
# a different model, so the reported accuracy and the saved model file cannot be
# reproduced. The seed value mirrors the one used for `train_test_split`.
rfc = RandomForestClassifier(random_state=4)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [None]:
from sklearn.metrics import accuracy_score

# Score the fitted forest on the held-out 20 %.
# NOTE(review): if complications are rare, accuracy alone can look high for a
# model that mostly predicts "no complication" - consider also checking a
# confusion matrix.
y_pred = rfc.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score:', test_accuracy)

Accuracy score: 0.9377049180327869


## Saving Model

In [None]:
import joblib

# Persist the fitted classifier; `duration_predict` reloads it from this file.
model_path = 'random_forest_classifier_model.sav'
joblib.dump(rfc, model_path)

['random_forest_classifier_model.sav']

In [None]:
# Round-trip check: read the classifier back from the file written above.
model = joblib.load(filename='random_forest_classifier_model.sav')

## Choosing discriminating features

In [None]:
# Number of rows in the dataset.
df.shape[0]

3048

In [None]:
# Split the encoded data by outcome: 1 = complication, 0 = no complication.
c_y = df.loc[df['complication'].eq(1)]
c_n = df.loc[df['complication'].eq(0)]

In [None]:
# Pairwise correlations of the numeric columns among patients WITH a complication.
numeric_columns = ['bmi', 'hour', 'age']
c_y.loc[:, numeric_columns].corr()

Unnamed: 0,bmi,hour,age
bmi,1.0,0.06152,-0.115922
hour,0.06152,1.0,-0.116927
age,-0.115922,-0.116927,1.0


In [None]:
# Pairwise correlations of the numeric columns among patients WITHOUT a complication.
c_n.loc[:, ['bmi', 'hour', 'age']].corr()

Unnamed: 0,bmi,hour,age
bmi,1.0,0.053111,-0.216062
hour,0.053111,1.0,-0.086568
age,-0.216062,-0.086568,1.0


In [None]:
# Keep only patients aged 35 or older (ages are whole years at this point).
# Note: `c_y` and `c_n` were sliced from `df` before this filter, so they are
# not affected by it.
df = df.loc[df['age'].gt(34)]

In [None]:
step = 5    # width of each age bin, in years
start = 35  # youngest age covered by the first bin

# Surgery durations of patients WITH a complication, grouped into age bins.
# Ages are integers, so `lower < age <= lower + step` with lower = start - 1
# selects ages 35-39, 40-44, ..., 90-94 (12 bins). `step` and `start` now
# drive the binning instead of sitting unused next to hard-coded 34 / 5.
hour_y = [
  np.array(c_y.query(f'{lower} < age <= {lower + step}')['hour'])
  for lower in range(start - 1, 90, step)
]
hour_n = []  # no-complication bins; populated in a later cell

hour_y

[array([11.68, 10.97]),
 array([10.72]),
 array([12.1 , 12.05,  8.2 ,  7.93, 11.9 ,  7.7 , 12.5 , 14.23, 11.87,
         7.38,  9.25,  7.65,  8.88, 10.35]),
 array([ 7.85, 13.82, 10.38, 10.6 ,  7.25, 10.08,  8.73, 12.32, 12.95,
        11.47, 15.65,  7.75, 10.5 , 12.78,  7.53,  7.45, 10.37,  7.3 ,
        14.87]),
 array([13.08, 11.17,  6.47,  7.67, 17.75, 12.63,  7.4 ,  8.88,  7.35,
        11.47, 15.1 ,  9.43,  9.03, 11.53,  6.63,  7.  ,  7.35,  7.77,
         7.25,  7.6 , 10.08,  7.92,  7.47, 16.05, 12.62,  7.83,  7.07,
         8.13, 14.47, 14.23, 11.82,  7.37,  7.88]),
 array([10.17,  7.68, 10.48,  6.85, 15.32, 14.32,  8.32,  9.82,  7.47,
         8.1 , 11.37, 11.85,  7.25, 10.27, 11.43,  7.23, 13.  , 12.7 ,
         8.92, 10.63, 14.95,  7.63, 13.35,  7.95,  7.2 ,  9.1 ,  9.23,
        12.22, 11.38,  7.92]),
 array([10.02,  7.32, 12.83,  9.05, 11.12, 10.35,  7.6 ,  9.6 ,  8.25,
        12.87,  7.85, 14.83,  8.95, 11.72, 12.97, 12.27,  7.33,  8.68,
         7.72,  8.97, 13.67, 12.0

In [None]:
# Surgery durations of patients WITHOUT a complication, in the same age bins
# as `hour_y`. The list is rebuilt from scratch (rather than appended to), so
# re-running this cell no longer duplicates the bins.
hour_n = [
  np.array(c_n.query(f'{lower} < age <= {lower + step}')['hour'])
  for lower in range(start - 1, 90, step)
]

In [None]:
# Per-bin [mean, std] of surgery duration for the no-complication group,
# persisted so `duration_predict` can look the values up later.
stats_n = []
for durations in hour_n:
  stats_n.append([durations.mean(), durations.std()])
joblib.dump(stats_n, 'stats_no')

['stats_no']

In [None]:
# Read the saved no-complication statistics back to confirm what was stored.
sta = joblib.load(filename='stats_no')
sta

[[10.6556, 2.6498106800298014],
 [10.695166666666664, 2.9930344087935614],
 [10.15461038961039, 2.670879064666514],
 [9.65, 2.6976131879431313],
 [9.77910843373494, 2.5804779468201735],
 [9.888246346555324, 2.6496677725061764],
 [9.700325581395349, 2.5868820931314587],
 [9.730980926430517, 2.665436581743847],
 [9.298658536585366, 2.309751339269315],
 [9.40360946745562, 2.310153941212515],
 [9.449411764705882, 2.5209789662146895],
 [9.2325, 2.0031979640897535]]

In [None]:
# Per-bin [mean, std] of surgery duration for the complication group.
stats_y = [[arr.mean(), arr.std()] for arr in hour_y]
# Fix: save `stats_y` here. Previously `stats_n` was dumped under the
# 'stats_yes' name, so both files held the no-complication statistics.
joblib.dump(stats_y, 'stats_yes')
stats_y

[[11.325, 0.35499999999999954],
 [10.72, 0.0],
 [10.142142857142858, 2.1830619995496123],
 [10.507894736842108, 2.6052206269787317],
 [9.863636363636363, 3.0708253931217455],
 [10.137000000000002, 2.458210934806043],
 [10.230869565217393, 2.4960044820254463],
 [9.2584375, 1.8267955368879545],
 [9.783225806451613, 2.6507301332527287],
 [9.939130434782609, 2.5179717735812654],
 [7.454000000000001, 0.6008527273800128],
 [8.765, 1.9349999999999996]]

## Function for returning predictions

In [None]:
def duration_predict(age, X):
  """Return the [mean, std] surgery-duration statistics for one patient.

  The saved classifier predicts from ``X`` whether a complication is expected;
  the statistics are then read from the matching table ('stats_yes' for a
  predicted complication, 'stats_no' otherwise) at the patient's age bin.

  Args:
    age: Patient age in years. Must satisfy 35 <= age < 95, which is the span
      of the twelve five-year bins [35, 40), [40, 45), ..., [90, 95).
    X: A single-row feature matrix accepted by the saved model's ``predict``;
      only the first prediction is used.

  Returns:
    The ``[mean, std]`` entry stored for the patient's age bin.

  Raises:
    ValueError: If ``age`` lies outside the covered range. Previously such
      ages silently fell through to the last bin (90-94), because the
      fallback branch could never be reached.
  """
  first_age, last_age, bin_width = 35, 95, 5

  # Validate before touching the disk; written as `not (a <= age < b)` so a
  # NaN age is rejected as well.
  if not first_age <= age < last_age:
    raise ValueError(
      f'age must be in [{first_age}, {last_age}) to match the saved age bins, got {age!r}'
    )

  # NOTE: joblib.load unpickles the files; only load artifacts written by this notebook.
  model = joblib.load('random_forest_classifier_model.sav')
  stats_yes = joblib.load('stats_yes')
  stats_no = joblib.load('stats_no')

  complication = model.predict(X)[0]

  # Bins are contiguous and equally wide, so the index follows arithmetically.
  index = int((age - first_age) // bin_width)

  stats = stats_yes if complication == 1 else stats_no
  return stats[index]

In [None]:
# One example patient. The row is wrapped in a DataFrame carrying the training
# column names so scikit-learn does not warn that
# "X does not have valid feature names" when the model predicts.
X = pd.DataFrame([[0, 1, 0, 1, 1, 1, 0, 1]], columns=X_train.columns)
print(duration_predict(40, X))

[10.695166666666664, 2.9930344087935614]


  "X does not have valid feature names, but"
