In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.api.types import is_numeric_dtype, is_string_dtype
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler  # Added StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [3]:
df=pd.read_csv("../datasets/diabetes_data_ml.csv")

In [4]:
df.head(10)

Unnamed: 0,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,readmitted,total_visits,race_Asian,...,diag_2_Unknown,diag_3_Diabetes,diag_3_Digestive,diag_3_Genitourinary,diag_3_Injury,diag_3_Musculoskeletal,diag_3_Neoplasms,diag_3_Other,diag_3_Respiratory,diag_3_Unknown
0,0,5,1.0,41.0,0.0,1.0,1.5,0,0.0,False,...,True,False,False,False,False,False,False,False,False,True
1,0,15,3.0,59.0,0.0,18.0,9.0,0,0.0,False,...,False,False,False,False,False,False,False,True,False,False
2,0,25,2.0,11.0,5.0,13.0,6.0,0,3.0,False,...,False,False,False,False,False,False,False,True,False,False
3,1,35,2.0,44.0,1.0,16.0,7.0,0,0.0,False,...,False,False,False,False,False,False,False,False,False,False
4,1,45,1.0,51.0,0.0,8.0,5.0,0,0.0,False,...,False,True,False,False,False,False,False,False,False,False
5,1,55,3.0,31.0,5.0,16.0,9.0,0,0.0,False,...,False,True,False,False,False,False,False,False,False,False
6,1,65,4.0,70.0,1.0,21.0,7.0,0,0.0,False,...,False,False,False,False,False,False,False,True,False,False
7,1,75,5.0,73.0,0.0,12.0,8.0,0,0.0,False,...,False,True,False,False,False,False,False,False,False,False
8,0,85,12.0,68.0,2.0,28.0,8.0,0,0.0,False,...,False,False,False,False,False,False,False,True,False,False
9,0,95,12.0,33.0,3.0,18.0,8.0,0,0.0,False,...,False,False,False,False,False,False,False,False,True,False


In [5]:
X = df.drop('readmitted', axis=1)
y = df.readmitted

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
print('X_train:', X_train.shape)
print('Y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('Y_test:', y_test.shape)

X_train: (69524, 70)
Y_train: (69524,)
X_test: (29796, 70)
Y_test: (29796,)


In [7]:
scores = pd.DataFrame(columns=['Model_Name', 'Train_Accuracy', 'Test_Accuracy','Train_f1', 'Train_precision', 'Train_recall', 'Train_auc_roc','Test_f1',
                               'Test_precision', 'Test_recall', 'Test_auc_roc'])
scores

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc


In [8]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("lr", LogisticRegression()),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Logistic Regression Base",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.8857660663943386
Testing Accuracy
0.8865619546247818
0.8865619546247818
Cross validation score 0.8857229188364742
Cross validation score 0.8857229188364742
[[26414     6]
 [ 3374     2]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     26420
           1       0.25      0.00      0.00      3376

    accuracy                           0.89     29796
   macro avg       0.57      0.50      0.47     29796
weighted avg       0.81      0.89      0.83     29796

precision 0.25
recall 0.0005924170616113745
f1_score 0.001182033096926714
roc_auc_score 0.5001826581901546
[[61575    12]
 [ 7930     7]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     61587
           1       0.37      0.00      0.00      7937

    accuracy                           0.89     69524
   macro avg       0.63      0.50      0.47     69524
weighted avg       0.83      0.89      0.83     69524

p

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183


In [9]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("lr", LogisticRegression(class_weight='balanced')),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Logistic Regression Balanced",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.5997353431908405
Testing Accuracy
0.6034367029131427
0.6034367029131427
Cross validation score 0.5997353600318451
Cross validation score 0.5997353600318451
[[15846 10574]
 [ 1242  2134]]
              precision    recall  f1-score   support

           0       0.93      0.60      0.73     26420
           1       0.17      0.63      0.27      3376

    accuracy                           0.60     29796
   macro avg       0.55      0.62      0.50     29796
weighted avg       0.84      0.60      0.68     29796

precision 0.1679257160843563
recall 0.6321090047393365
f1_score 0.26535687639890576
roc_auc_score 0.6159409520290172
[[36764 24823]
 [ 3005  4932]]
              precision    recall  f1-score   support

           0       0.92      0.60      0.73     61587
           1       0.17      0.62      0.26      7937

    accuracy                           0.60     69524
   macro avg       0.55      0.61      0.49     69524
weighted avg       0.84      0.60      0.67   

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941


In [10]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("dt", DecisionTreeClassifier(random_state=10)),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Decision Tree",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.9999856164777631
Testing Accuracy
0.7897368774332125
0.9999856164777631
Testing Accuracy
0.7897368774332125
Cross validation score 0.7901013545805852
Cross validation score 0.7901013545805852
[[22943  3477]
 [ 2788   588]]
              precision    recall  f1-score   support

           0       0.89      0.87      0.88     26420
           1       0.14      0.17      0.16      3376

    accuracy                           0.79     29796
   macro avg       0.52      0.52      0.52     29796
weighted avg       0.81      0.79      0.80     29796

precision 0.14464944649446496
recall 0.17417061611374407
f1_score 0.15804327375352775
roc_auc_score 0.5212828856496049
[[61587     0]
 [    1  7936]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     61587
           1       1.00      1.00      1.00      7937

    accuracy                           1.00     69524
   macro avg       1.00      1.00      1.00     69524
weighted

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283


In [11]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("dt", DecisionTreeClassifier(random_state=10,class_weight='balanced')),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Decision Tree Balanced",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.9999856164777631
Testing Accuracy
0.7961471338434689
0.7961471338434689
Cross validation score 0.8011478493922883
Cross validation score 0.8011478493922883
[[23175  3245]
 [ 2829   547]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     26420
           1       0.14      0.16      0.15      3376

    accuracy                           0.80     29796
   macro avg       0.52      0.52      0.52     29796
weighted avg       0.81      0.80      0.80     29796

precision 0.14425105485232068
recall 0.1620260663507109
f1_score 0.15262276785714285
roc_auc_score 0.5196012239399277
[[61586     1]
 [    0  7937]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     61587
           1       1.00      1.00      1.00      7937

    accuracy                           1.00     69524
   macro avg       1.00      1.00      1.00     69524
weighted avg       1.00      1.00      1.00  

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601


In [12]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("dt", DecisionTreeClassifier(random_state=10,class_weight='balanced',criterion='entropy',max_depth=5)),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Decision Tree Tuned",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.6439215235026753
Testing Accuracy
0.6468317895019465
0.6468317895019465
Cross validation score 0.6252374602193331
Cross validation score 0.6252374602193331
[[17376  9044]
 [ 1479  1897]]
              precision    recall  f1-score   support

           0       0.92      0.66      0.77     26420
           1       0.17      0.56      0.26      3376

    accuracy                           0.65     29796
   macro avg       0.55      0.61      0.52     29796
weighted avg       0.84      0.65      0.71     29796

precision 0.17338451695457455
recall 0.5619075829383886
f1_score 0.264999650764825
roc_auc_score 0.6097955779945539
[[40243 21344]
 [ 3412  4525]]
              precision    recall  f1-score   support

           0       0.92      0.65      0.76     61587
           1       0.17      0.57      0.27      7937

    accuracy                           0.64     69524
   macro avg       0.55      0.61      0.52     69524
weighted avg       0.84      0.64      0.71    

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796


In [13]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("rf", RandomForestClassifier(random_state=10)),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Random Forest",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.9999424659110523
Testing Accuracy
0.9999424659110523
Testing Accuracy
0.886796885487985
0.886796885487985
Cross validation score 0.8857948343787718
Cross validation score 0.8857948343787718
[[26413     7]
 [ 3366    10]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     26420
           1       0.59      0.00      0.01      3376

    accuracy                           0.89     29796
   macro avg       0.74      0.50      0.47     29796
weighted avg       0.85      0.89      0.83     29796

precision 0.5882352941176471
recall 0.002962085308056872
f1_score 0.0058944886531093425
roc_auc_score 0.5013485672566024
[[61587     0]
 [    4  7933]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     61587
           1       1.00      1.00      1.00      7937

    accuracy                           1.00     69524
   macro avg       1.00      1.00      1.00     69524
weighted

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349


In [14]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("rf", RandomForestClassifier(random_state=10,class_weight='balanced')),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Random Forest Balanced",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.9999712329555261
Testing Accuracy
0.9999712329555261
Testing Accuracy
0.8867633239360988
0.8867633239360988
Cross validation score 0.8857372990484087
Cross validation score 0.8857372990484087
[[26415     5]
 [ 3369     7]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     26420
           1       0.58      0.00      0.00      3376

    accuracy                           0.89     29796
   macro avg       0.74      0.50      0.47     29796
weighted avg       0.85      0.89      0.83     29796

precision 0.5833333333333334
recall 0.0020734597156398106
f1_score 0.004132231404958678
roc_auc_score 0.5009421045739441
[[61586     1]
 [    1  7936]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     61587
           1       1.00      1.00      1.00      7937

    accuracy                           1.00     69524
   macro avg       1.00      1.00      1.00     69524
weight

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942


In [15]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("rf", RandomForestClassifier(random_state=10,n_estimators=5,max_depth=3,class_weight='balanced')),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Random Forest Tuned",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.6542776595132616
Testing Accuracy
0.6594844945630286
0.6542776595132616
Testing Accuracy
0.6594844945630286
Cross validation score 0.6054315770461156
Cross validation score 0.6054315770461156
[[17858  8562]
 [ 1584  1792]]
              precision    recall  f1-score   support

           0       0.92      0.68      0.78     26420
           1       0.17      0.53      0.26      3376

    accuracy                           0.66     29796
   macro avg       0.55      0.60      0.52     29796
weighted avg       0.83      0.66      0.72     29796

precision 0.17307320842186594
recall 0.5308056872037915
f1_score 0.261034231609614
roc_auc_score 0.6033665074928874
[[41330 20257]
 [ 3779  4158]]
              precision    recall  f1-score   support

           0       0.92      0.67      0.77     61587
           1       0.17      0.52      0.26      7937

    accuracy                           0.65     69524
   macro avg       0.54      0.60      0.52     69524
weighted av

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942
7,Random Forest Tuned,0.605432,0.659484,0.257047,0.170305,0.523876,0.597479,0.261034,0.173073,0.530806,0.603367


In [16]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("xg", XGBClassifier()),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "XG boost",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.8922961854899027
Testing Accuracy
0.885756477379514
0.8922961854899027
Testing Accuracy
0.885756477379514
Cross validation score 0.8842701781238711
Cross validation score 0.8842701781238711
[[26344    76]
 [ 3328    48]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     26420
           1       0.39      0.01      0.03      3376

    accuracy                           0.89     29796
   macro avg       0.64      0.51      0.48     29796
weighted avg       0.83      0.89      0.84     29796

precision 0.3870967741935484
recall 0.014218009478672985
f1_score 0.027428571428571427
roc_auc_score 0.5056707004244236
[[61561    26]
 [ 7462   475]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     61587
           1       0.95      0.06      0.11      7937

    accuracy                           0.89     69524
   macro avg       0.92      0.53      0.53     69524
weighted 

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942
7,Random Forest Tuned,0.605432,0.659484,0.257047,0.170305,0.523876,0.597479,0.261034,0.173073,0.530806,0.603367
8,XG boost,0.88427,0.885756,0.112586,0.948104,0.059846,0.529712,0.027429,0.387097,0.014218,0.505671


In [17]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("xg", XGBClassifier(scale_pos_weight=7.73)),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "XG boost balanced",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.7389390713998044
Testing Accuracy
0.6833467579540878
0.7389390713998044
Testing Accuracy
0.6833467579540878
Cross validation score 0.6821529207936975
Cross validation score 0.6821529207936975
[[18654  7766]
 [ 1669  1707]]
              precision    recall  f1-score   support

           0       0.92      0.71      0.80     26420
           1       0.18      0.51      0.27      3376

    accuracy                           0.68     29796
   macro avg       0.55      0.61      0.53     29796
weighted avg       0.83      0.68      0.74     29796

precision 0.1801963475139871
recall 0.505627962085308
f1_score 0.26570161102031287
roc_auc_score 0.6058419901266812
[[45137 16450]
 [ 1700  6237]]
              precision    recall  f1-score   support

           0       0.96      0.73      0.83     61587
           1       0.27      0.79      0.41      7937

    accuracy                           0.74     69524
   macro avg       0.62      0.76      0.62     69524
weighted av

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942
7,Random Forest Tuned,0.605432,0.659484,0.257047,0.170305,0.523876,0.597479,0.261034,0.173073,0.530806,0.603367
8,XG boost,0.88427,0.885756,0.112586,0.948104,0.059846,0.529712,0.027429,0.387097,0.014218,0.505671
9,XG boost balanced,0.682153,0.683347,0.407328,0.274915,0.785813,0.759356,0.265702,0.180196,0.505628,0.605842


In [18]:
pipe = Pipeline((
    ("pt", PowerTransformer()),
    ("xg", XGBClassifier(learning_rate=.1,max_depth=4,gamma=0,scale_pos_weight=7.73)),
))

pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "XG boost Tuned",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.6348599044934123
Testing Accuracy
0.6295140287286884
0.6348599044934123
Testing Accuracy
0.6295140287286884
Cross validation score 0.6229216078279001
Cross validation score 0.6229216078279001
[[16709  9711]
 [ 1328  2048]]
              precision    recall  f1-score   support

           0       0.93      0.63      0.75     26420
           1       0.17      0.61      0.27      3376

    accuracy                           0.63     29796
   macro avg       0.55      0.62      0.51     29796
weighted avg       0.84      0.63      0.70     29796

precision 0.1741644697678374
recall 0.6066350710900474
f1_score 0.2706309877766766
roc_auc_score 0.6195363092013448
[[39087 22500]
 [ 2886  5051]]
              precision    recall  f1-score   support

           0       0.93      0.63      0.75     61587
           1       0.18      0.64      0.28      7937

    accuracy                           0.63     69524
   macro avg       0.56      0.64      0.52     69524
weighted av

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942
7,Random Forest Tuned,0.605432,0.659484,0.257047,0.170305,0.523876,0.597479,0.261034,0.173073,0.530806,0.603367
8,XG boost,0.88427,0.885756,0.112586,0.948104,0.059846,0.529712,0.027429,0.387097,0.014218,0.505671
9,XG boost balanced,0.682153,0.683347,0.407328,0.274915,0.785813,0.759356,0.265702,0.180196,0.505628,0.605842


In [19]:
from imblearn.pipeline import Pipeline 

In [20]:
pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("smote", SMOTE(random_state=42)),     
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier())
])
pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Random forest after SMOTE",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.9999712329555261
Testing Accuracy
0.9999712329555261
Testing Accuracy
0.8856893542757417
0.8856893542757417
Cross validation score 0.8849893594086786
Cross validation score 0.8849893594086786
[[26364    56]
 [ 3350    26]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     26420
           1       0.32      0.01      0.02      3376

    accuracy                           0.89     29796
   macro avg       0.60      0.50      0.48     29796
weighted avg       0.82      0.89      0.83     29796

precision 0.3170731707317073
recall 0.007701421800947867
f1_score 0.015037593984962405
roc_auc_score 0.5027909077210644
[[61587     0]
 [    2  7935]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     61587
           1       1.00      1.00      1.00      7937

    accuracy                           1.00     69524
   macro avg       1.00      1.00      1.00     69524
weighte

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942
7,Random Forest Tuned,0.605432,0.659484,0.257047,0.170305,0.523876,0.597479,0.261034,0.173073,0.530806,0.603367
8,XG boost,0.88427,0.885756,0.112586,0.948104,0.059846,0.529712,0.027429,0.387097,0.014218,0.505671
9,XG boost balanced,0.682153,0.683347,0.407328,0.274915,0.785813,0.759356,0.265702,0.180196,0.505628,0.605842


In [21]:
pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("smote", SMOTE(random_state=42)),     
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(random_state=10,class_weight='balanced'))
])
pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Random forest balanced after SMOTE",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.9999856164777631
Testing Accuracy
0.9999856164777631
Testing Accuracy
0.8859914082427172
0.8859914082427172
Cross validation score 0.8850612759854494
Cross validation score 0.8850612759854494
[[26367    53]
 [ 3344    32]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     26420
           1       0.38      0.01      0.02      3376

    accuracy                           0.89     29796
   macro avg       0.63      0.50      0.48     29796
weighted avg       0.83      0.89      0.84     29796

precision 0.3764705882352941
recall 0.009478672985781991
f1_score 0.01849176538572667
roc_auc_score 0.503736308483807
[[61587     0]
 [    1  7936]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     61587
           1       1.00      1.00      1.00      7937

    accuracy                           1.00     69524
   macro avg       1.00      1.00      1.00     69524
weighted 

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942
7,Random Forest Tuned,0.605432,0.659484,0.257047,0.170305,0.523876,0.597479,0.261034,0.173073,0.530806,0.603367
8,XG boost,0.88427,0.885756,0.112586,0.948104,0.059846,0.529712,0.027429,0.387097,0.014218,0.505671
9,XG boost balanced,0.682153,0.683347,0.407328,0.274915,0.785813,0.759356,0.265702,0.180196,0.505628,0.605842


In [22]:
pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("smote", SMOTE(random_state=42)),     
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(class_weight='balanced'))
])
pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "Logistic regression balanced after SMOTE",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.6097750417122145
Testing Accuracy
0.6133373607195597
0.6133373607195597
Cross validation score 0.6084948414959476
Cross validation score 0.6084948414959476
[[16220 10200]
 [ 1321  2055]]
              precision    recall  f1-score   support

           0       0.92      0.61      0.74     26420
           1       0.17      0.61      0.26      3376

    accuracy                           0.61     29796
   macro avg       0.55      0.61      0.50     29796
weighted avg       0.84      0.61      0.68     29796

precision 0.16768665850673195
recall 0.6087085308056872
f1_score 0.2629390314119378
roc_auc_score 0.6113186862961063
[[37639 23948]
 [ 3182  4755]]
              precision    recall  f1-score   support

           0       0.92      0.61      0.74     61587
           1       0.17      0.60      0.26      7937

    accuracy                           0.61     69524
   macro avg       0.54      0.61      0.50     69524
weighted avg       0.84      0.61      0.68   

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942
7,Random Forest Tuned,0.605432,0.659484,0.257047,0.170305,0.523876,0.597479,0.261034,0.173073,0.530806,0.603367
8,XG boost,0.88427,0.885756,0.112586,0.948104,0.059846,0.529712,0.027429,0.387097,0.014218,0.505671
9,XG boost balanced,0.682153,0.683347,0.407328,0.274915,0.785813,0.759356,0.265702,0.180196,0.505628,0.605842


In [23]:
pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("smote", SMOTE(random_state=42)),     
    ("scaler", StandardScaler()),
    ("xg", XGBClassifier(learning_rate=.1,max_depth=4,gamma=0,scale_pos_weight=7.73))
])
pipe.fit(X_train, y_train)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))

te_acc = pipe.score(X_test, y_test)
score = cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross validation score", np.mean(score))
tr_acc = np.mean(score)

# Predictions
pred = pipe.predict(X_test)
predx = pipe.predict(X_train)

# Test metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
p = precision_score(y_test, pred)
print("precision", p)
r = recall_score(y_test, pred)
print("recall", r)
f = f1_score(y_test, pred)
print("f1_score", f)
au = roc_auc_score(y_test, pred)
print("roc_auc_score", au)

# Train metrics
print(confusion_matrix(y_train, predx))
print(classification_report(y_train, predx))
tp = precision_score(y_train, predx)
print("precision", tp)
tr = recall_score(y_train, predx)
print("recall", tr)
tf = f1_score(y_train, predx)
print("f1_score", tf)
tau = roc_auc_score(y_train, predx)
print("roc_auc_score", tau)

# Save results
add = pd.Series({
    'Model_Name': "XGBclassifier after SMOTE",
    'Train_Accuracy': tr_acc,
    'Test_Accuracy': te_acc,
    'Test_f1': f,
    'Test_recall': r,
    'Test_precision': p,
    'Test_auc_roc': au,
    'Train_f1': tf,
    'Train_recall': tr,
    'Train_precision': tp,
    'Train_auc_roc': tau
})
scores = pd.concat([scores, add.to_frame().T], ignore_index=True)
scores

Training Accuracy
0.46897474253495197
Testing Accuracy
0.4635185930997449
0.46897474253495197
Testing Accuracy
0.4635185930997449
Cross validation score 0.4702692506152012
Cross validation score 0.4702692506152012
[[11218 15202]
 [  783  2593]]
              precision    recall  f1-score   support

           0       0.93      0.42      0.58     26420
           1       0.15      0.77      0.24      3376

    accuracy                           0.46     29796
   macro avg       0.54      0.60      0.41     29796
weighted avg       0.85      0.46      0.55     29796

precision 0.14571508850800788
recall 0.768068720379147
f1_score 0.24495772519011855
roc_auc_score 0.5963356470934341
[[26301 35286]
 [ 1633  6304]]
              precision    recall  f1-score   support

           0       0.94      0.43      0.59     61587
           1       0.15      0.79      0.25      7937

    accuracy                           0.47     69524
   macro avg       0.55      0.61      0.42     69524
weighted

Unnamed: 0,Model_Name,Train_Accuracy,Test_Accuracy,Train_f1,Train_precision,Train_recall,Train_auc_roc,Test_f1,Test_precision,Test_recall,Test_auc_roc
0,Logistic Regression Base,0.885723,0.886562,0.00176,0.368421,0.000882,0.500344,0.001182,0.25,0.000592,0.500183
1,Logistic Regression Balanced,0.599735,0.603437,0.2617,0.165754,0.621393,0.609169,0.265357,0.167926,0.632109,0.615941
2,Decision Tree,0.790101,0.789737,0.999937,1.0,0.999874,0.999937,0.158043,0.144649,0.174171,0.521283
3,Decision Tree Balanced,0.801148,0.796147,0.999937,0.999874,1.0,0.999992,0.152623,0.144251,0.162026,0.519601
4,Decision Tree Tuned,0.625237,0.646832,0.267704,0.17492,0.570115,0.611774,0.265,0.173385,0.561908,0.609796
5,Random Forest,0.885795,0.886797,0.999748,1.0,0.999496,0.999748,0.005894,0.588235,0.002962,0.501349
6,Random Forest Balanced,0.885737,0.886763,0.999874,0.999874,0.999874,0.999929,0.004132,0.583333,0.002073,0.500942
7,Random Forest Tuned,0.605432,0.659484,0.257047,0.170305,0.523876,0.597479,0.261034,0.173073,0.530806,0.603367
8,XG boost,0.88427,0.885756,0.112586,0.948104,0.059846,0.529712,0.027429,0.387097,0.014218,0.505671
9,XG boost balanced,0.682153,0.683347,0.407328,0.274915,0.785813,0.759356,0.265702,0.180196,0.505628,0.605842


In [24]:
best_model=pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("smote", SMOTE(random_state=42)),     
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(class_weight='balanced'))
])

# Train it on your data
best_model.fit(X_train, y_train)

print("Best model trained and ready!")

Best model trained and ready!


In [25]:
# Quick template creation
def create_patient(age=50, gender=1, hospital_days=3, diabetes=False):
    # Start with all False/0
    patient = {col: False if col.startswith(('race_', 'admission_', 'discharge_', 'payer_', 'insulin_', 'change_', 'diabetesMed_', 'diag_')) else 0 
               for col in X_train.columns}
    
    # Set the main values
    patient['age'] = age
    patient['gender'] = gender
    patient['time_in_hospital'] = hospital_days
    patient['num_lab_procedures'] = 10
    patient['num_procedures'] = 2
    patient['num_medications'] = 5
    patient['number_diagnoses'] = 3
    patient['total_visits'] = 2
    
    # Set required categorical defaults (pick one from each category)
    patient['race_Caucasian'] = True
    patient['admission_type_id_Emergency'] = True
    patient['payer_code_MC'] = True  # Medicare
    patient['insulin_No'] = True
    patient['change_No'] = True
    
    # Set diabetes if specified
    if diabetes:
        patient['diag_1_Diabetes'] = True
        patient['diabetesMed_Yes'] = True
    
    return patient

# Test different scenarios
high_risk = create_patient(age=75, gender=1, hospital_days=8, diabetes=True)
low_risk = create_patient(age=35, gender=0, hospital_days=2, diabetes=False)

# Test them
for name, case in [("High Risk", high_risk), ("Low Risk", low_risk)]:
    test_df = pd.DataFrame([case])
    prediction = best_model.predict(test_df)
    probability = best_model.predict_proba(test_df)
    
    print(f"{name}: {'READMITTED' if prediction[0] == 1 else 'NOT READMITTED'} ({probability[0][1]*100:.1f}%)")

High Risk: READMITTED (51.7%)
Low Risk: READMITTED (51.9%)


In [26]:
# ONE comprehensive test to rule them all
print("=== FINAL DEBUG SESSION ===")

# 1. Check if model is basically broken (all predictions ~50%)
test_probs = best_model.predict_proba(X_test[:100])[:, 1]
print(f"Random 100 patients: {test_probs.min()*100:.1f}% to {test_probs.max()*100:.1f}%")
print(f"Average prediction: {test_probs.mean()*100:.1f}%")

# 2. If there's actually variation, test extreme cases
if test_probs.max() - test_probs.min() > 0.1:  # If range > 10%
    print("\n✅ Model has variation - testing extreme cases...")
    
    # Extreme cases
    super_sick = create_patient(age=85, gender=1, hospital_days=20, diabetes=True)
    super_sick.update({
        'num_medications': 25, 'number_diagnoses': 9, 'total_visits': 10,
        'diag_2_Diabetes': True, 'diag_3_Diabetes': True
    })
    
    super_healthy = create_patient(age=30, gender=0, hospital_days=1, diabetes=False)
    super_healthy.update({
        'num_medications': 1, 'number_diagnoses': 1, 'total_visits': 1
    })
    
    for name, case in [("SUPER SICK", super_sick), ("SUPER HEALTHY", super_healthy)]:
        test_df = pd.DataFrame([case])
        prob = best_model.predict_proba(test_df)[0][1]
        print(f"{name}: {prob*100:.1f}%")
        
else:
    print("\n❌ Model is broken - predicting ~50% for everything")
    print("SOLUTION: Use Random Forest without SMOTE for deployment")

print("\n=== DEBUG COMPLETE ===")

=== FINAL DEBUG SESSION ===
Random 100 patients: 15.2% to 87.4%
Average prediction: 46.8%

✅ Model has variation - testing extreme cases...
SUPER SICK: 53.4%
SUPER HEALTHY: 43.5%

=== DEBUG COMPLETE ===


In [27]:
def create_patient_from_user_input():
    """Get patient info from user and create feature vector"""
    
    print("🏥 HOSPITAL READMISSION RISK CALCULATOR 🏥")
    print("=" * 50)
    print("Please enter patient information:")
    print()
    
    # Get basic info
    try:
        age = int(input("Age (20-100): "))
        gender = input("Gender (M/F): ").upper()
        gender_code = 1 if gender == 'M' else 0
        
        time_in_hospital = int(input("Days in hospital (1-15): "))
        num_medications = int(input("Number of medications (1-30): "))
        num_lab_procedures = int(input("Number of lab procedures (1-50): "))
        num_procedures = int(input("Number of procedures (0-10): "))
        number_diagnoses = int(input("Number of diagnoses (1-10): "))
        total_visits = int(input("Previous hospital visits (0-20): "))
        
        # Diabetes info
        has_diabetes = input("Does patient have diabetes? (Y/N): ").upper() == 'Y'
        on_diabetes_med = input("On diabetes medication? (Y/N): ").upper() == 'Y' if has_diabetes else False
        
        # Race/ethnicity
        print("\nRace/Ethnicity:")
        print("1. Caucasian")
        print("2. Asian") 
        print("3. Hispanic")
        print("4. Other")
        race_choice = int(input("Select race (1-4): "))
        
        # Admission type
        print("\nAdmission Type:")
        print("1. Emergency")
        print("2. Not Available/Other")
        admission_choice = int(input("Select admission type (1-2): "))
        
        # Insurance
        print("\nInsurance/Payer:")
        print("1. Medicare (MC)")
        print("2. Other")
        insurance_choice = int(input("Select insurance (1-2): "))
        
    except ValueError:
        print("❌ Invalid input! Please enter numbers where requested.")
        return None
    
    # Create patient feature vector with all 67 columns
    patient = {}
    
    # Initialize all columns to False/0
    all_columns = [
        'gender', 'age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 
        'num_medications', 'number_diagnoses', 'total_visits', 'race_Asian', 
        'race_Caucasian', 'race_Hispanic', 'race_Other', 'admission_type_id_Emergency', 
        'admission_type_id_Not Available', 'discharge_disposition_id_Left AMA', 
        'discharge_disposition_id_Not Available', 'discharge_disposition_id_Still patient/referred to this institution', 
        'discharge_disposition_id_Transferred to another facility', 'admission_source_id_Not Available', 
        'admission_source_id_Referral', 'admission_source_id_Transferred from hospital', 
        'payer_code_CH', 'payer_code_CM', 'payer_code_CP', 'payer_code_DM', 'payer_code_FR', 
        'payer_code_HM', 'payer_code_MC', 'payer_code_MD', 'payer_code_MP', 'payer_code_OG', 
        'payer_code_OT', 'payer_code_Other', 'payer_code_PO', 'payer_code_SI', 'payer_code_SP', 
        'payer_code_UN', 'payer_code_WC', 'insulin_No', 'insulin_Steady', 'insulin_Up', 
        'change_No', 'diabetesMed_Yes', 'diag_1_Diabetes', 'diag_1_Digestive', 
        'diag_1_Genitourinary', 'diag_1_Injury', 'diag_1_Musculoskeletal', 'diag_1_Neoplasms', 
        'diag_1_Other', 'diag_1_Respiratory', 'diag_1_Unknown', 'diag_2_Diabetes', 
        'diag_2_Digestive', 'diag_2_Genitourinary', 'diag_2_Injury', 'diag_2_Musculoskeletal', 
        'diag_2_Neoplasms', 'diag_2_Other', 'diag_2_Respiratory', 'diag_2_Unknown', 
        'diag_3_Diabetes', 'diag_3_Digestive', 'diag_3_Genitourinary', 'diag_3_Injury', 
        'diag_3_Musculoskeletal', 'diag_3_Neoplasms', 'diag_3_Other', 'diag_3_Respiratory', 
        'diag_3_Unknown'
    ]
    
    # Initialize all to False/0
    for col in all_columns:
        if col in ['gender', 'age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 
                   'num_medications', 'number_diagnoses', 'total_visits']:
            patient[col] = 0
        else:
            patient[col] = False
    
    # Set actual values
    patient['gender'] = gender_code
    patient['age'] = age
    patient['time_in_hospital'] = time_in_hospital
    patient['num_lab_procedures'] = num_lab_procedures
    patient['num_procedures'] = num_procedures
    patient['num_medications'] = num_medications
    patient['number_diagnoses'] = number_diagnoses
    patient['total_visits'] = total_visits
    
    # Set race
    race_mapping = {1: 'race_Caucasian', 2: 'race_Asian', 3: 'race_Hispanic', 4: 'race_Other'}
    patient[race_mapping[race_choice]] = True
    
    # Set admission type
    if admission_choice == 1:
        patient['admission_type_id_Emergency'] = True
    else:
        patient['admission_type_id_Not Available'] = True
    
    # Set insurance
    if insurance_choice == 1:
        patient['payer_code_MC'] = True
    else:
        patient['payer_code_Other'] = True
    
    # Set diabetes info
    if has_diabetes:
        patient['diag_1_Diabetes'] = True
        patient['diag_2_Diabetes'] = True
        if on_diabetes_med:
            patient['diabetesMed_Yes'] = True
    
    # Set defaults for required fields
    patient['insulin_No'] = True
    patient['change_No'] = True
    
    # Set some diagnosis (if not diabetes, set other)
    if not has_diabetes:
        patient['diag_1_Other'] = True
    
    return patient

def predict_readmission(patient_data, model):
    """Make prediction and return results"""
    
    if patient_data is None:
        return
    
    try:
        # Convert to DataFrame
        patient_df = pd.DataFrame([patient_data])
        
        # Make prediction
        prediction = model.predict(patient_df)[0]
        probability = model.predict_proba(patient_df)[0][1]
        
        # Display results
        print("\n" + "="*50)
        print("🎯 READMISSION RISK ASSESSMENT")
        print("="*50)
        
        print(f"Prediction: {'HIGH RISK - Likely to be readmitted' if prediction == 1 else 'LOW RISK - Unlikely to be readmitted'}")
        print(f"Risk Score: {probability*100:.1f}%")
        
        # Risk interpretation
        if probability < 0.3:
            risk_level = "LOW"
            color = "🟢"
        elif probability < 0.7:
            risk_level = "MODERATE" 
            color = "🟡"
        else:
            risk_level = "HIGH"
            color = "🔴"
            
        print(f"Risk Level: {color} {risk_level}")
        
        # Recommendations
        print(f"\n📋 CLINICAL RECOMMENDATIONS:")
        if probability > 0.6:
            print("• Consider enhanced discharge planning")
            print("• Schedule early follow-up appointment")
            print("• Medication reconciliation recommended")
            print("• Patient education on warning signs")
        elif probability > 0.4:
            print("• Standard discharge planning")
            print("• Routine follow-up scheduling")
            print("• Basic medication review")
        else:
            print("• Standard care protocols")
            print("• Routine follow-up as needed")
            
    except Exception as e:
        print(f"❌ Error making prediction: {e}")

# MAIN EXECUTION
if __name__ == "__main__":
    # NOTE: Replace 'best_model' with your actual trained model variable
    # Example: predict_readmission(create_patient_from_user_input(), your_trained_model)
    
    print("To use this script:")
    print("1. Make sure your trained model is loaded as 'best_model'")
    print("2. Run: patient_data = create_patient_from_user_input()")  
    print("3. Run: predict_readmission(patient_data, best_model)")
    print()
    print("Example usage:")
    print("patient_data = create_patient_from_user_input()")
    print("predict_readmission(patient_data, best_model)")

To use this script:
1. Make sure your trained model is loaded as 'best_model'
2. Run: patient_data = create_patient_from_user_input()
3. Run: predict_readmission(patient_data, best_model)

Example usage:
patient_data = create_patient_from_user_input()
predict_readmission(patient_data, best_model)


In [28]:
# After loading your model
patient_data = create_patient_from_user_input()
predict_readmission(patient_data, best_model)


🏥 HOSPITAL READMISSION RISK CALCULATOR 🏥
Please enter patient information:


Race/Ethnicity:
1. Caucasian
2. Asian
3. Hispanic
4. Other

Race/Ethnicity:
1. Caucasian
2. Asian
3. Hispanic
4. Other

Admission Type:
1. Emergency
2. Not Available/Other

Admission Type:
1. Emergency
2. Not Available/Other

Insurance/Payer:
1. Medicare (MC)
2. Other

Insurance/Payer:
1. Medicare (MC)
2. Other

🎯 READMISSION RISK ASSESSMENT
Prediction: LOW RISK - Unlikely to be readmitted
Risk Score: 40.1%
Risk Level: 🟡 MODERATE

📋 CLINICAL RECOMMENDATIONS:
• Standard discharge planning
• Routine follow-up scheduling
• Basic medication review

🎯 READMISSION RISK ASSESSMENT
Prediction: LOW RISK - Unlikely to be readmitted
Risk Score: 40.1%
Risk Level: 🟡 MODERATE

📋 CLINICAL RECOMMENDATIONS:
• Standard discharge planning
• Routine follow-up scheduling
• Basic medication review


In [29]:
import joblib
# Save the best model as a pickle file
joblib.dump(best_model, "../notebook/diabetes_readmission.pkl")
print("Model saved as diabetes_readmission.pkl (pipeline/model object)")

Model saved as diabetes_readmission.pkl (pipeline/model object)
