In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Load the pickled, preprocessed object. Despite its name, `df` is used below
# as a mapping: df['feature_cols'] is iterated for feature names, and
# df['feat_dict'] is iterated and indexed by the keys that iteration yields.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# when it comes from a trusted source.
df = pd.read_pickle('ehr_preprocessed_seq_by_day_cat_embedding.pkl')


In [None]:
# Build the summary-statistic column names: one list per statistic, each
# holding "<feature><suffix>" for every entry of df['feature_cols'].
feature_names = list(df['feature_cols'])

cols1_logistic = [name + "_avg" for name in feature_names]
cols2_logistic = [name + "_stdev" for name in feature_names]
cols3_logistic = [name + "_max" for name in feature_names]
cols4_logistic = [name + "_min" for name in feature_names]
cols5_logistic = [name + "_diff" for name in feature_names]

# Full column layout of one summary row: ID first, then the five groups of
# statistics in order, then the row count 'n'.
cols_logistic = [
    'ID',
    *cols1_logistic,
    *cols2_logistic,
    *cols3_logistic,
    *cols4_logistic,
    *cols5_logistic,
    'n',
]

In [None]:
#Dimension Reduction
# Collapse each ID's 2-D array in df['feat_dict'] into one row of per-column
# summary statistics: mean, std, max, min, last-row-minus-first-row, plus the
# number of rows. The row layout matches cols_logistic (ID, statistics, n).

datalist = []
feat_dict = df['feat_dict']

for ID in feat_dict:
    data = feat_dict[ID]
    n = data.shape[0]  # number of rows (presumably days observed -- TODO confirm)

    means = np.mean(data, axis=0)
    std_devs = np.std(data, axis=0)
    maxs = np.max(data, axis=0)
    mins = np.min(data, axis=0)
    diffs = data[-1, :] - data[0, :]  # change from the first row to the last row

    # Keep the ID out of the numeric array: np.concatenate coerces all inputs to
    # a single dtype, so a string ID would turn every statistic into a string
    # and an integer ID would be stored as a float. `n` stays inside the array
    # so its column keeps the same float dtype as the other features.
    feature_values = np.concatenate([means, std_devs, maxs, mins, diffs, [n]])
    datalist.append([ID] + feature_values.tolist())

In [None]:
# One row of summary statistics per ID, indexed by ID
dflogistic = pd.DataFrame(datalist, columns=cols_logistic).set_index('ID')

# Training labels: per-id mean of readmitted_within_30days from train.csv
dftraindata = (
    pd.read_csv('train.csv')
    .groupby('id')['readmitted_within_30days']
    .mean()
    .to_frame()
)

# Inner join keeps only the IDs present in both the summary table and the labels
dftrain = dflogistic.join(dftraindata, how='inner')

# Split into features (every column of cols_logistic except 'ID') and target
dftraindata_x = dftrain[cols_logistic[1:]]
dftraindata_y = dftrain['readmitted_within_30days']

# Resample the training set with SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(dftraindata_x, dftraindata_y)


In [None]:
# Validation features: the summary row of every unique id in valid.csv,
# ordered by ID
dfvaliddata = pd.read_csv('valid.csv')
dfvaliddataid = dfvaliddata['id'].unique()
dfvaliddata_x = dflogistic.loc[dfvaliddataid].sort_values(by=['ID'])

# Validation labels: per-id mean of readmitted_within_30days, ordered by id so
# the rows line up with dfvaliddata_x
dfvaliddata_y = (
    dfvaliddata
    .groupby('id')['readmitted_within_30days']
    .mean()
    .to_frame()
    .sort_values(by=['id'])
)

In [None]:
# Model 1: random forest classifier trained on the SMOTE-resampled data
from sklearn.ensemble import RandomForestClassifier

# 400 trees; fixed seed so the fit is repeatable
clf = RandomForestClassifier(random_state=42, n_estimators=400)

# Train on the resampled features and labels
clf.fit(X_train_res, y_train_res)

In [None]:
# Validation feature matrix: every column of cols_logistic except 'ID'
X_valid = dfvaliddata_x[cols_logistic[1:]]

# Hard class predictions, then column 1 of predict_proba as the score
predict = clf.predict(X_valid)
validprob = clf.predict_proba(X_valid)[:, 1]

In [None]:
from sklearn import metrics

# True validation labels
y_true = dfvaliddata_y

# Label-based metrics compare against the hard predictions;
# ROC AUC is computed from the predicted probabilities instead.
accuracy = metrics.accuracy_score(y_true, predict)
precision = metrics.precision_score(y_true, predict)
recall = metrics.recall_score(y_true, predict)
f1_score = metrics.f1_score(y_true, predict)
auc = metrics.roc_auc_score(y_true, validprob)

# Report each metric on its own line
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"ROC AUC: {auc}")

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# False/true positive rates across thresholds for the validation scores
fpr, tpr, thresholds = roc_curve(dfvaliddata_y, validprob)

# Draw the ROC curve with the diagonal as a reference line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Test features: the summary row of every unique id in test.csv, ordered by ID
dftestdata = pd.read_csv('test.csv')
dftestdataid = dftestdata['id'].unique()
dftestdata_x = dflogistic.loc[dftestdataid].sort_values(by=['ID'])

# Score the test set: keep column 1 of predict_proba
X_test = dftestdata_x[cols_logistic[1:]]
testprob = clf.predict_proba(X_test)[:, 1]

In [None]:
# Sanity check: number of test predictions (one per row of X_test)
testprob.shape[0]

In [None]:
# Attach the predicted probabilities to the test rows as a new column
X_test = X_test.assign(readmitted_within_30days=testprob)

In [None]:
# Write the prediction column (with its ID index) to result.csv, 7 decimal places
submission = X_test['readmitted_within_30days']
submission.to_csv("result.csv", float_format='%.7f')