In [1]:
from ucimlrepo import fetch_ucirepo
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Reproducibility: a single seed drives NumPy's global RNG here and is reused
# for every `random_state=` argument further down the notebook.
seed = 123
np.random.seed(seed=seed)

In [2]:
# Download dataset id 296 from the UCI ML Repository (network access required).
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Keep handles on the two tables used by the rest of the notebook:
# X -> feature table, y -> target table (pandas dataframes).
X, y = (
    diabetes_130_us_hospitals_for_years_1999_2008.data.features,
    diabetes_130_us_hospitals_for_years_1999_2008.data.targets,
)

  df = pd.read_csv(data_url)


In [3]:
# Cleaning the data
#
# Both tables are rebuilt from the fetched dataset object so this cell can be
# re-run at any time. Dropping from the previous value of X raises a KeyError
# on a second run, because the columns are already gone by then.

# Work on a copy of the targets so that later in-place edits to `y` can never
# alter the fetched dataset itself.
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets.copy()

# Drop the columns with too many missing values
# (30% or more according to the original analysis -- TODO confirm the threshold).
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features.drop(
    columns=['weight', 'payer_code', 'medical_specialty']
)

In [4]:
# Turn the non-numerical columns into numbers.
#
# Expects X and y exactly as produced by the cleaning cell (string-valued
# columns, y still a dataframe); re-run that cell before re-running this one.

# Nominal feature columns: each one gets integer codes from a LabelEncoder
# refitted per column.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for targets,
# not features; it is kept here so the encoded values stay as they were.
le = LabelEncoder()
columns_to_encode = ['race', 'gender', 'diag_1', 'diag_2', 'diag_3',
            'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
            'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
            'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
            'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin',
            'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
            'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

for col in columns_to_encode:
    X[col] = le.fit_transform(X[col])

# 'age' is an ordered set of ten-year brackets, '[0-10)' ... '[90-100)', so it
# is encoded with an explicit category order (0.0 for the youngest bracket).
age_brackets = [f'[{lower}-{lower + 10})' for lower in range(0, 100, 10)]
oe = OrdinalEncoder(categories=[age_brackets])
X['age'] = oe.fit_transform(X[['age']])

# Binary target as a one-dimensional integer array: 0 when 'readmitted' is
# "NO", 1 for any other value. It is computed without writing back into the
# targets table: overwriting that table in place meant that converting it a
# second time compared 0/1 values against "NO" and turned every label into 1.
y = (y['readmitted'] != "NO").astype(int).to_numpy()

In [5]:
# Two-stage split: 60% training, 20% validation, 20% testing.
#   stage 1 holds out 20% of all rows for testing;
#   stage 2 takes 25% of the remaining 80% (= 20% of all rows) for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

# Sanity check: the three partitions together cover every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Make sure the labels are plain integers before they reach the classifier and
# the metric functions (they may still carry an object dtype at this point).
y_train, y_val, y_test = (labels.astype(int) for labels in (y_train, y_val, y_test))

In [11]:
# Fit a random forest (default hyper-parameters, fixed seed) on the training
# partition.
clf = RandomForestClassifier(random_state=seed)
clf.fit(X_train, y_train)

# Score the fitted model on the validation partition.
y_pred = clf.predict(X_val)

print('Results for validation')
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))  # rows: true class, columns: predicted class
print("Classification Report:")
print(classification_report(y_val, y_pred))



Results for validation
Accuracy: 0.64
Confusion Matrix:
[[8134 2889]
 [4488 4842]]
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.74      0.69     11023
           1       0.63      0.52      0.57      9330

    accuracy                           0.64     20353
   macro avg       0.64      0.63      0.63     20353
weighted avg       0.64      0.64      0.63     20353



In [9]:
# Final check of the fitted forest on the held-out test partition.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print('Results for testing')
print(f"Accuracy: {accuracy:.2f}")
# Each heading is followed by its table on the next line.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Results for testing
Accuracy: 0.64
Confusion Matrix:
[[8044 2827]
 [4585 4898]]
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.74      0.68     10871
           1       0.63      0.52      0.57      9483

    accuracy                           0.64     20354
   macro avg       0.64      0.63      0.63     20354
weighted avg       0.64      0.64      0.63     20354

