In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

#### Data Analysis

In [38]:
health_df = pd.read_csv('healthcare_dataset.csv')
health_df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [39]:
health_df.shape

(55500, 15)

In [40]:
health_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [68]:
features = health_df.drop(columns=['Medical Condition', 'Name', 'Room Number', 'Doctor', 'Date of Admission', 'Discharge Date'])
target = health_df['Medical Condition']

In [69]:
# One-Hot Encoding for categorical variables
categorical_columns = ['Gender', 'Blood Type', 'Test Results', 'Medication']
encoder = OneHotEncoder(drop='first', sparse=False)

# Apply encoding
encoded_features = encoder.fit_transform(features[categorical_columns])



In [70]:
encoded_features = encoder.fit_transform(features[categorical_columns])

numerical_features = features[['Age', 'Billing Amount']].values



In [71]:
X = pd.concat([pd.DataFrame(numerical_features), pd.DataFrame(encoded_features)], axis=1)

In [72]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(target)

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Random Forest Classifier

In [74]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [75]:
y_pred = model.predict(X_test)

In [76]:
# Evaluating the model
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(report)

              precision    recall  f1-score   support

   Arthritis       0.25      0.24      0.25      1915
      Asthma       0.24      0.24      0.24      1847
      Cancer       0.26      0.26      0.26      1871
    Diabetes       0.24      0.25      0.25      1822
Hypertension       0.24      0.26      0.25      1788
     Obesity       0.26      0.25      0.25      1857

    accuracy                           0.25     11100
   macro avg       0.25      0.25      0.25     11100
weighted avg       0.25      0.25      0.25     11100



#### Logistic Regression

In [77]:
# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_log_reg = model.predict(X_test)

# Evaluate the model
report_log_reg = classification_report(y_test, y_pred_log_reg, target_names=label_encoder.classes_)
print(report_log_reg)

              precision    recall  f1-score   support

   Arthritis       0.00      0.00      0.00      1915
      Asthma       0.00      0.00      0.00      1847
      Cancer       0.29      0.00      0.00      1871
    Diabetes       0.16      1.00      0.28      1822
Hypertension       0.00      0.00      0.00      1788
     Obesity       0.00      0.00      0.00      1857

    accuracy                           0.16     11100
   macro avg       0.07      0.17      0.05     11100
weighted avg       0.08      0.16      0.05     11100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Dropping unnecessary columns for prediction (e.g., Name, Doctor, Date of Admission, etc.)
features = health_df.drop(columns=['Medical Condition', 'Name', 'Doctor', 'Date of Admission', 'Discharge Date', 'Insurance Provider'])

# Label encode categorical variables
label_encoded_columns = ['Gender', 'Blood Type', 'Hospital', 'Admission Type', 'Medication', 'Test Results']
label_encoder = LabelEncoder()

for col in label_encoded_columns:
    features[col] = label_encoder.fit_transform(features[col])

# Defining the target variable (Medical Condition)
target = label_encoder.fit_transform(health_df['Medical Condition'])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Training a Random Forest model to get feature importance
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Get feature importances from the Random Forest model
feature_importances = rf_model.feature_importances_
feature_names = features.columns

# Creating a DataFrame for feature importance
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sorting the features by importance
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

# # Displaying the top features
# import ace_tools as tools; tools.display_dataframe_to_user(name="Feature Importance for Medical Condition Prediction", dataframe=importance_df_sorted)

importance_df_sorted.head()


Unnamed: 0,Feature,Importance
3,Hospital,0.196158
4,Billing Amount,0.19612
5,Room Number,0.18531
0,Age,0.15448
2,Blood Type,0.085102
