In [None]:
## Group Detail
Group Name: Data Explorer
Name: Mohammad Tohin Bapari
Email: tohin@gmx.de
Country: Germany
University: Bergische Universität Wuppertal 
Specialization: Data Science


## Model Selection 

Linear Model: Logistic Regression

Ensemble Model: Random Forest

Boosting Model: Gradient Boosting

Other Models: If time permits, we will also explore ensemble approaches such as stacking.

## Model Building

Split the data into training and testing sets

Train the selected models

Evaluate and compare their performance

In [28]:
# Importing Libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler






In [30]:
# Read the cleaned healthcare workbook into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
file_path = 'Cleaned_Healthcare_Dataset.xlsx'
data = pd.read_excel(io=file_path)

In [31]:
# Preview the first five rows to sanity-check the columns that were loaded
data.head(n=5)

Unnamed: 0,Ptid,Persistency_Flag,Gender,Race,Ethnicity,Region,Age_Bucket,Ntm_Speciality,Ntm_Specialist_Flag,Ntm_Speciality_Bucket,...,Risk_Family_History_Of_Osteoporosis,Risk_Low_Calcium_Intake,Risk_Vitamin_D_Insufficiency,Risk_Poor_Health_Frailty,Risk_Excessive_Thinness,Risk_Hysterectomy_Oophorectomy,Risk_Estrogen_Deficiency,Risk_Immobilization,Risk_Recurring_Falls,Count_Of_Risks
0,P1,Persistent,Male,Caucasian,Not Hispanic,West,>75,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,N,N,N,N,N,N,N,N,0
1,P2,Non-Persistent,Male,Asian,Not Hispanic,West,55-65,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,N,N,N,N,N,N,N,N,0
2,P3,Non-Persistent,Female,Other/Unknown,Hispanic,Midwest,65-75,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,Y,N,N,N,N,N,N,N,2
3,P4,Non-Persistent,Female,Caucasian,Not Hispanic,Midwest,>75,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,N,N,N,N,N,N,N,N,1
4,P5,Non-Persistent,Female,Caucasian,Not Hispanic,Midwest,>75,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,N,N,N,N,N,N,N,N,1


In [32]:
# Separate features and target variable
# 'Ptid' is a per-row identifier (P1, P2, ...) and 'Persistency_Flag' is the
# target, so neither belongs in the feature matrix.
X = data.drop(columns=['Ptid', 'Persistency_Flag'])

# The preview also shows an 'Unnamed: 0' column holding 0, 1, 2, ... -- the
# name pandas gives a header-less column, here a row index saved into the
# workbook. It carries no patient information, so it is removed as well;
# errors='ignore' keeps this cell working if the workbook is re-exported
# without that column.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

y = data['Persistency_Flag']

In [36]:
# Turn the string target into integer class codes.
# The fitted encoder is kept so the codes can be mapped back to the original
# labels later (label_encoder.classes_ / label_encoder.inverse_transform).
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [38]:
# One-hot encode every object-typed (string) column of the feature frame;
# columns of other dtypes pass through get_dummies unchanged.
categorical_features = X.select_dtypes(include='object').columns
X_encoded = pd.get_dummies(data=X, columns=categorical_features)


In [40]:
# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X_encoded)

In [42]:
# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [44]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((2280, 180), (570, 180), (2280,), (570,))

#### Classification Report (Logistic Regression, test set)
```
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       411
           1       0.73      0.57      0.64       159

    accuracy                           0.82       570
   macro avg       0.79      0.75      0.76       570
weighted avg       0.82      0.82      0.82       570
```


## Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit the linear baseline on the training split (fit() returns the estimator),
# then predict class labels for the held-out split.
logistic_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)

# Score the held-out predictions: confusion matrix, per-class report and
# overall accuracy.
logistic_confusion_matrix = confusion_matrix(y_test, y_pred_logistic)
logistic_report = classification_report(y_test, y_pred_logistic)
logistic_accuracy = accuracy_score(y_test, y_pred_logistic)

# Show the results: accuracy first, then the report, then the matrix.
print("Logistic Regression Accuracy:", logistic_accuracy)
print("Logistic Regression Report:\n", logistic_report)
print("Logistic Regression Confusion Matrix:\n", logistic_confusion_matrix)


Logistic Regression Accuracy: 0.8228070175438597
Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.88       411
           1       0.73      0.57      0.64       159

    accuracy                           0.82       570
   macro avg       0.79      0.75      0.76       570
weighted avg       0.82      0.82      0.82       570

Logistic Regression Confusion Matrix:
 [[378  33]
 [ 68  91]]


## Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a 100-tree random forest on the training split (seeded so the run is
# repeatable), then predict class labels for the held-out split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Score the held-out predictions: confusion matrix, per-class report and
# overall accuracy.
rf_confusion_matrix = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

# Show the results: accuracy first, then the report, then the matrix.
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Report:\n", rf_report)
print("Random Forest Confusion Matrix:\n", rf_confusion_matrix)


Random Forest Accuracy: 0.8105263157894737
Random Forest Report:
               precision    recall  f1-score   support

           0       0.83      0.93      0.88       411
           1       0.74      0.49      0.59       159

    accuracy                           0.81       570
   macro avg       0.78      0.71      0.73       570
weighted avg       0.80      0.81      0.80       570

Random Forest Confusion Matrix:
 [[384  27]
 [ 81  78]]


## Gradient Boosting

In [51]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a 100-stage gradient boosting classifier on the training split (seeded
# so the run is repeatable), then predict labels for the held-out split.
gb_model = GradientBoostingClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Score the held-out predictions: confusion matrix, per-class report and
# overall accuracy.
gb_confusion_matrix = confusion_matrix(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)
gb_accuracy = accuracy_score(y_test, y_pred_gb)

# Show the results: accuracy first, then the report, then the matrix.
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Gradient Boosting Report:\n", gb_report)
print("Gradient Boosting Confusion Matrix:\n", gb_confusion_matrix)


Gradient Boosting Accuracy: 0.8192982456140351
Gradient Boosting Report:
               precision    recall  f1-score   support

           0       0.84      0.93      0.88       411
           1       0.74      0.54      0.63       159

    accuracy                           0.82       570
   macro avg       0.79      0.73      0.75       570
weighted avg       0.81      0.82      0.81       570

Gradient Boosting Confusion Matrix:
 [[381  30]
 [ 73  86]]
