In [None]:
# ML Models Implementation

This step includes loading the dataset, encoding categorical variables using LabelEncoder, splitting the dataset into training and testing sets, and standardizing features using StandardScaler.

Library imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

Uploading the CSV file

In [None]:
from pathlib import Path

# Prompt for an upload only when the dataset is not already in the working
# directory. Re-uploading a file that already exists makes Colab store the new
# copy under a different name (e.g. 'healthcare_dataset (1).csv'), which the
# loading cell never reads. To replace the data, delete the existing file
# first and re-run this cell.
uploaded = {}  # stays empty when the upload is skipped
if not Path('healthcare_dataset.csv').exists():
    # Colab-only helper, imported lazily so the notebook also runs outside
    # Colab as long as the CSV is already present.
    from google.colab import files

    uploaded = files.upload()

Saving healthcare_dataset.csv to healthcare_dataset (1).csv


Loading and preprocessing the data


In [None]:
# Load the raw dataset from the working directory.
df = pd.read_csv('healthcare_dataset.csv')

# Integer-encode every object-dtype (string) column in place. Each column gets
# its own fitted LabelEncoder, kept in `encoders`, so encoded values can later
# be mapped back with encoders[col].inverse_transform(...).
encoders = {}
le = LabelEncoder()  # keeps `le` defined even if there are no object columns
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Both targets are removed from the feature matrix so neither task can see
# the other task's label.
X = df.drop(['Billing Amount', 'Test Results'], axis=1)
y_reg = df['Billing Amount']   # regression target
y_clf = df['Test Results']     # classification target

# A single call splits the features and both targets with the same row
# partition (10% held out). This yields exactly the split that two separate
# calls with the same X and random_state would produce.
(
    X_train, X_test,
    y_train_reg, y_test_reg,
    y_train_clf, y_test_clf,
) = train_test_split(X, y_reg, y_clf, test_size=0.1, random_state=42)

# Fit the scaler once, on the training rows only, and reuse it for the test
# rows so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_reg = scaler.fit_transform(X_train)
X_test_reg = scaler.transform(X_test)

# Both tasks share the same features; independent copies keep the two sets of
# names from aliasing each other.
X_train_clf = X_train_reg.copy()
X_test_clf = X_test_reg.copy()


Ridge Regression (regularized linear regression)






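We use RidgeCV, a linear regression with L2 regularization whose strength (alpha) is selected by 5-fold cross-validation, to predict the Billing Amount (a regression task). The performance is evaluated on the test set using Root Mean Squared Error (RMSE).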

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
import numpy as np

# Candidate regularization strengths; RidgeCV picks among them by cross-validation.
alphas = [0.01, 0.1, 1.0, 10.0, 100.0]

# 5-fold CV scored on negative MSE; fit() returns the fitted estimator itself.
ridge_model = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', cv=5).fit(
    X_train_reg, y_train_reg
)

# Held-out evaluation: RMSE is the square root of the test-set MSE.
pred_reg = ridge_model.predict(X_test_reg)
test_mse = mean_squared_error(y_test_reg, pred_reg)
print("Ridge Regression RMSE:", np.sqrt(test_mse))
print("Best alpha:", ridge_model.alpha_)



Ridge Regression RMSE: 14010.933959794409
Best alpha: 10.0


 Logistic Regression

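LogisticRegression is used to classify the Test Results (a classification task). Its hyper-parameters are tuned with GridSearchCV (5-fold cross-validation, scored by accuracy). The output includes a classification report (precision, recall, F1-score), the test accuracy, and the best parameters found.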


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Hyper-parameter grid explored by the search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],            # Inverse regularization strength (smaller = stronger)
    'penalty': ['l1', 'l2'],                        # Regularization method
    'solver': ['liblinear', 'saga'],                # Solvers that support l1 and l2
    'max_iter': [100, 200, 500]                     # Iteration budget for the solver
}

# Base model. random_state pins the data shuffling done by the liblinear and
# saga solvers, so repeated runs give the same fit (42 matches the seed used
# for the train/test split).
log_model = LogisticRegression(random_state=42)

# Grid search with 5-fold cross-validation on the training split, using all cores.
grid_search = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_clf, y_train_clf)

# Best configuration, refit on the whole training split by GridSearchCV.
best_log_model = grid_search.best_estimator_

# Predict and evaluate on the held-out split.
log_preds = best_log_model.predict(X_test_clf)

# zero_division=0 reports 0 for classes that are never predicted (the same
# value as the default) without emitting an UndefinedMetricWarning per metric.
print("\nTuned Logistic Regression Report:\n", classification_report(y_test_clf, log_preds, zero_division=0))
print("Accuracy:", accuracy_score(y_test_clf, log_preds))
print("Best Parameters:", grid_search.best_params_)




Tuned Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.34      1.00      0.51      1887
           1       0.00      0.00      0.00      1811
           2       0.00      0.00      0.00      1852

    accuracy                           0.34      5550
   macro avg       0.11      0.33      0.17      5550
weighted avg       0.12      0.34      0.17      5550

Accuracy: 0.34
Best Parameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Decision Tree

A DecisionTreeClassifier is trained to classify test results. It is a simple yet powerful algorithm that splits the dataset into decision paths.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# random_state fixes the tree's internal randomness (features are randomly
# permuted at each split), so the report below is reproducible across runs.
# 42 matches the seed used for the train/test split and the random forest.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_clf, y_train_clf)

# Evaluate on the held-out split.
dt_preds = dt_model.predict(X_test_clf)
print("\n Decision Tree Report:\n", classification_report(y_test_clf, dt_preds))



 Decision Tree Report:
               precision    recall  f1-score   support

           0       0.45      0.44      0.45      1887
           1       0.43      0.45      0.44      1811
           2       0.44      0.44      0.44      1852

    accuracy                           0.44      5550
   macro avg       0.44      0.44      0.44      5550
weighted avg       0.44      0.44      0.44      5550



Random Forest

RandomForestClassifier uses an ensemble of decision trees to improve accuracy and robustness. We evaluate it using a classification report.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# The classification features were already standardized in the preprocessing
# cell. Fitting a fresh StandardScaler on them again would leave the values
# essentially unchanged while rebinding `scaler` to an object fitted on
# already-scaled data (useless for transforming raw features later). Tree
# ensembles are also insensitive to feature scale. So instead of re-scaling,
# verify the expected state.
assert np.allclose(X_train_clf.mean(axis=0), 0.0, atol=1e-6), \
    "X_train_clf is expected to be standardized by the preprocessing cell"

In [None]:
# Fit a 100-tree forest (seeded for reproducibility) on the training split;
# fit() returns the fitted estimator itself.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_clf, y_train_clf
)

# Score the held-out split with per-class precision / recall / F1.
rf_preds = rf_model.predict(X_test_clf)
print(
    "\n Random Forest Classification Report:\n",
    classification_report(y_test_clf, rf_preds),
    sep="\n",
)


 Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.46      0.46      0.46      1887
           1       0.44      0.46      0.45      1811
           2       0.45      0.43      0.44      1852

    accuracy                           0.45      5550
   macro avg       0.45      0.45      0.45      5550
weighted avg       0.45      0.45      0.45      5550



KNeighborsClassifier predicts the label based on the majority class of the nearest neighbors in the feature space. Feature scaling is important before using KNN.



K-Nearest Neighbors (KNN)

KNeighborsClassifier predicts the label based on the majority class of the nearest neighbors in the feature space. Feature scaling is important before using KNN.



In [None]:
# Feature scaling
# KNN is distance-based, so features must be on a common scale. That was
# already done in the preprocessing cell. Fitting another StandardScaler on
# the scaled arrays would leave the values essentially unchanged while
# rebinding `scaler` to an object fitted on already-scaled data, so the
# expected state is verified here instead of being recomputed.
assert np.allclose(X_train_clf.mean(axis=0), 0.0, atol=1e-6), \
    "X_train_clf is expected to be standardized by the preprocessing cell"


In [None]:
# KNeighborsClassifier is not imported anywhere else in the notebook; without
# this import the cell raises NameError on a fresh kernel.
from sklearn.neighbors import KNeighborsClassifier

# Classify each sample by majority vote among its 5 nearest training points.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_clf, y_train_clf)

# Evaluate on the held-out split.
knn_preds = knn_model.predict(X_test_clf)
print("\n KNN Report:\n", classification_report(y_test_clf, knn_preds))


 KNN Report:
               precision    recall  f1-score   support

           0       0.37      0.49      0.42      1887
           1       0.38      0.38      0.38      1811
           2       0.38      0.26      0.31      1852

    accuracy                           0.38      5550
   macro avg       0.38      0.38      0.37      5550
weighted avg       0.38      0.38      0.37      5550



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# NOTE(review): this cell repeats the Logistic Regression grid search that
# appears earlier in the notebook (same estimator and parameter grid), and it
# rebinds param_grid, log_model, grid_search, best_log_model and log_preds.
# Consider keeping only one of the two cells.

# Hyper-parameter grid explored by the search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],            # Inverse regularization strength (smaller = stronger)
    'penalty': ['l1', 'l2'],                        # Regularization method
    'solver': ['liblinear', 'saga'],                # Solvers that support l1 and l2
    'max_iter': [100, 200, 500]                     # Iteration budget for the solver
}

# Base model. random_state pins the data shuffling done by the liblinear and
# saga solvers, so repeated runs give the same fit.
log_model = LogisticRegression(random_state=42)

# Grid search with 5-fold cross-validation on the training split, using all cores.
grid_search = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_clf, y_train_clf)

# Best configuration, refit on the whole training split by GridSearchCV.
best_log_model = grid_search.best_estimator_

# Predict and evaluate on the held-out split.
log_preds = best_log_model.predict(X_test_clf)

# zero_division=0 reports 0 for classes that are never predicted (the same
# value as the default) without emitting an UndefinedMetricWarning per metric.
print("\nTuned Logistic Regression Report:\n", classification_report(y_test_clf, log_preds, zero_division=0))
print("Accuracy:", accuracy_score(y_test_clf, log_preds))
print("Best Parameters:", grid_search.best_params_)


NameError: name 'X_train_clf' is not defined