In [1]:
# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Turn off all warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset
data = pd.read_csv('//Users/jake/ML/cirrhosis.csv')

# Drop all the rows where missing values were present in the Drug column and drop the ID column
data_cleaned = data.dropna(subset=['Drug']).drop(columns=['ID'])

# Show sum of missing values
missing_values_sum = data_cleaned.isnull().sum()
print("Sum of missing values:")
print(missing_values_sum)

Sum of missing values:
N_Days            0
Status            0
Drug              0
Age               0
Sex               0
Ascites           0
Hepatomegaly      0
Spiders           0
Edema             0
Bilirubin         0
Cholesterol      28
Albumin           0
Copper            2
Alk_Phos          0
SGOT              0
Tryglicerides    30
Platelets         4
Prothrombin       0
Stage             0
dtype: int64


In [3]:
# Impute missing values with mean results (only for numeric columns)
numeric_cols = data_cleaned.select_dtypes(include=[np.number]).columns
data_cleaned[numeric_cols] = data_cleaned[numeric_cols].fillna(data_cleaned[numeric_cols].mean())

# Split the data set into training and test set with a ratio of 8:2
train_data, test_data = train_test_split(data_cleaned, test_size=0.2, random_state=42)

# Based on the training and test data, show the feature types and indicate which features are continuous or categorical
continuous_features = train_data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = train_data.select_dtypes(exclude=[np.number]).columns.tolist()

print("\nContinuous Features:", continuous_features)
print("Categorical Features:", categorical_features)

# One-hot encoding for all category attributes
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_train_data = pd.DataFrame(encoder.fit_transform(train_data[categorical_features]), columns=encoder.get_feature_names_out(categorical_features))
encoded_test_data = pd.DataFrame(encoder.transform(test_data[categorical_features]), columns=encoder.get_feature_names_out(categorical_features))

train_data_encoded = pd.concat([train_data.reset_index(drop=True), encoded_train_data], axis=1).drop(columns=categorical_features)
test_data_encoded = pd.concat([test_data.reset_index(drop=True), encoded_test_data], axis=1).drop(columns=categorical_features)

# Show the label distribution based on the training data, is it a balanced training set
label_distribution = train_data['Status'].value_counts(normalize=True)
print("\nLabel distribution in the training set:")
print(label_distribution)
print("\nIs the training set balanced?:", label_distribution.max() < 0.7)



Continuous Features: ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']
Categorical Features: ['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

Label distribution in the training set:
Status
C     0.558233
D     0.397590
CL    0.044177
Name: proportion, dtype: float64

Is the training set balanced?: True


In [4]:
# Ensure 'Status' column is in the original train_data before encoding
y_train = train_data['Status']

# Dropping the 'Status' column from the features before encoding
X_train = train_data.drop(columns=['Status'])

# Perform one-hot encoding on the categorical features
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train), columns=encoder.get_feature_names_out(X_train.columns))

# Create three supervised machine learning models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42)
}

# Validation method and performance metric
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Perform cross-validation for each model and store results
results = {}
for model_name, model in models.items():
    cv_results = cross_validate(model, X_train_encoded, y_train, cv=5, scoring=scoring)
    results[model_name] = {
        'accuracy': cv_results['test_accuracy'].mean(),
        'precision': cv_results['test_precision'].mean(),
        'recall': cv_results['test_recall'].mean(),
        'f1': cv_results['test_f1'].mean()
    }

# Justification of model choices
model_justifications = {
    'Logistic Regression': "Logistic Regression is a simple and interpretable model, often used as a baseline.",
    'Random Forest': "Random Forest is an ensemble method that reduces variance and avoids overfitting by averaging multiple decision trees.",
    'Support Vector Machine': "SVM is effective in high-dimensional spaces and works well for classification tasks with clear margins of separation."
}

# Display results and justifications
results, model_justifications


({'Logistic Regression': {'accuracy': 0.6947755102040817,
   'precision': 0.6677079359667015,
   'recall': 0.6947755102040817,
   'f1': 0.6700974732737949},
  'Random Forest': {'accuracy': 0.6706938775510204,
   'precision': 0.6955220453731732,
   'recall': 0.6706938775510204,
   'f1': 0.6167548652825409},
  'Support Vector Machine': {'accuracy': 0.6706938775510205,
   'precision': 0.6613702901879568,
   'recall': 0.6706938775510205,
   'f1': 0.6359097774942303}},
 {'Logistic Regression': 'Logistic Regression is a simple and interpretable model, often used as a baseline.',
  'Random Forest': 'Random Forest is an ensemble method that reduces variance and avoids overfitting by averaging multiple decision trees.',
  'Support Vector Machine': 'SVM is effective in high-dimensional spaces and works well for classification tasks with clear margins of separation.'})

In [5]:
# Define the hyper-parameter grids for each model
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']  # 'liblinear' supports l1 penalty
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Support Vector Machine': {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf', 'linear']
    }
}

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42)
}

# Custom scoring method
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Perform GridSearchCV for each model
best_params = {}
best_scores = {}

for model_name, model in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], scoring='f1_weighted', cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_encoded, y_train)
    
    best_params[model_name] = grid_search.best_params_
    best_scores[model_name] = grid_search.best_score_

# Display the best parameters and scores for each model
best_params, best_scores


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits


({'Logistic Regression': {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'},
  'Random Forest': {'max_depth': None,
   'min_samples_leaf': 2,
   'min_samples_split': 5,
   'n_estimators': 200},
  'Support Vector Machine': {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}},
 {'Logistic Regression': 0.6930697626822789,
  'Random Forest': 0.663759490083859,
  'Support Vector Machine': 0.6703101919781055})

Hyper-parameter optimization is a crucial step in building effective machine learning models. By carefully selecting and tuning hyper-parameters using methods like GridSearchCV, we can significantly improve model performance, ensure better generalization, and adapt the models to the specific characteristics of the dataset. This process helps to achieve a balance between underfitting and overfitting, ultimately leading to more reliable and accurate predictions.

In [7]:
# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_encoded, y_train)

# Retrain the models on the balanced dataset
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42)
}

# Custom scoring method
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Perform cross-validation for each model on the balanced dataset and store results
balanced_results = {}
for model_name, model in models.items():
    cv_results = cross_validate(model, X_train_balanced, y_train_balanced, cv=5, scoring=scoring)
    balanced_results[model_name] = {
        'accuracy': cv_results['test_accuracy'].mean(),
        'precision': cv_results['test_precision'].mean(),
        'recall': cv_results['test_recall'].mean(),
        'f1': cv_results['test_f1'].mean()
    }

# Compare the results with those from the imbalanced dataset
print("Balanced Dataset Results:", balanced_results)
print("Imbalanced Dataset Results:", best_scores)  # from the previous step

# Model recommendation based on the results
# Compare the balanced_results with best_scores and choose the best performing model
best_model = max(balanced_results, key=lambda x: balanced_results[x]['f1'])
print(f"The best model after balancing the dataset is: {best_model}")
print(f"Performance metrics: {balanced_results[best_model]}")

Balanced Dataset Results: {'Logistic Regression': {'accuracy': 0.8825014343086632, 'precision': 0.8904895304524688, 'recall': 0.8825014343086632, 'f1': 0.88127437405917}, 'Random Forest': {'accuracy': 0.8442914515203672, 'precision': 0.8676163040185079, 'recall': 0.8442914515203672, 'f1': 0.8372457408549977}, 'Support Vector Machine': {'accuracy': 0.86815834767642, 'precision': 0.8833621564503791, 'recall': 0.86815834767642, 'f1': 0.8653795611352366}}
Imbalanced Dataset Results: {'Logistic Regression': 0.6930697626822789, 'Random Forest': 0.663759490083859, 'Support Vector Machine': 0.6703101919781055}
The best model after balancing the dataset is: Logistic Regression
Performance metrics: {'accuracy': 0.8825014343086632, 'precision': 0.8904895304524688, 'recall': 0.8825014343086632, 'f1': 0.88127437405917}


In [8]:
Preprocess the data
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = pd.DataFrame(encoder.fit_transform(train_data.drop(columns=['Status'])), columns=encoder.get_feature_names_out(train_data.drop(columns=['Status']).columns))

# Split the dataset into training and test sets (using 80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, train_data['Status'], test_size=0.2, random_state=42)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Train the Logistic Regression model on the balanced training data
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train_balanced, y_train_balanced)

# Perform predictions on the pre-processed test set
y_pred = logistic_model.predict(X_test)

# Report the model performance
performance_report = classification_report(y_test, y_pred)

# Print the performance report
print("Logistic Regression Model Performance after SMOTE:")
print(performance_report)

# If you have previous performance metrics from models without SMOTE, compare them here
# For example:
# print("Random Forest Model Performance (before SMOTE):")
# print(random_forest_performance_report)  # This should be the performance report of the Random Forest before applying SMOTE


SyntaxError: invalid syntax (4140188169.py, line 1)

In [None]:
# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)

# Extract feature importance from Random Forest
feature_importances_rf = rf_model.feature_importances_
features_rf = X_train_balanced.columns

# Create a DataFrame for Random Forest feature importance
importance_rf_df = pd.DataFrame({
    'Feature': features_rf,
    'Importance': feature_importances_rf
}).sort_values(by='Importance', ascending=False)

# Train a Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train_balanced, y_train_balanced)

# Extract coefficients from Logistic Regression
coefficients_log = logistic_model.coef_[0]
features_log = X_train_balanced.columns

# Create a DataFrame for Logistic Regression coefficients
coefficients_log_df = pd.DataFrame({
    'Feature': features_log,
    'Coefficient': coefficients_log
}).sort_values(by='Coefficient', ascending=False)

# Visualize the top features side by side
def plot_top_features_side_by_side(importance_df_rf, coefficients_df_log, top_n=10):
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    # Plot for Random Forest
    sns.barplot(x='Importance', y='Feature', data=importance_df_rf.head(top_n), ax=axes[0])
    axes[0].set_title(f'Top {top_n} Features by Importance (Random Forest)')
    axes[0].set_xlabel('Feature Importance')
    axes[0].set_ylabel('Feature')

    # Plot for Logistic Regression
    sns.barplot(x='Coefficient', y='Feature', data=coefficients_df_log.head(top_n), ax=axes[1])
    axes[1].set_title(f'Top {top_n} Features by Coefficient (Logistic Regression)')
    axes[1].set_xlabel('Coefficient Value')
    axes[1].set_ylabel('Feature')

    plt.tight_layout()
    plt.show()

# Plot the top 10 features side by side
plot_top_features_side_by_side(importance_rf_df, coefficients_log_df, top_n=10)


Random Forest: Provides a robust and non-linear approach to feature importance, highlighting features that contribute to complex decision rules and interactions. Logistic Regression: Offers clear and interpretable insights into features with the strongest linear effects on the target variable. Combined Interpretation: By using both models, we can gain a comprehensive understanding of feature importance, capturing both linear and non-linear effects. This dual approach is statistically advantageous as it ensures that no important relationships in the data are overlooked, leading to a more thorough and reliable analysis of the factors influencing the prediction of "Status." When analyzing feature importance across two different models—Random Forest and Logistic Regression—each model provides a different perspective on which features are most influential for predicting the target variable ("Status"). Here's a statistical breakdown of the findings and reasons behind them:

1. Random Forest Feature Importance:
Nature of Random Forest:

Random Forest is an ensemble method that constructs multiple decision trees during training. Each tree is built by selecting random subsets of features and data points. The importance of a feature in a Random Forest is typically determined by how much it decreases the weighted impurity (e.g., Gini impurity or entropy) in a tree across all the trees in the forest.
Statistical Reasoning:

Non-Linearity: Random Forest can capture complex, non-linear interactions between features. Therefore, a feature that may not appear important in a linear model (like Logistic Regression) might be critical in a Random Forest if it plays a crucial role in certain decision paths within the trees.
Robustness: Since Random Forest averages the results of multiple trees, it tends to be more robust to noise. Features that consistently reduce impurity across various trees are considered important.
Feature Interactions: Random Forests can implicitly capture interactions between features. If two or more features together lead to significant impurity reduction in the trees, they may be given higher importance even if they are not as influential individually.
Finding: The top features identified by Random Forest are those that most effectively partition the data into homogeneous subsets across multiple decision trees, indicating their importance in capturing complex relationships in the data.

2. Logistic Regression Coefficients:
Nature of Logistic Regression:

Logistic Regression is a linear model that estimates the probability of a binary outcome based on a linear combination of input features. The model's coefficients represent the log-odds of the outcome for a one-unit increase in the corresponding feature, assuming all other features are held constant.
Statistical Reasoning:

Linearity: Logistic Regression assumes a linear relationship between each feature and the log-odds of the outcome. Therefore, features with large absolute coefficients are those that have the strongest linear association with the target variable.
Additivity: The effect of each feature is additive, meaning the model does not account for interactions between features unless interaction terms are explicitly included.
Interpretability: Coefficients in Logistic Regression are easy to interpret. A positive coefficient increases the log-odds of the positive class, while a negative coefficient decreases it. The magnitude of the coefficient indicates the strength of the relationship.
Finding: The top features identified by Logistic Regression are those with the strongest linear relationship with the target variable, making them influential in determining the outcome in a linear context.