## Logistic Regression

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the ordinal-encoded dataset. Forward slashes keep this relative path
# portable: the previous backslash-separated form only resolved on Windows,
# while '/' is accepted on Windows, Linux and macOS alike.
df = pd.read_csv('../data/clean/ordinal_encoded.csv')

# Every column except the target 'class_value' is used as a feature.
X = df.drop(columns='class_value')
y = df['class_value']

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions the same in both splits and `random_state` makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [5]:
# Pipeline: standardize the features, then fit a logistic regression.
# Keeping the scaler inside the pipeline means it is re-fit on the training
# folds only during cross-validation, so no statistics leak from the
# validation folds.
logistic_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(
        # `multi_class='multinomial'` is intentionally not passed: the argument
        # is deprecated in recent scikit-learn releases, and with the lbfgs
        # solver a multi-class target is already fit as a multinomial model.
        solver='lbfgs',
        max_iter=1000  # Increased iterations to ensure convergence
    ))
])

# Hyperparameter grid: inverse regularization strength C on a log scale,
# L2 penalty (the penalty lbfgs supports), with and without class re-weighting.
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],
    'classifier__class_weight': [None, 'balanced']
}

# Exhaustive search with 5-fold cross-validation, using all available cores.
# NOTE(review): selection is by plain accuracy; if the classes are imbalanced
# this favors the majority class and makes 'balanced' class weights unlikely
# to be chosen -- consider a macro-averaged metric. Confirm with the author.
grid_search = GridSearchCV(
    logistic_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)


In [6]:
# Run the cross-validated hyperparameter search on the training split only.
grid_search.fit(X_train, y_train)

# Report the parameter combination that scored best in cross-validation.
print("Best Hyperparameters:", grid_search.best_params_, sep="\n")

# `best_estimator_` is the pipeline refit with the best parameters; use it to
# label the held-out test rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

Best Hyperparameters:
{'classifier__C': 10, 'classifier__class_weight': None, 'classifier__penalty': 'l2'}




In [7]:
# Compute every test-set metric up front, then print them in one place.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)  # rows: true class, columns: predicted class

print("\nModel Performance Metrics:")
print("Accuracy:", test_accuracy)
print("\nClassification Report:", report_text, sep="\n")
print("\nConfusion Matrix:", cm, sep="\n")


Model Performance Metrics:
Accuracy: 0.8352601156069365

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.53      0.61        77
           1       0.58      0.50      0.54        14
           2       0.88      0.96      0.92       242
           3       0.67      0.62      0.64        13

    accuracy                           0.84       346
   macro avg       0.71      0.65      0.68       346
weighted avg       0.82      0.84      0.82       346


Confusion Matrix:
[[ 41   2  30   4]
 [  5   7   2   0]
 [  8   1 233   0]
 [  3   2   0   8]]


In [None]:
# Feature Importance (Coefficient Magnitude)
# For a multi-class model `coef_` holds one row of coefficients per class.
# Indexing `coef_[0]` would rank features by their effect on the first class
# only, so average the absolute coefficients over all classes instead. For a
# binary model `coef_` has a single row and this reduces to the same values.
# The classifier sits behind a StandardScaler in the pipeline, so the
# coefficients refer to standardized features.
classifier = best_model.named_steps['classifier']
mean_abs_coef = np.abs(classifier.coef_).mean(axis=0)

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': mean_abs_coef
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Probability Predictions: one row per test sample, one column per class.
y_pred_proba = best_model.predict_proba(X_test)
print("\nPrediction Probabilities (first 5 samples):")
print(y_pred_proba[:5])