In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_score, recall_score, roc_curve, roc_auc_score

In [2]:
X_train_engineered = pd.read_parquet('../../data/processed/X_train_engineered.parquet')
y_train_engineered = pd.read_parquet('../../data/processed/y_train_engineered.parquet')
X_test_engineered = pd.read_parquet('../../data/processed/X_test_engineered.parquet')
y_test_engineered = pd.read_parquet('../../data/processed/y_test_engineered.parquet')

In [3]:
total_features_train = X_train_engineered.shape[1]
total_features_test = X_test_engineered.shape[1]

print("Total number of features in the training set:", total_features_train)
print("Total number of features in the test set:", total_features_test)

Total number of features in the training set: 51
Total number of features in the test set: 51


In [4]:
logreg = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=logreg, n_features_to_select=51) 
X_train_selected = rfe.fit_transform(X_train_engineered, y_train_engineered)
X_test_selected = rfe.transform(X_test_engineered)

In [5]:
logreg.fit(X_train_selected, y_train_engineered)

In [6]:
y_pred = logreg.predict(X_test_selected)

In [7]:
accuracy = accuracy_score(y_test_engineered, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5530787116075638


In [8]:
precision = precision_score(y_test_engineered, y_pred)

recall = recall_score(y_test_engineered, y_pred)
print("Precision:", precision)
print("Recall:", recall)

Precision: 0.09438089432351993
Recall: 0.5411184210526315


In [9]:
y_lr_prob = logreg.predict_proba(X_test_engineered)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_engineered, y_lr_prob)
roc_auc = roc_auc_score(y_test_engineered, y_lr_prob)
print("ROC AUC:", roc_auc)

ROC AUC: 0.5690855319493449


In [10]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

In [11]:
lr = LogisticRegression(max_iter=1000)

In [12]:
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy')

In [13]:
grid_search.fit(X_train_engineered, y_train_engineered)

In [14]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 0.01}


In [15]:
lr = LogisticRegression(max_iter=1000, C=grid_search.best_params_['C'])
lr.fit(X_train_engineered, y_train_engineered)

In [16]:
y_lr_predict = lr.predict(X_test_engineered)
accuracy = accuracy_score(y_test_engineered, y_lr_predict)
conf_matrix = confusion_matrix(y_test_engineered, y_lr_predict)

In [17]:
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.5530787116075638
Confusion Matrix:
[[31384 25255]
 [ 2232  2632]]


In [18]:

# Precision
precision = precision_score(y_test_engineered, y_lr_predict)

# Recall
recall = recall_score(y_test_engineered, y_lr_predict)

# ROC and AUC
y_lr_prob = lr.predict_proba(X_test_engineered)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_engineered, y_lr_prob)
roc_auc = roc_auc_score(y_test_engineered, y_lr_prob)

print("Precision:", precision)
print("Recall:", recall)

Precision: 0.09438089432351993
Recall: 0.5411184210526315


In [19]:
X_train, X_val, y_train, y_val = train_test_split(X_train_engineered, y_train_engineered, test_size=0.2, random_state=100)

In [20]:
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

In [21]:
best_accuracy = 0
best_C = None

In [22]:
for C in C_values:
    lr = LogisticRegression(max_iter=1000, C=C)
    lr.fit(X_train, y_train)
    
    y_pred = lr.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_pred)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_C = C

final_lr = LogisticRegression(max_iter=1000, C=best_C)
final_lr.fit(X_train_engineered, y_train_engineered)

y_test_pred = final_lr.predict(X_test_engineered)
test_accuracy = accuracy_score(y_test_engineered, y_test_pred)

print("Best C value:", best_C)
print("Validation Accuracy with Best C:", best_accuracy)
print("Test Accuracy with Best C:", test_accuracy)

Best C value: 0.1
Validation Accuracy with Best C: 0.5512609988846201
Test Accuracy with Best C: 0.5530787116075638


In [23]:
total_features_train = X_train_engineered.shape[1]
total_features_test = X_test_engineered.shape[1]

print("Total number of features in the training set:", total_features_train)
print("Total number of features in the test set:", total_features_test)

Total number of features in the training set: 51
Total number of features in the test set: 51


In [24]:
logreg = LogisticRegression(max_iter=1000)

In [25]:
rfe = RFE(estimator=logreg, n_features_to_select=51) 

In [26]:
X_train_selected = rfe.fit_transform(X_train_engineered, y_train_engineered)

In [27]:
X_test_selected = rfe.transform(X_test_engineered)

In [28]:
logreg.fit(X_train_selected, y_train_engineered)

In [29]:
y_pred = logreg.predict(X_test_selected)

In [30]:
accuracy = accuracy_score(y_test_engineered, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5530787116075638
