In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold


# Load the dataset
data = pd.read_csv("ProcessedData.csv")

X = data.iloc[:, 2:].values  
y = data.iloc[:, 1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(246008, 148) (61503, 148) (246008,) (61503,)


# KNN

In [73]:
# Initialize the KNN classifier
k = 8  # You can choose any value for k
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Evaluate the model

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Print the metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("AUC:", auc)

# Additional evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9187519308001235
Recall: 0.0030309153364316025
Precision: 0.19230769230769232
F1 Score: 0.005967774020290432
AUC: 0.5009584678885363

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56554
           1       0.19      0.00      0.01      4949

    accuracy                           0.92     61503
   macro avg       0.56      0.50      0.48     61503
weighted avg       0.86      0.92      0.88     61503



# Linear Regression

In [74]:
# Create a Linear Regression model
linear_reg = LinearRegression()

# Train the Linear Regression model
linear_reg.fit(X_train, y_train)

# Make predictions using Linear Regression
linear_reg_y_pred = linear_reg.predict(X_test)

# Convert predictions to binary using a threshold of 0.5
threshold = 0.5
binary_predictions = (linear_reg_y_pred > threshold).astype(int)

# Evaluate the Linear Regression model
linear_reg_mse = mean_squared_error(y_test, binary_predictions)
linear_reg_r2 = r2_score(y_test, binary_predictions)

print("\nLinear Regression Mean Squared Error:", linear_reg_mse)
print("Linear Regression R-squared Score:", linear_reg_r2)


Linear Regression Mean Squared Error: 0.08045136009625546
Linear Regression R-squared Score: -0.08728953992533794


# Logistic Regression

In [75]:
logistic_reg = LogisticRegression()

# Train the Logistic Regression model
logistic_reg.fit(X_train, y_train)

# Make predictions using Logistic Regression
logistic_reg_y_pred = logistic_reg.predict(X_test)

conf_matrix = confusion_matrix(y_test, logistic_reg_y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

# Output TP, FP, TN, FN
print("\nTrue Positives (TP):", tp)
print("False Positives (FP):", fp)
print("True Negatives (TN):", tn)
print("False Negatives (FN):", fn)

positive_predictions = (logistic_reg_y_pred == 1).sum()
negative_predictions = (logistic_reg_y_pred == 0).sum()
print("\nPositive Predictions:", positive_predictions)
print("Negative Predictions:", negative_predictions)

# Evaluate Logistic Regression model
logistic_reg_accuracy = accuracy_score(y_test, logistic_reg_y_pred)
logistic_reg_accuracy = accuracy_score(y_test, logistic_reg_y_pred)
logistic_reg_recall = recall_score(y_test, logistic_reg_y_pred)
logistic_reg_precision = precision_score(y_test, logistic_reg_y_pred)
logistic_reg_f1 = f1_score(y_test, logistic_reg_y_pred)
logistic_reg_auc = roc_auc_score(y_test, logistic_reg_y_pred)

print("\nLogistic Regression Accuracy:", logistic_reg_accuracy)
print("Logistic Regression Recall:", logistic_reg_recall)
print("Logistic Regression Precision:", logistic_reg_precision)
print("Logistic Regression F1 Score:", logistic_reg_f1)
print("Logistic Regression AUC:", logistic_reg_auc)

print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, logistic_reg_y_pred))

mse = mean_squared_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)

print("\nMean Squared Error:", mse)
print("R-squared Score:", r2_sc)


True Positives (TP): 1
False Positives (FP): 1
True Negatives (TN): 56553
False Negatives (FN): 4948

Positive Predictions: 2
Negative Predictions: 61501

Logistic Regression Accuracy: 0.9195323805342829
Logistic Regression Recall: 0.00020206102242877348
Logistic Regression Precision: 0.5
Logistic Regression F1 Score: 0.0004039587962027873
Logistic Regression AUC: 0.5000921894036003

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56554
           1       0.50      0.00      0.00      4949

    accuracy                           0.92     61503
   macro avg       0.71      0.50      0.48     61503
weighted avg       0.89      0.92      0.88     61503


Mean Squared Error: 0.08124806919987643
R-squared Score: -0.098056958570516


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Decision Trees

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Create a Random Forest classifier
dt_classifier = DecisionTreeClassifier()

# Train the classifier on the training data
dt_classifier.fit(X_train, y_train)

# Predict on the testing data
y_pred = dt_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Print the evaluation metrics
print("Decision Tree Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

mse = mean_squared_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)

print("\nMean Squared Error:", mse)
print("R-squared Score:", r2_sc)

# Random Forest

In [76]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier()

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Print the evaluation metrics
print("Random Forest Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

mse = mean_squared_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)

print("\nMean Squared Error:", mse)
print("R-squared Score:", r2_sc)

Random Forest Classifier Metrics:
Accuracy: 0.9195486399037446
Precision: 1.0
Recall: 0.00020206102242877348
F1 Score: 0.00040404040404040404
AUC: 0.5001010305112144

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56554
           1       1.00      0.00      0.00      4949

    accuracy                           0.92     61503
   macro avg       0.96      0.50      0.48     61503
weighted avg       0.93      0.92      0.88     61503


Mean Squared Error: 0.08045136009625546
R-squared Score: -0.08728953992533794
