In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE



# Load the dataset
data = pd.read_csv("ProcessedData.csv")

X = data.iloc[:, 2:].values  
y = data.iloc[:, 1].values

# SMOTE
smote = SMOTE(sampling_strategy=0.75)  # adjust sampling_strategy as needed
X_smote, y_smote = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Count the number of 0s and 1s in the label after SMOTE
label_counts_smote = pd.Series(y_smote).value_counts()

print("Number of 0s after SMOTE:", label_counts_smote[0])
print("Number of 1s after SMOTE:", label_counts_smote[1])

(395760, 148) (98940, 148) (395760,) (98940,)
Number of 0s after SMOTE: 282686
Number of 1s after SMOTE: 212014


# KNN

In [7]:
# Initialize the KNN classifier
k = 8  # You can choose any value for k
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Evaluate the model

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Print the metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("AUC:", auc)

# Additional evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7696583788154437
Recall: 0.9904699155234488
Precision: 0.6528219365044899
F1 Score: 0.7869575784770131
AUC: 0.7969375603873821

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.60      0.75     56443
           1       0.65      0.99      0.79     42497

    accuracy                           0.77     98940
   macro avg       0.82      0.80      0.77     98940
weighted avg       0.84      0.77      0.77     98940



# Linear Regression

In [8]:
# Create a Linear Regression model
linear_reg = LinearRegression()

# Train the Linear Regression model
linear_reg.fit(X_train, y_train)

# Make predictions using Linear Regression
linear_reg_y_pred = linear_reg.predict(X_test)

# Convert predictions to binary using a threshold of 0.5
threshold = 0.5
binary_predictions = (linear_reg_y_pred > threshold).astype(int)

# Evaluate the Linear Regression model
linear_reg_mse = mean_squared_error(y_test, binary_predictions)
linear_reg_r2 = r2_score(y_test, binary_predictions)

print("\nLinear Regression Mean Squared Error:", linear_reg_mse)
print("Linear Regression R-squared Score:", linear_reg_r2)


Linear Regression Mean Squared Error: 0.34123711340206186
Linear Regression R-squared Score: -0.3926170558130775


# Logistic Regression

In [9]:
logistic_reg = LogisticRegression()

# Train the Logistic Regression model
logistic_reg.fit(X_train, y_train)

# Make predictions using Logistic Regression
logistic_reg_y_pred = logistic_reg.predict(X_test)

conf_matrix = confusion_matrix(y_test, logistic_reg_y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

# Output TP, FP, TN, FN
print("\nTrue Positives (TP):", tp)
print("False Positives (FP):", fp)
print("True Negatives (TN):", tn)
print("False Negatives (FN):", fn)

positive_predictions = (logistic_reg_y_pred == 1).sum()
negative_predictions = (logistic_reg_y_pred == 0).sum()
print("\nPositive Predictions:", positive_predictions)
print("Negative Predictions:", negative_predictions)

# Evaluate Logistic Regression model
logistic_reg_accuracy = accuracy_score(y_test, logistic_reg_y_pred)
logistic_reg_accuracy = accuracy_score(y_test, logistic_reg_y_pred)
logistic_reg_recall = recall_score(y_test, logistic_reg_y_pred)
logistic_reg_precision = precision_score(y_test, logistic_reg_y_pred)
logistic_reg_f1 = f1_score(y_test, logistic_reg_y_pred)
logistic_reg_auc = roc_auc_score(y_test, logistic_reg_y_pred)

print("\nLogistic Regression Accuracy:", logistic_reg_accuracy)
print("Logistic Regression Recall:", logistic_reg_recall)
print("Logistic Regression Precision:", logistic_reg_precision)
print("Logistic Regression F1 Score:", logistic_reg_f1)
print("Logistic Regression AUC:", logistic_reg_auc)

print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, logistic_reg_y_pred))

mse = mean_squared_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)

print("\nMean Squared Error:", mse)
print("R-squared Score:", r2_sc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



True Positives (TP): 21681
False Positives (FP): 13329
True Negatives (TN): 43114
False Negatives (FN): 20816

Positive Predictions: 35010
Negative Predictions: 63930

Logistic Regression Accuracy: 0.654891853648676
Logistic Regression Recall: 0.5101771889780455
Logistic Regression Precision: 0.619280205655527
Logistic Regression F1 Score: 0.5594591456255564
Logistic Regression AUC: 0.637013722494267

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.76      0.72     56443
           1       0.62      0.51      0.56     42497

    accuracy                           0.65     98940
   macro avg       0.65      0.64      0.64     98940
weighted avg       0.65      0.65      0.65     98940


Mean Squared Error: 0.23034162118455628
R-squared Score: 0.0599566760861312


# Decision Trees

In [10]:
# Create a Random Forest classifier
dt_classifier = DecisionTreeClassifier()

# Train the classifier on the training data
dt_classifier.fit(X_train, y_train)

# Predict on the testing data
y_pred = dt_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Print the evaluation metrics
print("Decision Tree Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

mse = mean_squared_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)

print("\nMean Squared Error:", mse)
print("R-squared Score:", r2_sc)

Decision Tree Classifier Metrics:
Accuracy: 0.8890944006468566
Precision: 0.8613148725472217
Recall: 0.8841565286961433
F1 Score: 0.8725862449344527
AUC: 0.888484373165817

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     56443
           1       0.86      0.88      0.87     42497

    accuracy                           0.89     98940
   macro avg       0.89      0.89      0.89     98940
weighted avg       0.89      0.89      0.89     98940


Mean Squared Error: 0.11090559935314331
R-squared Score: 0.547385020039189


# Random Forest

In [11]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier()

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Print the evaluation metrics
print("Random Forest Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

mse = mean_squared_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)

print("\nMean Squared Error:", mse)
print("R-squared Score:", r2_sc)

Random Forest Classifier Metrics:
Accuracy: 0.9518900343642611
Precision: 0.9987049028677151
Recall: 0.8891451161258441
F1 Score: 0.9407459044963402
AUC: 0.9441384918368179

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56443
           1       1.00      0.89      0.94     42497

    accuracy                           0.95     98940
   macro avg       0.96      0.94      0.95     98940
weighted avg       0.96      0.95      0.95     98940


Mean Squared Error: 0.048109965635738834
R-squared Score: 0.8036592267735843


# Naive Bayes

# Bayesian Networks

# Neural Networks

# Convolutional Neural Networks

# Support Vector Machines

In [12]:
svm_model = SVC(kernel='rbf', C=1.0, gamma='auto', random_state=42)
svm_model.fit(X_train, y_train)

# Predictions
y_pred_train = svm_model.predict(X_train)
y_pred_test = svm_model.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

# Classification report
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_pred_test))

In [None]:
from cuml.svm import SVC

# Initialize and train the SVM classifier on GPU
svm = SVC()
svm.fit(X_train, y_train)

# Predict on the test set
y_pred = svm.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# Print the evaluation metrics
print("Random Forest Classifier Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))