**Import required libraries**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, roc_auc_score,classification_report,f1_score
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'xgboost'

**Load Data**

In [None]:
data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Aars/train(1).csv")

In [None]:
df=data.copy()
df.head()

In [None]:
df.drop("ID_code", axis=1, inplace=True) # Irrelevant column

In [None]:
print("Dataset Information:")
print(df.info())

**DATA PREPROCESSING**

In [None]:
# Check for Missing Values
print("\nMissing Values:")
print(df.isnull().sum().sum())

**Exploratory data analysis**

In [None]:
sns.countplot(df.target)
plt.show()

In [None]:
df.hist(figsize=(40,40));

**Outlier Detection**

In [None]:
def identify_outliers(df):

  idx = []

  for col in df.columns:
    if not col == "target":
      std = df[col].std()
      threshold = std * 3
      idx_col = df[col][(df[col]>=abs(threshold)) | (df[col]<=-abs(threshold))].index.to_list()
      idx.append(idx_col)

  idx_flat = list(set([i for j in idx for i in j]))

  return idx_flat

In [None]:
outliers_idx = identify_outliers(df)
len(outliers_idx)

**MODEL BUILDING**

In [None]:
# Splitting Features and Target
X = df.drop(columns=['target'])
y = df['target']

In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


**Logistic Regression**

In [None]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
print("\nLogistic Regression")
print(classification_report(y_test, y_pred_log))

**XGBoost and Gaussian Naive Bayes**

In [None]:


# XGBoost
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("\nXGBoost")
print(classification_report(y_test, y_pred_xgb))

# Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
print("\nGaussian Naive Bayes")
print(classification_report(y_test, y_pred_gnb))

**Random Forest**

In [None]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("\nRandom Forest")
print(classification_report(y_test, y_pred_rf))

**MODEL COMPARISON REPORT**

In [None]:
# Model Comparison Report
models = ['Logistic Regression', 'Random Forest', 'XGBoost', 'Gaussian Naive Bayes']
accuracy = [accuracy_score(y_test, y_pred_log), accuracy_score(y_test, y_pred_rf), accuracy_score(y_test, y_pred_xgb), accuracy_score(y_test, y_pred_gnb)]
precision = [precision_score(y_test, y_pred_log), precision_score(y_test, y_pred_rf), precision_score(y_test, y_pred_xgb), precision_score(y_test, y_pred_gnb)]
recall = [recall_score(y_test, y_pred_log), recall_score(y_test, y_pred_rf), recall_score(y_test, y_pred_xgb), recall_score(y_test, y_pred_gnb)]
f1 = [f1_score(y_test, y_pred_log), f1_score(y_test, y_pred_rf), f1_score(y_test, y_pred_xgb), f1_score(y_test, y_pred_gnb)]
auc = [roc_auc_score(y_test, y_pred_log), roc_auc_score(y_test, y_pred_rf), roc_auc_score(y_test, y_pred_xgb), roc_auc_score(y_test, y_pred_gnb)]

report_df = pd.DataFrame({'Model': models, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1, 'AUC': auc})
print("\nModel Comparison Report:")
print(report_df)

# **Challenges Faced**
- Lack of feature names.
- Imbalanced Dataset, which was handled using performance metrics like Precision and Recall.
- Choosing the best model required multiple performance evaluation criteria.
- Presence of outliers which could affect model performance.


# **Model Comparison Report**
Based on the Model Comparison Report, the Gaussian Naive Bayes model is the best model among the four because it has:
The highest Recall(0.3701), which is crucial in predicting customer transactions since missing a potential transaction is more costly than predicitng one incorrectly. The highest AUC score,indicating better discrimination between classes. The best F1 score, balancing both precision and recall.