In [2]:
pip install scikit-learn xgboost




In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

# Load the dataset
file_path = '/content/drive/MyDrive/machinelearning-assingment2-dataset.csv'
data = pd.read_csv(file_path)

# Income Binning: Divide the 'Income' column into bins (low, medium, high)
data['Income_Bin'] = pd.cut(data['Income'], bins=[0, 30000, 70000, float('inf')], labels=['Low', 'Medium', 'High'])

# Drop the original 'Income' column after binning
data = data.drop(columns=['Income'])

# Separate features and target variable
X = data.drop(columns=['Fraud', 'Credit_card_number', 'Expiry', 'Security_code'])
y = data['Fraud']

# Define the categorical columns to be encoded
categorical_cols = ['Profession', 'Income_Bin']

# Preprocess categorical data using OneHotEncoding and scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ('num', StandardScaler(), X.select_dtypes(include=['int64', 'float64']).columns)
    ],
    remainder='passthrough'  # Keep other columns as is
)

# Initialize models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "SVM": SVC(probability=True)  # Enable probability estimation for AUC calculation
}

# Define hyperparameters for each model
param_grids = {
    "Decision Tree": {
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15],
        'min_samples_leaf': [1, 2, 4]
    },
    "XGBoost": {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    "SVM": {
        'C': [0.1, 1, 10],  # Regularization parameter
        'kernel': ['linear', 'rbf'],  # Kernel types
        'gamma': ['scale', 'auto']  # Kernel coefficient
    }
}

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Choose the scoring metric
scoring_metric = 'f1'

# Perform hyperparameter tuning with GridSearchCV using pipelines
best_models = {}

for model_name in models:
    print(f"\nTraining {model_name}...")

    # Create a pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        (model_name, models[model_name])
    ])

    # Construct parameter grid with correct prefix for the model in the pipeline
    param_grid = {f'{model_name}__' + key: value for key, value in param_grids[model_name].items()}

    # Perform GridSearchCV with the chosen scoring metric
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=scoring_metric, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Save the best model
    best_models[model_name] = grid_search.best_estimator_

    # Print best parameters and evaluate the model on the test set
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    y_pred = best_models[model_name].predict(X_test)

    # Evaluate using the chosen metric
    if scoring_metric == 'f1':
        print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred)}")
    elif scoring_metric == 'roc_auc':
        # For AUC, we need probability estimates
        if hasattr(best_models[model_name], "predict_proba"):
            y_pred_proba = best_models[model_name].predict_proba(X_test)[:, 1]
            auc_score = roc_auc_score(y_test, y_pred_proba)
            print(f"AUC-ROC Score for {model_name}: {auc_score}")
        else:
            print(f"{model_name} does not support probability estimates for AUC-ROC calculation.")



Training Decision Tree...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for Decision Tree: {'Decision Tree__max_depth': 5, 'Decision Tree__min_samples_leaf': 1, 'Decision Tree__min_samples_split': 2}
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.50      0.48      0.49      1476
           1       0.52      0.54      0.53      1524

    accuracy                           0.51      3000
   macro avg       0.51      0.51      0.51      3000
weighted avg       0.51      0.51      0.51      3000


Training Random Forest...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for Random Forest: {'Random Forest__max_depth': 5, 'Random Forest__min_samples_leaf': 1, 'Random Forest__n_estimators': 50}
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.50      0.48      0.49      1476
           1       0.5

Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBoost: {'XGBoost__learning_rate': 0.01, 'XGBoost__max_depth': 5, 'XGBoost__n_estimators': 100}
Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.50      0.48      0.49      1476
           1       0.52      0.54      0.53      1524

    accuracy                           0.51      3000
   macro avg       0.51      0.51      0.51      3000
weighted avg       0.51      0.51      0.51      3000


Training SVM...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters for SVM: {'SVM__C': 0.1, 'SVM__gamma': 'auto', 'SVM__kernel': 'rbf'}
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.50      0.49      0.50      1476
           1       0.52      0.53      0.52      1524

    accuracy                           0.51      3000
   macro avg       0.51      0.51      0.51      3000
weighted avg       0.51      0.51      0.51      3000

