# COGS 118A Final Project Fall 2024

## Dataset 3: [Shill Bidding Dataset](https://archive.ics.uci.edu/dataset/562/shill+bidding+dataset)


In [2]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

### EDA and Preprocessing


In [3]:
dataset = pd.read_csv('data/Shill Bidding Dataset.csv')

dataset.head()

Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,1,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,2,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,3,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0
3,4,732,7***n,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,5,0
4,5,900,z***z,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,7,0


In [4]:
dataset.shape

(6321, 13)

In [5]:
# Check for missing values
dataset.isnull().sum()

Record_ID                 0
Auction_ID                0
Bidder_ID                 0
Bidder_Tendency           0
Bidding_Ratio             0
Successive_Outbidding     0
Last_Bidding              0
Auction_Bids              0
Starting_Price_Average    0
Early_Bidding             0
Winning_Ratio             0
Auction_Duration          0
Class                     0
dtype: int64

In [None]:
dataset = dataset.drop(['Record_ID', 'Auction_ID', 'Bidder_ID'], axis=1)
dataset.head()

Unnamed: 0,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0
3,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,5,0
4,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,7,0


## Logistic Regression


In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tqdm import tqdm

In [8]:
numerical_features = ['Bidder_Tendency', 'Bidding_Ratio', 'Successive_Outbidding',
                      'Last_Bidding', 'Auction_Bids', 'Starting_Price_Average',
                      'Early_Bidding', 'Winning_Ratio', 'Auction_Duration']
categorical_features = []  # Add if you have any categorical fields

# Preprocessor for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler())  # Standardize features
])

# Preprocessor for categorical features (if any exist)
if categorical_features:
    categorical_transformer = Pipeline(steps=[
        # Handle missing categories
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode
    ])

# Combine preprocessors into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
    ] + ([
        ('cat', categorical_transformer, categorical_features)
    ] if categorical_features else []),
    remainder='drop'  # Drop other unlisted columns
)


def comprehensive_model_evaluation(X, y, classifier, param_grid, split_ratios=[0.2, 0.5, 0.8]):
    """
    Comprehensive model evaluation across multiple splits and configurations
    """
    all_results = []

    for test_size in tqdm(split_ratios, desc="Split Ratios"):
        for seed in tqdm(range(3), desc="Random Seeds", leave=False):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=42 + seed)

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', classifier)
            ])

            grid_search = GridSearchCV(
                pipeline,
                param_grid,
                cv=5,
                scoring='accuracy',
                n_jobs=-1
            )
            grid_search.fit(X_train, y_train)

            best_model = grid_search.best_estimator_
            train_pred = best_model.predict(X_train)
            test_pred = best_model.predict(X_test)

            result = {
                'Test Size': f"{int((1-test_size)*100)}-{int(test_size*100)}",
                'Random Seed': seed,
                'Best Params': str(grid_search.best_params_),
                'Best CV Score': grid_search.best_score_,
                'Train Accuracy': accuracy_score(y_train, train_pred),
                'Test Accuracy': accuracy_score(y_test, test_pred),
                'Classifier': type(classifier).__name__
            }

            report = classification_report(y_test, test_pred, output_dict=True)
            result.update({
                'Macro Precision': report['macro avg']['precision'],
                'Macro Recall': report['macro avg']['recall'],
                'Macro F1-Score': report['macro avg']['f1-score']
            })

            all_results.append(result)

    results_df = pd.DataFrame(all_results)
    print("Evaluation Summary:")
    print(results_df.groupby(['Test Size', 'Classifier'])[
          ['Train Accuracy', 'Test Accuracy', 'Best CV Score']].agg(['mean', 'std']))

    return results_df

In [9]:
X = dataset.drop(columns=['Class'])
y = dataset['Class']

In [10]:
y.value_counts()

Class
0    5646
1     675
Name: count, dtype: int64

## Logistic Regression


In [11]:
lr_param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear']
}

# Comprehensive evaluation
lr_results = comprehensive_model_evaluation(
    X, y,
    LogisticRegression(),
    lr_param_grid
)

lr_results

Split Ratios: 100%|██████████| 3/3 [00:07<00:00,  2.53s/it]

Evaluation Summary:
                             Train Accuracy           Test Accuracy            \
                                       mean       std          mean       std   
Test Size Classifier                                                            
19-80     LogisticRegression       0.979958  0.003567      0.978116  0.001015   
50-50     LogisticRegression       0.978903  0.004032      0.978277  0.003247   
80-20     LogisticRegression       0.978837  0.001101      0.977075  0.004183   

                             Best CV Score            
                                      mean       std  
Test Size Classifier                                  
19-80     LogisticRegression      0.978112  0.005264  
50-50     LogisticRegression      0.978270  0.004297  
80-20     LogisticRegression      0.979167  0.001257  





Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__C': 0.001, 'classifier__penalty'...",0.980618,0.980024,0.972332,LogisticRegression,0.902202,0.967821,0.931637
1,80-20,1,"{'classifier__C': 0.001, 'classifier__penalty'...",0.978441,0.977848,0.980237,LogisticRegression,0.933372,0.976832,0.95366
2,80-20,2,"{'classifier__C': 0.001, 'classifier__penalty'...",0.978441,0.978639,0.978656,LogisticRegression,0.920507,0.974373,0.945272
3,50-50,0,"{'classifier__C': 0.1, 'classifier__penalty': ...",0.983228,0.983544,0.974692,LogisticRegression,0.914641,0.965127,0.937904
4,50-50,1,"{'classifier__C': 0.001, 'classifier__penalty'...",0.975949,0.976266,0.981019,LogisticRegression,0.931003,0.976103,0.952046
5,50-50,2,"{'classifier__C': 0.1, 'classifier__penalty': ...",0.975633,0.976899,0.979121,LogisticRegression,0.941276,0.948322,0.944762
6,19-80,0,"{'classifier__C': 0.01, 'classifier__penalty':...",0.982596,0.983386,0.977259,LogisticRegression,0.923298,0.967051,0.943713
7,19-80,1,"{'classifier__C': 1, 'classifier__penalty': 'l...",0.972316,0.976266,0.977852,LogisticRegression,0.938283,0.945504,0.941855
8,19-80,2,"{'classifier__C': 0.1, 'classifier__penalty': ...",0.979425,0.980222,0.979237,LogisticRegression,0.92477,0.975154,0.948057


## K-Nearest Neighbors


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [13]:
knn_param_grid = {
    # K from 1 to 104 with step 4
    'classifier__n_neighbors': np.arange(1, 105, 4),
    # 'uniform' or 'distance' weights
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan'],  # Distance metrics
}

# Assuming 'comprehensive_model_evaluation' is a function for evaluation
knn_results = comprehensive_model_evaluation(
    X, y,
    KNeighborsClassifier(),
    knn_param_grid
)

knn_results

Split Ratios: 100%|██████████| 3/3 [00:31<00:00, 10.38s/it]

Evaluation Summary:
                               Train Accuracy      Test Accuracy            \
                                         mean  std          mean       std   
Test Size Classifier                                                         
19-80     KNeighborsClassifier            1.0  0.0      0.986224  0.000975   
50-50     KNeighborsClassifier            1.0  0.0      0.987978  0.002214   
80-20     KNeighborsClassifier            1.0  0.0      0.990250  0.000456   

                               Best CV Score            
                                        mean       std  
Test Size Classifier                                    
19-80     KNeighborsClassifier      0.986021  0.001647  
50-50     KNeighborsClassifier      0.989135  0.002154  
80-20     KNeighborsClassifier      0.991232  0.001160  





Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__metric': 'manhattan', 'classifie...",0.99209,1.0,0.990514,KNeighborsClassifier,0.963397,0.988011,0.975284
1,80-20,1,"{'classifier__metric': 'manhattan', 'classifie...",0.989913,1.0,0.989723,KNeighborsClassifier,0.973346,0.976185,0.97476
2,80-20,2,"{'classifier__metric': 'manhattan', 'classifie...",0.991693,1.0,0.990514,KNeighborsClassifier,0.965339,0.984411,0.974625
3,50-50,0,"{'classifier__metric': 'manhattan', 'classifie...",0.990823,1.0,0.987029,KNeighborsClassifier,0.962397,0.970746,0.966522
4,50-50,1,"{'classifier__metric': 'manhattan', 'classifie...",0.986709,1.0,0.986397,KNeighborsClassifier,0.965544,0.961827,0.963676
5,50-50,2,"{'classifier__metric': 'manhattan', 'classifie...",0.989873,1.0,0.990509,KNeighborsClassifier,0.967791,0.982695,0.975089
6,19-80,0,"{'classifier__metric': 'manhattan', 'classifie...",0.986549,1.0,0.987344,KNeighborsClassifier,0.966344,0.967851,0.967096
7,19-80,1,"{'classifier__metric': 'manhattan', 'classifie...",0.984174,1.0,0.985762,KNeighborsClassifier,0.961614,0.963138,0.962375
8,19-80,2,"{'classifier__metric': 'manhattan', 'classifie...",0.987339,1.0,0.985565,KNeighborsClassifier,0.956638,0.967933,0.962194


## Decision Trees


In [14]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
dt_param_grid = {
    # Maximum depth of the tree
    'classifier__max_depth': [None, 5, 10, 20, 30],
    # Minimum number of samples required to split a node
    'classifier__min_samples_split': [2, 5, 10],
    # Minimum number of samples required to be at a leaf node
    'classifier__min_samples_leaf': [1, 2, 4],
    # The function to measure the quality of a split
    'classifier__criterion': ['gini', 'entropy'],
    # Strategy used to split at each node
    'classifier__splitter': ['best', 'random'],
    # The number of features to consider for the best split
    'classifier__max_features': [None, 'sqrt', 'log2']
}

# Model evaluation using comprehensive_model_evaluation
dt_results = comprehensive_model_evaluation(
    X, y,
    DecisionTreeClassifier(),
    dt_param_grid
)

dt_results

Split Ratios: 100%|██████████| 3/3 [00:42<00:00, 14.13s/it]

Evaluation Summary:
                                 Train Accuracy           Test Accuracy  \
                                           mean       std          mean   
Test Size Classifier                                                      
19-80     DecisionTreeClassifier       0.999736  0.000457      0.994727   
50-50     DecisionTreeClassifier       0.999051  0.000837      0.996626   
80-20     DecisionTreeClassifier       1.000000  0.000000      0.996047   

                                           Best CV Score            
                                       std          mean       std  
Test Size Classifier                                                
19-80     DecisionTreeClassifier  0.001510      0.996307  0.001647  
50-50     DecisionTreeClassifier  0.002222      0.997785  0.001141  
80-20     DecisionTreeClassifier  0.001369      0.998154  0.000604  





Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__criterion': 'entropy', 'classifi...",0.998023,1.0,0.996838,DecisionTreeClassifier,0.991542,0.991542,0.991542
1,80-20,1,"{'classifier__criterion': 'entropy', 'classifi...",0.998813,1.0,0.996838,DecisionTreeClassifier,0.989349,0.995212,0.992257
2,80-20,2,"{'classifier__criterion': 'entropy', 'classifi...",0.997627,1.0,0.994466,DecisionTreeClassifier,0.983294,0.986612,0.984945
3,50-50,0,"{'classifier__criterion': 'entropy', 'classifi...",0.999051,1.0,0.994306,DecisionTreeClassifier,0.982769,0.987756,0.985246
4,50-50,1,"{'classifier__criterion': 'gini', 'classifier_...",0.997468,0.998734,0.998735,DecisionTreeClassifier,0.996634,0.996634,0.996634
5,50-50,2,"{'classifier__criterion': 'gini', 'classifier_...",0.996835,0.998418,0.996836,DecisionTreeClassifier,0.986548,0.996899,0.991653
6,19-80,0,"{'classifier__criterion': 'gini', 'classifier_...",0.997625,1.0,0.993079,DecisionTreeClassifier,0.986991,0.976722,0.981785
7,19-80,1,"{'classifier__criterion': 'gini', 'classifier_...",0.99446,0.999209,0.995056,DecisionTreeClassifier,0.978345,0.996411,0.987157
8,19-80,2,"{'classifier__criterion': 'entropy', 'classifi...",0.996835,1.0,0.996045,DecisionTreeClassifier,0.987166,0.991997,0.989566


## Bagging Decisoin Tree


In [16]:
from sklearn.ensemble import BaggingClassifier

In [17]:
bagging_param_grid = {
    # Number of base estimators (trees)
    'classifier__n_estimators': [10, 50, 100, 200],
    # Proportion of samples to train each base estimator
    'classifier__max_samples': [0.5, 0.7, 1.0],
    # Proportion of features to train each base estimator
    'classifier__max_features': [0.5, 0.7, 1.0],
    # Whether to use bootstrap sampling
    'classifier__bootstrap': [True, False],
    # Base estimator (Decision Tree)
    'classifier__estimator': [DecisionTreeClassifier(random_state=42)],
    'classifier__random_state': [42]  # For reproducibility
}

# Model evaluation using comprehensive_model_evaluation
bagging_results = comprehensive_model_evaluation(
    X, y,
    BaggingClassifier(),
    bagging_param_grid
)

Split Ratios: 100%|██████████| 3/3 [02:08<00:00, 42.74s/it]

Evaluation Summary:
                            Train Accuracy           Test Accuracy            \
                                      mean       std          mean       std   
Test Size Classifier                                                           
19-80     BaggingClassifier       0.999473  0.000914      0.994661  0.002202   
50-50     BaggingClassifier       0.998840  0.000731      0.997153  0.002471   
80-20     BaggingClassifier       0.999670  0.000571      0.998682  0.000456   

                            Best CV Score            
                                     mean       std  
Test Size Classifier                                 
19-80     BaggingClassifier      0.994986  0.001994  
50-50     BaggingClassifier      0.997574  0.000483  
80-20     BaggingClassifier      0.998088  0.000892  





In [18]:
bagging_results

Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__bootstrap': True, 'classifier__e...",0.998023,1.0,0.998419,BaggingClassifier,0.999119,0.992424,0.995742
1,80-20,1,"{'classifier__bootstrap': False, 'classifier__...",0.999011,1.0,0.998419,BaggingClassifier,0.996105,0.996105,0.996105
2,80-20,2,"{'classifier__bootstrap': True, 'classifier__e...",0.997231,0.999011,0.999209,BaggingClassifier,0.99956,0.996124,0.997835
3,50-50,0,"{'classifier__bootstrap': True, 'classifier__e...",0.998101,0.998418,0.994306,BaggingClassifier,0.98517,0.98517,0.98517
4,50-50,1,"{'classifier__bootstrap': True, 'classifier__e...",0.997152,0.998418,0.998418,BaggingClassifier,0.995142,0.996458,0.995799
5,50-50,2,"{'classifier__bootstrap': True, 'classifier__e...",0.997468,0.999684,0.998735,BaggingClassifier,0.99403,0.999293,0.996643
6,19-80,0,"{'classifier__bootstrap': False, 'classifier__...",0.996835,1.0,0.992683,BaggingClassifier,0.981321,0.980543,0.980931
7,19-80,1,"{'classifier__bootstrap': True, 'classifier__e...",0.995251,0.998418,0.994265,BaggingClassifier,0.977745,0.992665,0.985054
8,19-80,2,"{'classifier__bootstrap': False, 'classifier__...",0.992873,1.0,0.997034,BaggingClassifier,0.988598,0.995859,0.992194


## Random Forest


In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
rf_param_grid = {
    'classifier__n_estimators': [1024],  # Number of trees set to 1024
    # Various options for max_features
    'classifier__max_features': [1, 2, 4, 6, 8, 12, 16, 20],
    'classifier__random_state': [42]  # For reproducibility
}

# Model evaluation using comprehensive_model_evaluation
rf_results = comprehensive_model_evaluation(
    X, y,
    RandomForestClassifier(),
    rf_param_grid
)

rf_results

Split Ratios: 100%|██████████| 3/3 [02:21<00:00, 47.24s/it]

Evaluation Summary:
                                 Train Accuracy      Test Accuracy            \
                                           mean  std          mean       std   
Test Size Classifier                                                           
19-80     RandomForestClassifier            1.0  0.0      0.994793  0.001981   
50-50     RandomForestClassifier            1.0  0.0      0.997575  0.002289   
80-20     RandomForestClassifier            1.0  0.0      0.998682  0.000456   

                                 Best CV Score            
                                          mean       std  
Test Size Classifier                                      
19-80     RandomForestClassifier      0.992089  0.001586  
50-50     RandomForestClassifier      0.997046  0.000659  
80-20     RandomForestClassifier      0.997890  0.000302  





Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__max_features': 6, 'classifier__n...",0.997825,1.0,0.998419,RandomForestClassifier,0.999119,0.992424,0.995742
1,80-20,1,"{'classifier__max_features': 8, 'classifier__n...",0.99822,1.0,0.998419,RandomForestClassifier,0.996105,0.996105,0.996105
2,80-20,2,"{'classifier__max_features': 6, 'classifier__n...",0.997627,1.0,0.999209,RandomForestClassifier,0.99956,0.996124,0.997835
3,50-50,0,"{'classifier__max_features': 8, 'classifier__n...",0.997785,1.0,0.994938,RandomForestClassifier,0.9844,0.989404,0.986885
4,50-50,1,"{'classifier__max_features': 8, 'classifier__n...",0.996835,1.0,0.999051,RandomForestClassifier,0.99682,0.99814,0.997479
5,50-50,2,"{'classifier__max_features': 6, 'classifier__n...",0.996519,1.0,0.998735,RandomForestClassifier,0.995319,0.997959,0.996634
6,19-80,0,"{'classifier__max_features': 12, 'classifier__...",0.993673,1.0,0.993277,RandomForestClassifier,0.982492,0.982492,0.982492
7,19-80,1,"{'classifier__max_features': 8, 'classifier__n...",0.992092,1.0,0.994068,RandomForestClassifier,0.974778,0.995032,0.984626
8,19-80,2,"{'classifier__max_features': 8, 'classifier__n...",0.990501,1.0,0.997034,RandomForestClassifier,0.988598,0.995859,0.992194


In [21]:
combined_results = pd.concat([
    lr_results, 
    knn_results, 
    dt_results, 
    bagging_results, 
    rf_results
])

# Aggregate and compare key metrics
comparison_summary = combined_results.groupby('Classifier')[
    ['Test Accuracy', 'Macro F1-Score', 'Best CV Score']
].agg(['mean', 'std'])

In [22]:
comparison_summary

Unnamed: 0_level_0,Test Accuracy,Test Accuracy,Macro F1-Score,Macro F1-Score,Best CV Score,Best CV Score
Unnamed: 0_level_1,mean,std,mean,std,mean,std
Classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
BaggingClassifier,0.996832,0.002425,0.991719,0.006301,0.996883,0.001823
DecisionTreeClassifier,0.9958,0.001727,0.988976,0.004592,0.997415,0.001346
KNeighborsClassifier,0.988151,0.002138,0.969069,0.005806,0.988796,0.002708
LogisticRegression,0.977823,0.002754,0.944323,0.006805,0.978516,0.00349
RandomForestClassifier,0.997017,0.002314,0.992221,0.005989,0.995675,0.002851
