# COGS 118A Final Project Fall 2024

## Dataset 4: [In-Vehicle Coupon Recommendation](https://archive.ics.uci.edu/dataset/603/in+vehicle+coupon+recommendation)


In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

### EDA and Preprocessing


In [13]:
from ucimlrepo import fetch_ucirepo
  
# fetch dataset 
rice_cammeo_and_osmancik = fetch_ucirepo(id=545) 

dataset = rice_cammeo_and_osmancik.data.original

dataset.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896,Cammeo
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436,Cammeo
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259,Cammeo
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669,Cammeo
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024,Cammeo


In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3810 entries, 0 to 3809
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Area               3810 non-null   int64  
 1   Perimeter          3810 non-null   float64
 2   Major_Axis_Length  3810 non-null   float64
 3   Minor_Axis_Length  3810 non-null   float64
 4   Eccentricity       3810 non-null   float64
 5   Convex_Area        3810 non-null   int64  
 6   Extent             3810 non-null   float64
 7   Class              3810 non-null   object 
dtypes: float64(5), int64(2), object(1)
memory usage: 238.3+ KB


In [15]:
dataset.shape

(3810, 8)

In [16]:
# Check for missing values
dataset.isnull().sum()

Area                 0
Perimeter            0
Major_Axis_Length    0
Minor_Axis_Length    0
Eccentricity         0
Convex_Area          0
Extent               0
Class                0
dtype: int64

In [17]:
dataset['Class'].value_counts()

Class
Osmancik    2180
Cammeo      1630
Name: count, dtype: int64

In [18]:
dataset['Class'] = dataset['Class'].apply(lambda x: 1 if x == 'Cammeo' else 0)

dataset['Class'].value_counts()

Class
0    2180
1    1630
Name: count, dtype: int64

## Logistic Regression


In [19]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tqdm import tqdm

In [20]:
# Categorical and numerical columns
numerical_features = ['Area', 'Perimeter', 'Major_Axis_Length', 
                      'Minor_Axis_Length', 'Eccentricity', 
                      'Convex_Area', 'Extent']

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', StandardScaler())  # Scale numerical data
])

# Combine everything in a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_features),
    ],
    remainder='drop'  # Drop any columns not specified
)

def comprehensive_model_evaluation(X, y, classifier, param_grid, split_ratios=[0.2, 0.5, 0.8]):
    """
    Comprehensive model evaluation across multiple splits and configurations
    """
    all_results = []

    for test_size in tqdm(split_ratios, desc="Split Ratios"):
        for seed in tqdm(range(3), desc="Random Seeds", leave=False):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=42 + seed)

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', classifier)
            ])

            grid_search = GridSearchCV(
                pipeline,
                param_grid,
                cv=5,
                scoring='accuracy',
                n_jobs=-1
            )
            grid_search.fit(X_train, y_train)

            best_model = grid_search.best_estimator_
            train_pred = best_model.predict(X_train)
            test_pred = best_model.predict(X_test)

            result = {
                'Test Size': f"{int((1-test_size)*100)}-{int(test_size*100)}",
                'Random Seed': seed,
                'Best Params': str(grid_search.best_params_),
                'Best CV Score': grid_search.best_score_,
                'Train Accuracy': accuracy_score(y_train, train_pred),
                'Test Accuracy': accuracy_score(y_test, test_pred),
                'Classifier': type(classifier).__name__
            }

            report = classification_report(y_test, test_pred, output_dict=True)
            result.update({
                'Macro Precision': report['macro avg']['precision'],
                'Macro Recall': report['macro avg']['recall'],
                'Macro F1-Score': report['macro avg']['f1-score']
            })

            all_results.append(result)

    results_df = pd.DataFrame(all_results)
    print("Evaluation Summary:")
    print(results_df.groupby(['Test Size', 'Classifier'])[
          ['Train Accuracy', 'Test Accuracy', 'Best CV Score']].agg(['mean', 'std']))

    return results_df

In [21]:
X = dataset.drop(columns=['Class'])
y = dataset['Class']

In [23]:
y.value_counts()

Class
0    2180
1    1630
Name: count, dtype: int64

## Logistic Regression


In [24]:
lr_param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear']
}

# Comprehensive evaluation
lr_results = comprehensive_model_evaluation(
    X, y,
    LogisticRegression(),
    lr_param_grid
)

lr_results

Split Ratios: 100%|██████████| 3/3 [00:05<00:00,  1.84s/it]

Evaluation Summary:
                             Train Accuracy           Test Accuracy            \
                                       mean       std          mean       std   
Test Size Classifier                                                            
19-80     LogisticRegression       0.933071  0.013826      0.927493  0.003991   
50-50     LogisticRegression       0.931409  0.003076      0.929659  0.002923   
80-20     LogisticRegression       0.932196  0.000947      0.928696  0.002732   

                             Best CV Score            
                                      mean       std  
Test Size Classifier                                  
19-80     LogisticRegression      0.932207  0.013769  
50-50     LogisticRegression      0.931409  0.003724  
80-20     LogisticRegression      0.931539  0.002238  





Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__C': 10, 'classifier__penalty': '...",0.929786,0.932743,0.927822,LogisticRegression,0.927797,0.926803,0.927259
1,80-20,1,"{'classifier__C': 10, 'classifier__penalty': '...",0.930772,0.931102,0.931759,LogisticRegression,0.930958,0.9296,0.930252
2,80-20,2,"{'classifier__C': 10, 'classifier__penalty': '...",0.93406,0.932743,0.926509,LogisticRegression,0.924871,0.925479,0.925168
3,50-50,0,"{'classifier__C': 0.1, 'classifier__penalty': ...",0.928084,0.929134,0.929134,LogisticRegression,0.930613,0.925307,0.927546
4,50-50,1,"{'classifier__C': 100, 'classifier__penalty': ...",0.935433,0.934908,0.927034,LogisticRegression,0.927906,0.923342,0.925327
5,50-50,2,"{'classifier__C': 10, 'classifier__penalty': '...",0.930709,0.930184,0.932808,LogisticRegression,0.931156,0.931431,0.931292
6,19-80,0,"{'classifier__C': 1, 'classifier__penalty': 'l...",0.918662,0.918635,0.929462,LogisticRegression,0.928272,0.928041,0.928156
7,19-80,1,"{'classifier__C': 100, 'classifier__penalty': ...",0.931768,0.934383,0.930118,LogisticRegression,0.928384,0.928957,0.928665
8,19-80,2,"{'classifier__C': 1, 'classifier__penalty': 'l...",0.94619,0.946194,0.9229,LogisticRegression,0.919997,0.923065,0.921372


## K-Nearest Neighbors


In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [26]:
knn_param_grid = {
    # K from 1 to 104 with step 4
    'classifier__n_neighbors': np.arange(1, 105, 4),
    # 'uniform' or 'distance' weights
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan'],  # Distance metrics
}

# Assuming 'comprehensive_model_evaluation' is a function for evaluation
knn_results = comprehensive_model_evaluation(
    X, y,
    KNeighborsClassifier(),
    knn_param_grid
)

knn_results

Split Ratios: 100%|██████████| 3/3 [00:16<00:00,  5.55s/it]

Evaluation Summary:
                               Train Accuracy           Test Accuracy  \
                                         mean       std          mean   
Test Size Classifier                                                    
19-80     KNeighborsClassifier       0.949694  0.046552      0.924650   
50-50     KNeighborsClassifier       0.958180  0.036491      0.924234   
80-20     KNeighborsClassifier       0.953959  0.040021      0.927384   

                                         Best CV Score            
                                     std          mean       std  
Test Size Classifier                                              
19-80     KNeighborsClassifier  0.002462      0.928259  0.013887  
50-50     KNeighborsClassifier  0.003375      0.930884  0.003076  
80-20     KNeighborsClassifier  0.010607      0.928916  0.003034  





Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__metric': 'manhattan', 'classifie...",0.927161,0.927493,0.929134,KNeighborsClassifier,0.928662,0.928662,0.928662
1,80-20,1,"{'classifier__metric': 'euclidean', 'classifie...",0.927167,1.0,0.937008,KNeighborsClassifier,0.936011,0.935337,0.935667
2,80-20,2,"{'classifier__metric': 'euclidean', 'classifie...",0.932419,0.934383,0.91601,KNeighborsClassifier,0.914188,0.914781,0.914478
3,50-50,0,"{'classifier__metric': 'euclidean', 'classifie...",0.929659,0.932808,0.921785,KNeighborsClassifier,0.924544,0.916747,0.919823
4,50-50,1,"{'classifier__metric': 'manhattan', 'classifie...",0.934383,0.941732,0.922835,KNeighborsClassifier,0.923264,0.919352,0.921081
5,50-50,2,"{'classifier__metric': 'manhattan', 'classifie...",0.928609,1.0,0.928084,KNeighborsClassifier,0.926272,0.926679,0.926473
6,19-80,0,"{'classifier__metric': 'euclidean', 'classifie...",0.913416,0.908136,0.927493,KNeighborsClassifier,0.927248,0.924874,0.925972
7,19-80,1,"{'classifier__metric': 'euclidean', 'classifie...",0.930427,1.0,0.923228,KNeighborsClassifier,0.921579,0.921579,0.921579
8,19-80,2,"{'classifier__metric': 'manhattan', 'classifie...",0.940936,0.940945,0.923228,KNeighborsClassifier,0.920528,0.922941,0.92164


## Decision Trees


In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
dt_param_grid = {
    # Maximum depth of the tree
    'classifier__max_depth': [None, 5, 10, 20, 30],
    # Minimum number of samples required to split a node
    'classifier__min_samples_split': [2, 5, 10],
    # Minimum number of samples required to be at a leaf node
    'classifier__min_samples_leaf': [1, 2, 4],
    # The function to measure the quality of a split
    'classifier__criterion': ['gini', 'entropy'],
    # Strategy used to split at each node
    'classifier__splitter': ['best', 'random'],
    # The number of features to consider for the best split
    'classifier__max_features': [None, 'sqrt', 'log2']
}

# Model evaluation using comprehensive_model_evaluation
dt_results = comprehensive_model_evaluation(
    X, y,
    DecisionTreeClassifier(),
    dt_param_grid
)

dt_results

Split Ratios: 100%|██████████| 3/3 [00:40<00:00, 13.35s/it]

Evaluation Summary:
                                 Train Accuracy           Test Accuracy  \
                                           mean       std          mean   
Test Size Classifier                                                      
19-80     DecisionTreeClassifier       0.915136  0.020089      0.917432   
50-50     DecisionTreeClassifier       0.927559  0.007926      0.916010   
80-20     DecisionTreeClassifier       0.930774  0.003868      0.923447   

                                           Best CV Score            
                                       std          mean       std  
Test Size Classifier                                                
19-80     DecisionTreeClassifier  0.006432      0.930452  0.012020  
50-50     DecisionTreeClassifier  0.009228      0.926509  0.004666  
80-20     DecisionTreeClassifier  0.008738      0.926399  0.002111  





Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__criterion': 'entropy', 'classifi...",0.925523,0.927493,0.927822,DecisionTreeClassifier,0.926896,0.928308,0.927471
1,80-20,1,"{'classifier__criterion': 'entropy', 'classifi...",0.924868,0.935039,0.929134,DecisionTreeClassifier,0.927191,0.92844,0.927789
2,80-20,2,"{'classifier__criterion': 'entropy', 'classifi...",0.928807,0.92979,0.913386,DecisionTreeClassifier,0.912558,0.910647,0.911542
3,50-50,0,"{'classifier__criterion': 'entropy', 'classifi...",0.928084,0.935958,0.922835,DecisionTreeClassifier,0.925231,0.918087,0.920952
4,50-50,1,"{'classifier__criterion': 'gini', 'classifier_...",0.930184,0.926509,0.905512,DecisionTreeClassifier,0.905986,0.901212,0.903254
5,50-50,2,"{'classifier__criterion': 'gini', 'classifier_...",0.92126,0.92021,0.919685,DecisionTreeClassifier,0.918012,0.917605,0.917806
6,19-80,0,"{'classifier__criterion': 'gini', 'classifier_...",0.919978,0.892388,0.916667,DecisionTreeClassifier,0.914091,0.917561,0.915527
7,19-80,1,"{'classifier__criterion': 'gini', 'classifier_...",0.927804,0.922572,0.911417,DecisionTreeClassifier,0.912617,0.906113,0.908845
8,19-80,2,"{'classifier__criterion': 'gini', 'classifier_...",0.943576,0.930446,0.924213,DecisionTreeClassifier,0.922522,0.922261,0.922391


## Bagging Decisoin Tree


In [29]:
from sklearn.ensemble import BaggingClassifier

In [30]:
bagging_param_grid = {
    # Number of base estimators (trees)
    'classifier__n_estimators': [10, 50, 100, 200],
    # Proportion of samples to train each base estimator
    'classifier__max_samples': [0.5, 0.7, 1.0],
    # Proportion of features to train each base estimator
    'classifier__max_features': [0.5, 0.7, 1.0],
    # Whether to use bootstrap sampling
    'classifier__bootstrap': [True, False],
    # Base estimator (Decision Tree)
    'classifier__estimator': [DecisionTreeClassifier(random_state=42)],
    'classifier__random_state': [42]  # For reproducibility
}

# Model evaluation using comprehensive_model_evaluation
bagging_results = comprehensive_model_evaluation(
    X, y,
    BaggingClassifier(),
    bagging_param_grid
)

Split Ratios: 100%|██████████| 3/3 [03:05<00:00, 61.88s/it]

Evaluation Summary:
                            Train Accuracy           Test Accuracy            \
                                      mean       std          mean       std   
Test Size Classifier                                                           
19-80     BaggingClassifier       0.977253  0.013145      0.924322  0.002184   
50-50     BaggingClassifier       0.976028  0.012134      0.924059  0.004243   
80-20     BaggingClassifier       0.976925  0.008999      0.924759  0.001515   

                            Best CV Score            
                                     mean       std  
Test Size Classifier                                 
19-80     BaggingClassifier      0.928268  0.015353  
50-50     BaggingClassifier      0.928259  0.002367  
80-20     BaggingClassifier      0.928259  0.002187  





In [31]:
bagging_results

Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__bootstrap': False, 'classifier__...",0.927164,0.987205,0.926509,BaggingClassifier,0.926817,0.92516,0.925884
1,80-20,1,"{'classifier__bootstrap': True, 'classifier__e...",0.926836,0.970472,0.923885,BaggingClassifier,0.922598,0.921945,0.922265
2,80-20,2,"{'classifier__bootstrap': True, 'classifier__e...",0.930777,0.973097,0.923885,BaggingClassifier,0.92244,0.92244,0.92244
3,50-50,0,"{'classifier__bootstrap': True, 'classifier__e...",0.928084,0.968504,0.92336,BaggingClassifier,0.925676,0.918689,0.921503
4,50-50,1,"{'classifier__bootstrap': True, 'classifier__e...",0.930709,0.969554,0.92021,BaggingClassifier,0.9213,0.916035,0.918277
5,50-50,2,"{'classifier__bootstrap': False, 'classifier__...",0.925984,0.990026,0.928609,BaggingClassifier,0.92709,0.926814,0.926951
6,19-80,0,"{'classifier__bootstrap': True, 'classifier__e...",0.910802,0.972441,0.926837,BaggingClassifier,0.92693,0.923848,0.925244
7,19-80,1,"{'classifier__bootstrap': True, 'classifier__e...",0.934374,0.992126,0.923228,BaggingClassifier,0.922162,0.920802,0.921455
8,19-80,2,"{'classifier__bootstrap': True, 'classifier__e...",0.939628,0.967192,0.9229,BaggingClassifier,0.921036,0.921122,0.921079


## Random Forest


In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
rf_param_grid = {
    'classifier__n_estimators': [1024],  # Number of trees set to 1024
    # Various options for max_features
    'classifier__max_features': [1, 2, 4, 6, 8, 12, 16, 20],
    'classifier__random_state': [42]  # For reproducibility
}

# Model evaluation using comprehensive_model_evaluation
rf_results = comprehensive_model_evaluation(
    X, y,
    RandomForestClassifier(),
    rf_param_grid
)

rf_results

Split Ratios: 100%|██████████| 3/3 [04:20<00:00, 86.67s/it] 

Evaluation Summary:
                                 Train Accuracy      Test Accuracy            \
                                           mean  std          mean       std   
Test Size Classifier                                                           
19-80     RandomForestClassifier            1.0  0.0      0.921150  0.003955   
50-50     RandomForestClassifier            1.0  0.0      0.922485  0.003940   
80-20     RandomForestClassifier            1.0  0.0      0.923010  0.004968   

                                 Best CV Score            
                                          mean       std  
Test Size Classifier                                      
19-80     RandomForestClassifier      0.920390  0.016272  
50-50     RandomForestClassifier      0.927384  0.005806  
80-20     RandomForestClassifier      0.926072  0.002674  





Unnamed: 0,Test Size,Random Seed,Best Params,Best CV Score,Train Accuracy,Test Accuracy,Classifier,Macro Precision,Macro Recall,Macro F1-Score
0,80-20,0,"{'classifier__max_features': 2, 'classifier__n...",0.924867,1.0,0.925197,RandomForestClassifier,0.925149,0.924161,0.924614
1,80-20,1,"{'classifier__max_features': 1, 'classifier__n...",0.924213,1.0,0.926509,RandomForestClassifier,0.925584,0.924243,0.924886
2,80-20,2,"{'classifier__max_features': 1, 'classifier__n...",0.929137,1.0,0.917323,RandomForestClassifier,0.916144,0.915206,0.91566
3,50-50,0,"{'classifier__max_features': 1, 'classifier__n...",0.928084,1.0,0.92021,RandomForestClassifier,0.922231,0.915623,0.918304
4,50-50,1,"{'classifier__max_features': 1, 'classifier__n...",0.932808,1.0,0.92021,RandomForestClassifier,0.920195,0.917037,0.918461
5,50-50,2,"{'classifier__max_features': 1, 'classifier__n...",0.92126,1.0,0.927034,RandomForestClassifier,0.925537,0.925123,0.925327
6,19-80,0,"{'classifier__max_features': 1, 'classifier__n...",0.901617,1.0,0.924869,RandomForestClassifier,0.924645,0.922112,0.923277
7,19-80,1,"{'classifier__max_features': 8, 'classifier__n...",0.929119,1.0,0.916995,RandomForestClassifier,0.915736,0.914483,0.915086
8,19-80,2,"{'classifier__max_features': 1, 'classifier__n...",0.930435,1.0,0.921588,RandomForestClassifier,0.91903,0.920904,0.919911


In [34]:
combined_results = pd.concat([
    lr_results, 
    knn_results, 
    dt_results, 
    bagging_results, 
    rf_results
])

# Aggregate and compare key metrics
comparison_summary = combined_results.groupby('Classifier')[
    ['Test Accuracy', 'Macro F1-Score', 'Best CV Score']
].agg(['mean', 'std'])

In [35]:
comparison_summary

Unnamed: 0_level_0,Test Accuracy,Test Accuracy,Macro F1-Score,Macro F1-Score,Best CV Score,Best CV Score
Unnamed: 0_level_1,mean,std,mean,std,mean,std
Classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
BaggingClassifier,0.92438,0.002522,0.922788,0.00274,0.928262,0.007844
DecisionTreeClassifier,0.918963,0.0079,0.917286,0.008352,0.927787,0.006832
KNeighborsClassifier,0.925423,0.00589,0.92393,0.006082,0.929353,0.007367
LogisticRegression,0.928616,0.002978,0.927226,0.002976,0.931718,0.007229
RandomForestClassifier,0.922215,0.003828,0.920614,0.004018,0.924616,0.009315
