# Fit Logistic Regression Classifier

In [37]:
# Packages
import pandas as pd
import sklearn
import os
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
import joblib

## Load Training Data

In [38]:
def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.
    '''
    # If path does not end in a slash, add one
    if directory_path[-1] != '/':
        directory_path += '/'
    # list of files in directory
    file_list = [f for f in os.listdir(directory_path) if f.endswith('.parquet')]
    # read in all parquet files
    combined_df = pd.concat([pd.read_parquet(directory_path + f) for f in file_list])
    # Return combined dataframe
    return combined_df

In [39]:
training_data = combine_directory_parquets('../../../Data/Features/All Features/train')
# Drop Image Path, test_80_20
training_data = training_data.drop(columns=['Image Path', 'test_80_20'])
# Create class mapping
# class_mapping = {class_name: index for index, class_name in enumerate(training_data['Class'].unique())}
# print(class_mapping)
# training_data['Class'] = training_data['Class'].map(class_mapping)
training_data

Unnamed: 0,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,ViT_Embedding_Element_9,...,ViT_Embedding_Element_1271,ViT_Embedding_Element_1272,ViT_Embedding_Element_1273,ViT_Embedding_Element_1274,ViT_Embedding_Element_1275,ViT_Embedding_Element_1276,ViT_Embedding_Element_1277,ViT_Embedding_Element_1278,ViT_Embedding_Element_1279,Class
0,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,-0.016147,...,0.231262,-0.059224,0.173239,0.363462,0.457626,-0.077351,-0.236950,-0.031632,-0.261893,Sedan
1,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,0.078013,...,0.261749,0.215697,0.023538,0.393746,0.455197,0.223018,-0.265846,-0.200683,-0.405006,SUV
2,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,-0.041193,...,0.010302,0.111003,0.158716,0.380261,0.493224,0.169883,-0.105756,0.124275,-0.446003,Convertible
3,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,0.139090,...,0.256229,0.048522,-0.032885,0.400770,0.430547,0.214644,-0.323948,-0.276459,-0.414079,Pickup
4,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,0.182784,...,0.034098,-0.089879,0.237541,0.321542,0.343445,0.047995,-0.305656,0.091199,-0.370617,SUV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,-0.158255,0.157451,-0.432161,-0.277322,-0.183226,0.360362,0.075823,-0.038296,-0.218423,0.415864,...,0.051396,-0.170626,0.026921,0.312183,0.341269,0.320813,-0.311571,0.077954,-0.347245,Convertible
408,-0.418245,0.207232,0.252406,-0.184066,0.186511,0.335708,-0.253648,0.084424,0.074907,0.242903,...,0.174837,-0.015541,0.334850,0.320183,0.060452,0.142291,-0.129446,-0.244659,-0.262288,SUV
409,-0.150467,0.195957,0.038565,-0.442825,0.062831,0.389507,-0.077920,0.054704,-0.321771,-0.055589,...,0.102700,0.030044,0.330523,0.191709,0.393343,0.126389,-0.291039,-0.021738,-0.298589,SUV
410,-0.016809,0.228351,-0.182011,-0.242165,0.052763,0.387869,0.048351,0.074478,-0.541290,0.097804,...,0.216685,0.016541,0.079678,0.477238,0.572567,0.115644,-0.315334,-0.274688,-0.190796,Sedan


## Hyperparameter Settings

Regularization: none, L1, L2

Balanced Class Weights: yes, no

Multi-Class Strategy: one-vs-rest, multinomial

In [40]:
# Create combinations of all hyperparameter settings
regularization = ['l1', 'l2', None]
class_weight = ['balanced', None]
multi_class_strategy = ['ovr', 'multinomial']
# Hyperparameter setting combinations dataframe
hyperparameters_setting_combos = pd.DataFrame(columns=['regularization', 'class_weight', 'multi_class_strategy'])
# Iterate through all hyperparameter settings
for reg in regularization:
    for cw in class_weight:
        for mcs in multi_class_strategy:
            hyperparameters_setting_combos = pd.concat([hyperparameters_setting_combos, pd.DataFrame({'regularization': reg, 'class_weight': cw, 'multi_class_strategy': mcs}, index=[0])])

hyperparameters_setting_combos

Unnamed: 0,regularization,class_weight,multi_class_strategy
0,l1,balanced,ovr
0,l1,balanced,multinomial
0,l1,,ovr
0,l1,,multinomial
0,l2,balanced,ovr
0,l2,balanced,multinomial
0,l2,,ovr
0,l2,,multinomial
0,,balanced,ovr
0,,balanced,multinomial


## Create Hyperparameter Setting Validation Set

Sample 10% of the training data for validation.

Note still using cross validation for l1, l2, etc. choices

In [41]:
hps_validation_df = training_data.sample(frac=0.1, random_state=290)
X_hps_validation = hps_validation_df.drop(columns=['Class'])
y_hps_validation = hps_validation_df['Class']
hps_training_df = training_data.drop(hps_validation_df.index)
X_hps_training = hps_training_df.drop(columns=['Class'])
y_hps_training = hps_training_df['Class']

## Preprocess Data

In [42]:
# Use StandardScaler to scale the data
scaler = sklearn.preprocessing.StandardScaler()
X_hps_training = scaler.fit_transform(X_hps_training)
X_hps_validation = scaler.transform(X_hps_validation)

## Function to Evaluate Hyperparameter Settings

In [44]:
def eval_hyperparameter_settings(regularization, class_weight, multi_class_strategy):
    '''
    Evaluate hyperparameter settings using Logistic Regression. Returns validation accuracy.
    '''
    # Fit model
    if regularization in ['l1', 'l2']:
        clf = LogisticRegressionCV(cv=10, random_state=290, penalty=regularization, class_weight=class_weight, multi_class=multi_class_strategy, solver='saga').fit(X_hps_training, y_hps_training)
    else:
        clf = LogisticRegression(random_state=290, penalty=regularization, class_weight=class_weight, multi_class=multi_class_strategy, solver='saga').fit(X_hps_training, y_hps_training)
    # Return validation accuracy
    return clf.score(X_hps_validation, y_hps_validation)

In [45]:
# Add column to hyperparameters_setting_combos for validation accuracy
hyperparameters_setting_combos['validation_accuracy'] = hyperparameters_setting_combos.apply(lambda row: eval_hyperparameter_settings(row['regularization'], row['class_weight'], row['multi_class_strategy']), axis=1)
hyperparameters_setting_combos



Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\ijyli\anaconda3\envs\computervision\Lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\ijyli\AppData\Local\Temp\ipykernel_14520\2958825575.py", line 2, in <module>
    hyperparameters_setting_combos['validation_accuracy'] = hyperparameters_setting_combos.apply(lambda row: eval_hyperparameter_settings(row['regularization'], row['class_weight'], row['multi_class_strategy']), axis=1)
                                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ijyli\anaconda3\envs\computervision\Lib\site-packages\pandas\core\frame.py", line 10034, in apply
    return op.apply().__finalize__(self, method="apply")
           ^^^^^^^^^^
  File "c:\Users\ijyli\anaconda3\envs\computervision\L

## Select Row With Highest Validation Score

In [None]:
highest_accuracy = hyperparameters_setting_combos['validation_accuracy'].max()
best_hyperparameters = dict(hyperparameters_setting_combos[hyperparameters_setting_combos['validation_accuracy'] == highest_accuracy])
best_hyperparameters

## Fit Model on Full Training Data

In [None]:
# Create matrices for training
X = training_data.drop(columns=['Class'])
y = training_data['Class']

# Preprocess with standard scalar
scaler = sklearn.preprocessing.StandardScaler()
X = scaler.fit_transform(X)

# Fit model
if best_hyperparameters['regularization'] in ['l1', 'l2']:
    clf = LogisticRegressionCV(cv=10, random_state=290, penalty=best_hyperparameters['regularization'], class_weight=best_hyperparameters['class_weight'], multi_class=best_hyperparameters['multi_class_strategy'], solver='saga').fit(X, y)
else:
    clf = LogisticRegression(random_state=290, penalty=best_hyperparameters['regularization'], class_weight=best_hyperparameters['class_weight'], multi_class=best_hyperparameters['multi_class_strategy'], solver='saga').fit(X, y)

# save
joblib.dump(clf, "Best Logistic Regression Model.pkl") 