In [5]:
# Attach Google Drive to the Colab runtime so the project data can be read
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import Libraries and Set Paths

In [6]:
# Project root on Drive, and the sub-folder that holds the extracted features
base_dir = '/content/drive/MyDrive/w281FinalProjectLogo/'
drive_save_dir = f'{base_dir}Logos-32plus_v1.0.1/feature_extraction/'

In [7]:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, plot_confusion_matrix, \
      roc_auc_score, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.decomposition import PCA
import seaborn as sns

### Load Data, Split

In [8]:
# Read the DataFrame of extracted features and keep only the rows that have
# a value in 'norm_hist'.
# NOTE(review): the file is unpickled even though its name ends in '.csv';
# unpickling can execute arbitrary code, so only load files you trust.
features_path = drive_save_dir + 'fe_merged_120822.csv'
df = pd.read_pickle(features_path).dropna(subset=['norm_hist'])

# Report how many rows remain and how they are spread over the splits
print(f"Num Bboxes: {len(df)}")
print(df['split'].value_counts())

Num Bboxes: 7359
train    6008
test      677
val       674
Name: split, dtype: int64


In [9]:
# Order the rows by class code so the derived class list follows code order
df = df.sort_values('class_code')

# One row per distinct (class_code, class) pair
class_df = df.loc[:, ['class_code', 'class']].drop_duplicates()

# Lookup from class code to class name, plus the class names in the order
# they first appear in the code-sorted frame
class_map = dict(zip(class_df['class_code'], class_df['class']))
class_names = class_df['class'].unique()

In [10]:
## Select features to use in modeling
def combine_feature_columns(frame, columns):
  """Concatenate the per-row values of several columns into flat lists.

  Args:
    frame: DataFrame whose cells in `columns` are iterables of values.
    columns: Column names to concatenate, in the order they should appear
      in the combined vector.

  Returns:
    A list with one entry per row of `frame` (in row order); each entry is a
    list holding the elements of that row's cells, one column after another.
  """
  # Walk the selected columns in lock-step instead of building a Series per
  # row with iterrows(); the resulting lists are the same.
  per_column = [frame[col] for col in columns]
  return [
      [value for cell in cells for value in cell]
      for cells in zip(*per_column)
  ]


# Section 1: Non-SIFT features
ns_features = ['cm_hsv_mean', 'cm_hsv_var', 'cm_hsv_skew',
               'cm_rgb_mean', 'cm_rgb_var', 'cm_rgb_skew',
               'cm_ycrcb_mean', 'cm_ycrcb_var', 'cm_ycrcb_skew',
               'hu_moments', 'contrast', 'dissimilarity',
               'homogeneity', 'energy', 'correlation', 'ASM']

# Non-SIFT: Combine each feature vector into 1-D array
features_ns = combine_feature_columns(df, ns_features)
df['features_ns'] = features_ns


# Section 2: All Features (the non-SIFT features followed by 'norm_hist')
all_features = ns_features + ['norm_hist']

# Combine each feature vector into 1-D array
features_all = combine_feature_columns(df, all_features)
df['features_all'] = features_all

In [11]:
## Partition the rows by their pre-assigned 'split' label
train_df, val_df, test_df = [
    df[df['split'] == split_name].copy()
    for split_name in ('train', 'val', 'test')
]

## Targets: the class code of every row, as arrays
y_train, y_val, y_test = [
    part['class_code'].values for part in (train_df, val_df, test_df)
]

# Section 1: Non-SIFT features
X_train_ns, X_val_ns, X_test_ns = [
    part['features_ns'].tolist() for part in (train_df, val_df, test_df)
]

# Section 2: All Features
X_train_all, X_val_all, X_test_all = [
    part['features_all'].tolist() for part in (train_df, val_df, test_df)
]

# Section 3: SIFT Features Only
X_train_sift, X_val_sift, X_test_sift = [
    part['norm_hist'].tolist() for part in (train_df, val_df, test_df)
]

In [12]:
## Standardize inputs: each scaler is fitted on the training rows only and
## then applied unchanged to the train, val and test rows
# Section 1: Non-SIFT features
scaler = StandardScaler().fit(X_train_ns)
X_train_scaled_ns, X_val_scaled_ns, X_test_scaled_ns = [
    scaler.transform(rows) for rows in (X_train_ns, X_val_ns, X_test_ns)
]

# Section 2: All Features
scaler = StandardScaler().fit(X_train_all)
X_train_scaled_all, X_val_scaled_all, X_test_scaled_all = [
    scaler.transform(rows) for rows in (X_train_all, X_val_all, X_test_all)
]

# Section 3: SIFT Features
scaler = StandardScaler().fit(X_train_sift)
X_train_scaled_sift, X_val_scaled_sift, X_test_scaled_sift = [
    scaler.transform(rows) for rows in (X_train_sift, X_val_sift, X_test_sift)
]

# Section 1: Non-SIFT Feature Modeling

## Model 1.1: SVM

In [13]:
# Linear SVC with class-balanced weights (manually tuned params),
# fitted on the standardized non-SIFT training features
lsvc = LinearSVC(
    C=0.25,
    penalty='l2',
    dual=False,
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
).fit(X_train_scaled_ns, y_train)

# Predict the test split
lsvc_preds = lsvc.predict(X_test_scaled_ns)

# Test accuracy
acc = accuracy_score(y_true=y_test, y_pred=lsvc_preds)
print(f"Accuracy: {acc:.2%}\n")

Accuracy: 79.03%



In [14]:
# Per-class precision / recall / F1 for the predictions held in lsvc_preds
report = classification_report(y_test, lsvc_preds, target_names=class_names)
print(report)

              precision    recall  f1-score   support

      adidas       0.26      0.18      0.21        28
       apple       0.68      0.79      0.73        82
         bmw       0.82      0.79      0.81        81
    cocacola       0.79      0.68      0.73        71
         dhl       0.95      0.91      0.93        88
       fedex       0.75      0.79      0.77        81
    heineken       0.85      0.55      0.67        31
       pepsi       0.89      0.93      0.91       100
   starbucks       0.76      0.82      0.79        79
         ups       0.79      0.94      0.86        36

    accuracy                           0.79       677
   macro avg       0.75      0.74      0.74       677
weighted avg       0.79      0.79      0.79       677



## Model 1.2: K Nearest-Neighbors

In [15]:
# Distance-weighted k-NN (manually tuned params on val set),
# fitted on the standardized non-SIFT training features
neigh = KNeighborsClassifier(weights='distance', n_neighbors=16)
neigh = neigh.fit(X_train_scaled_ns, y_train)

# Predicted labels and per-class probabilities for the test split
preds = neigh.predict(X_test_scaled_ns)
y_proba = neigh.predict_proba(X_test_scaled_ns)

# Test accuracy
acc = accuracy_score(y_true=y_test, y_pred=preds)
print(f"Accuracy: {acc:.2%}\n")

Accuracy: 71.64%



## Model 1.3: Multinomial Logistic Regression

In [16]:
# Init multinomial logreg model with an elastic-net penalty
# (manually tuned on val set).
# The 'saga' solver shuffles the data while optimizing, so a fixed
# random_state is needed for the fitted model (and the accuracy printed
# below) to be the same on every run.
logreg = LogisticRegression(penalty='elasticnet', multi_class='multinomial',
                            solver='saga', max_iter=500, l1_ratio=0.8,
                            random_state=42)
logreg.fit(X_train_scaled_ns, y_train)

# Generate preds on the test split
lr_preds = logreg.predict(X_test_scaled_ns)

# Get accuracy
acc = accuracy_score(y_test, lr_preds)
print(f"\nAccuracy: {acc:.2%}\n")


Accuracy: 78.88%





# Section 2: All Features (Including SIFT)

## Model 2.1: SVM

In [17]:
# Linear SVC without class weighting,
# fitted on the standardized combined (non-SIFT + SIFT) training features
lsvc = LinearSVC(
    C=0.0001,
    penalty='l2',
    dual=False,
    multi_class='ovr',
    class_weight=None,
    max_iter=10000,
).fit(X_train_scaled_all, y_train)

# Predict the test split
lsvc_preds = lsvc.predict(X_test_scaled_all)

# Test accuracy
acc = accuracy_score(y_true=y_test, y_pred=lsvc_preds)
print(f"Accuracy: {acc:.2%}\n")

Accuracy: 89.22%



In [18]:
# Per-class precision / recall / F1 for the predictions held in lsvc_preds
report = classification_report(y_test, lsvc_preds, target_names=class_names)
print(report)

              precision    recall  f1-score   support

      adidas       0.74      0.61      0.67        28
       apple       0.77      0.82      0.79        82
         bmw       0.91      0.89      0.90        81
    cocacola       0.89      0.82      0.85        71
         dhl       0.93      0.98      0.96        88
       fedex       0.89      0.91      0.90        81
    heineken       0.90      0.87      0.89        31
       pepsi       0.91      0.90      0.90       100
   starbucks       0.96      1.00      0.98        79
         ups       0.92      0.94      0.93        36

    accuracy                           0.89       677
   macro avg       0.88      0.87      0.88       677
weighted avg       0.89      0.89      0.89       677



## Model 2.2: Multinomial Logistic Regression

In [19]:
# L2-penalized multinomial logistic regression (manually tuned on val set),
# fitted on the standardized combined training features
logreg = LogisticRegression(
    C=0.001,
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=100,
).fit(X_train_scaled_all, y_train)

# Predict the test split
lr_preds = logreg.predict(X_test_scaled_all)

# Test accuracy
acc = accuracy_score(y_true=y_test, y_pred=lr_preds)
print(f"Accuracy: {acc:.2%}\n")

Accuracy: 88.63%



## Model 2.3: KNN

In [20]:
# Distance-weighted k-NN (n value tuned on val set),
# fitted on the standardized combined training features
neigh = KNeighborsClassifier(weights='distance', n_neighbors=8)
neigh = neigh.fit(X_train_scaled_all, y_train)

# Predicted labels and per-class probabilities for the test split
preds = neigh.predict(X_test_scaled_all)
y_proba = neigh.predict_proba(X_test_scaled_all)

# Test accuracy
acc = accuracy_score(y_true=y_test, y_pred=preds)
print(f"Accuracy: {acc:.2%}\n")

Accuracy: 52.44%



# Section 3: SIFT-only Modeling

In [21]:
# Linear SVC without class weighting,
# fitted on the standardized SIFT-only training features
lsvc = LinearSVC(
    C=0.0001,
    penalty='l2',
    dual=False,
    multi_class='ovr',
    class_weight=None,
    max_iter=5000,
).fit(X_train_scaled_sift, y_train)

# Predict the test split
lsvc_preds = lsvc.predict(X_test_scaled_sift)

# Test accuracy
acc = accuracy_score(y_true=y_test, y_pred=lsvc_preds)
print(f"Accuracy: {acc:.2%}\n")

Accuracy: 80.35%



In [22]:
# Per-class precision / recall / F1 for the predictions held in lsvc_preds
report = classification_report(y_test, lsvc_preds, target_names=class_names)
print(report)

              precision    recall  f1-score   support

      adidas       0.59      0.68      0.63        28
       apple       0.61      0.72      0.66        82
         bmw       0.88      0.83      0.85        81
    cocacola       0.82      0.70      0.76        71
         dhl       0.84      0.82      0.83        88
       fedex       0.92      0.86      0.89        81
    heineken       0.78      0.94      0.85        31
       pepsi       0.71      0.69      0.70       100
   starbucks       0.97      0.99      0.98        79
         ups       0.86      0.86      0.86        36

    accuracy                           0.80       677
   macro avg       0.80      0.81      0.80       677
weighted avg       0.81      0.80      0.81       677



In [23]:
# # Confusion Matrix (disabled: every line of this cell is commented out)
# cm = confusion_matrix(y_test, lsvc_preds)

# fig, ax = plt.subplots(1, 1, figsize=(10, 7))
# sns.heatmap(cm, annot=True, fmt='g', ax=ax);  #annot=True to annotate cells, fmt='g' to disable scientific notation
# ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
# ax.set_title('Confusion Matrix')
# ax.set_xticklabels(class_names, rotation=45, ha='right')
# ax.set_yticklabels(class_names, rotation=0, ha='right')
# plt.show()