In [None]:
#!pip install -U scikit-image

In [None]:
# Mount Drive to Access Data
# Mounts Google Drive at /content/drive; the Drive-backed paths configured in
# the "Define directories" cell (base_dir and everything derived from it) live
# under this mount point. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Authenticate to access cloud bucket
from google.colab import auth
auth.authenticate_user()

# create a LOCAL directory in /content/  so you can move stuff from bucket to local
!mkdir /content/w281FinalProjectLogo
!mkdir /content/w281FinalProjectLogo/Logos-32plus_v1.0.1
# copy from google bucket to local directory
!gsutil -m -q cp -r gs://w281finalprojectlogo/Logos-32plus_v1.0.1 /content/w281FinalProjectLogo

## 1. Imports and Paths

In [None]:
# Define directories
# Convention: every directory string ends with '/', so child components are
# appended WITHOUT a leading slash (avoids '...//john/' style double slashes).

# Google Drive locations (under the /content/drive mount point)
base_dir = '/content/drive/MyDrive/w281FinalProjectLogo/'
john_dir = base_dir + 'john/'
fe_dir = john_dir + 'feature_extraction/'
drive_save_dir = base_dir + 'Logos-32plus_v1.0.1/feature_extraction/'

# Local copy of the bucket contents: reading, writing to bucket
bucket = '/content/w281FinalProjectLogo/Logos-32plus_v1.0.1/'
preproc_dir = bucket + 'preprocessed/'
bucket_save_dir = bucket + 'feature_extraction/'
da_path = preproc_dir + 'da/'
bb_path = preproc_dir + 'bb/'
cn_path = preproc_dir + 'cn/'

In [None]:
# Playing with labeled image data
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import cv2
import json
from tqdm import tqdm
import skimage.feature as feat

sys.path.append(fe_dir)
from helper_functions import HarrisKeypointDetector, SimpleFeatureDescriptor, \
    ORB_SIFT_FeatureDescriptor, extract_color_moments, hu_moments

# Make sure the local feature-extraction output directory exists before any
# checkpoint is written to it. exist_ok=True makes re-running the cell a no-op
# and missing parents are created; any other failure (e.g. permissions) raises
# instead of being silently ignored.
os.makedirs(bucket_save_dir, exist_ok=True)

In [None]:
# Script mode flags (consumed by the ingest / feature-extraction cells):
#   new_df (with resume_fe off) -> start from an empty feature table
#   resume_fe                   -> reload the saved feature table and only
#                                  process records that are not in it yet
new_df, resume_fe = False, True

# Texture feature params passed to the gray-level co-occurrence matrix:
# pixel-pair distances, and eight angles k*pi/8 for k = 0..7
distances = [1, 2]
angles = [0] + [k * np.pi / 8 for k in range(1, 8)]

## 2. Ingest Split BBoxes
Goal: For each split, get a list of image paths that we can load and loop through later

In [None]:
def _load_preproc_map(map_file):
  """Load a preprocessing-map JSON file as a DataFrame with one row per image.

  The JSON is read with pandas and transposed; the resulting index is exposed
  as the `img_path` column, and an `image_name` column is added holding the
  last '/'-separated component of `image_source`.

  Args:
    map_file: path of the JSON map to read.

  Returns:
    A new DataFrame; nothing is modified in place.
  """
  mapping = pd.read_json(map_file).T.rename_axis('img_path').reset_index()
  mapping['image_name'] = mapping['image_source'].apply(lambda src: src.split('/')[-1])
  return mapping


# Only needed when features are going to be (re)computed in this run
if new_df or resume_fe:

  # Step 1: Get the map of base images to bb files w/ split info
  bb_map_file = preproc_dir + 'preproc_map_cn.json'
  bb_map = _load_preproc_map(bb_map_file)

  # Step 2: get bb image list for val and test images
  val_df = bb_map.loc[bb_map['set'] == 'val'].copy()
  test_df = bb_map.loc[bb_map['set'] == 'test'].copy()

  # Step 3: get da image list for train images
  da_map_file = preproc_dir + 'preproc_map_da.json'
  da_map = _load_preproc_map(da_map_file)
  train_df = da_map.loc[da_map['set'] == 'train'].copy()

  # Step 4: Combine the train, val, test dfs to extract features in same loop
  # (order is train, then val, then test; index renumbered from 0)
  loop_df = pd.concat([train_df, val_df, test_df], axis=0).reset_index(drop=True)

  # Get counts
  print(f"Val bb count: {len(val_df)}")
  print(f"Test bb count: {len(test_df)}")
  print(f"Train da_bb count: {len(train_df)}")
  print(f"Loop df rows: {len(loop_df)}")

Val bb count: 361
Test bb count: 434
Train da_bb count: 10789
Loop df rows: 11584


## 3. Extract Features
Plan: Loop through the combined train, val, and test image list and extract features for each image

In [None]:
def _load_saved_features():
  """Read the previously saved feature table from Drive.

  The file is written with DataFrame.to_pickle, so it is a pickle despite its
  '.csv' suffix.
  """
  return pd.read_pickle(drive_save_dir+'fe_subset_112722.csv')


if resume_fe:
  # Resume: reload the records extracted so far and keep only the rows of
  # loop_df whose img_path is not in the saved table yet
  features = _load_saved_features()
  already_done = loop_df['img_path'].isin(features['image_path'])
  loop_df = loop_df.loc[~already_done].copy()  # subset to unprocessed records
  loop_df = loop_df.reset_index(drop=True)
  print(f"Remaining records to process: {len(loop_df)}")
elif new_df:
  # Fresh start: empty df for extracted features, with typed metadata columns
  metadata_dtypes = {
    'image_path': 'str',
    'image_source': 'str',
    'bbox_source': 'str',
    'class': 'str',
    'class_code': 'int',
    'split': 'str',
  }
  features = pd.DataFrame(
    {column: pd.Series(dtype=kind) for column, kind in metadata_dtypes.items()}
  )
  print("Initialize new feature df")
else:
  # Neither flag set: just load the df that has already been fully processed
  features = _load_saved_features()
  print(f"Loaded complete feature df w/ {len(features)} rows")

Remaining records to process: 795


In [None]:
## Train Loop
# Extracts color-moment, Hu-moment and GLCM texture features for every row of
# loop_df, buffers them, and merges them into `features`. Progress is saved to
# both the Drive and the local bucket directory every 1k records and once more
# at the end.

def _append_records(frame, records):
  """Return `frame` with `records` (a list of row dicts) appended as new rows.

  All buffered rows are added with a single concat instead of one
  DataFrame.append call per row: DataFrame.append was deprecated in pandas 1.4
  and removed in 2.0, and re-copying the whole frame for every new row is
  quadratic in the number of rows.
  """
  if not records:
    return frame
  return pd.concat([frame, pd.DataFrame(records)], ignore_index=True)


skipped = 0
pending = []  # row dicts extracted since the last merge into `features`
if new_df or resume_fe:
  for idx, row in tqdm(loop_df.iterrows(), total=loop_df.shape[0]):
    try:
      # Step 1: load rgb images, create gray, hsv, ycrcb copies
      im_path = da_path if row['set']=='train' else cn_path  # train rows live under da/, val/test under cn/
      bbox_rgb = plt.imread(im_path + row['img_path']) # read img
      bbox_gray = cv2.cvtColor(bbox_rgb, cv2.COLOR_RGB2GRAY) # convert to gray
      bbox_hsv = cv2.cvtColor(bbox_rgb, cv2.COLOR_RGB2HSV) # convert to HSV
      bbox_ycrcb = cv2.cvtColor(bbox_rgb, cv2.COLOR_RGB2YCR_CB) # convert to YCRCB

      # Step 2: extract moment features
      cm_rgb_mean, cm_rgb_var, cm_rgb_skew = extract_color_moments(bbox_rgb)
      cm_hsv_mean, cm_hsv_var, cm_hsv_skew = extract_color_moments(bbox_hsv)
      cm_ycrcb_mean, cm_ycrcb_var, cm_ycrcb_skew = extract_color_moments(bbox_ycrcb)
      hu_mom = hu_moments(bbox_gray).reshape(-1)

      # Step 3: Get texture features (one flattened value per distance/angle pair)
      graycom = feat.graycomatrix(bbox_gray, distances, angles, levels=256)
      contrast = feat.graycoprops(graycom, 'contrast').reshape(-1)
      dissimilarity = feat.graycoprops(graycom, 'dissimilarity').reshape(-1)
      homogeneity = feat.graycoprops(graycom, 'homogeneity').reshape(-1)
      energy = feat.graycoprops(graycom, 'energy').reshape(-1)
      correlation = feat.graycoprops(graycom, 'correlation').reshape(-1)
      ASM = feat.graycoprops(graycom, 'ASM').reshape(-1)

      # Collect the new row
      row_features = {
          # info from preproc_map
          'image_path': row['img_path'],
          'image_source': row['image_name'],
          'bbox_source': row['bbox_source'],
          'class': row['class'],
          'class_code':  row['class_code'],
          'split': row['set'],
          # moment features
          'hu_moments': hu_mom,
          'cm_rgb_mean': cm_rgb_mean,
          'cm_rgb_var': cm_rgb_var,
          'cm_rgb_skew': cm_rgb_skew,
          'cm_hsv_mean': cm_hsv_mean,
          'cm_hsv_var': cm_hsv_var,
          'cm_hsv_skew': cm_hsv_skew,
          'cm_ycrcb_mean': cm_ycrcb_mean,
          'cm_ycrcb_var': cm_ycrcb_var,
          'cm_ycrcb_skew': cm_ycrcb_skew,
          # texture features
          'contrast': contrast,
          'dissimilarity': dissimilarity,
          'homogeneity': homogeneity,
          'energy': energy,
          'correlation': correlation,
          'ASM': ASM,
          }
      pending.append(row_features)

    except Exception as err:
      # Best-effort: skip a record that fails to load/process, but say which
      # one and why. `Exception` (not a bare except) keeps Ctrl-C / kernel
      # interrupt working.
      skipped += 1
      print(f"Error #{skipped}: {row.get('img_path')}: {err!r}")

    else:
      # Checkpoint outside the try so a failed save is reported as a save
      # failure rather than counted as a feature-extraction error.
      if ((idx+1)%1000==0): # save every 1k records out
        features = _append_records(features, pending)
        pending = []
        print(f"\nSaving df with {len(features)} records")
        features.to_pickle(drive_save_dir+'fe_subset_112722.csv')
        features.to_pickle(bucket_save_dir+'fe_subset_112722.csv')

# Save final df (merge whatever is still buffered first)
features = _append_records(features, pending)
pending = []
features.to_pickle(drive_save_dir+'fe_subset_112722.csv')
features.to_pickle(bucket_save_dir+'fe_subset_112722.csv')
print(f"Total FE errors: {skipped}")

100%|██████████| 795/795 [16:45<00:00,  1.26s/it]


Total FE errors: 0


## Merge in SIFT Histogram Features

In [None]:
def _merge_key(file_name):
  """Build the key used to join SIFT rows to feature rows.

  Drops the last four characters of the name (presumably a 3-letter file
  extension such as '.jpg' -- TODO confirm) and keeps the first six
  '_'-separated tokens, re-joined with '_'.
  """
  stem = file_name[:-4]
  return '_'.join(stem.split("_")[:6])


# Load the per-split SIFT tables and stack them into one frame
sift_train = pd.read_pickle(john_dir+'train_set.pkl')
sift_val = pd.read_pickle(john_dir+'val_set.pkl')
sift_test = pd.read_pickle(john_dir+'test_set.pkl')
sift = pd.concat([sift_train, sift_val, sift_test], axis=0)

# Derive the same join key on both sides
sift['merge_key'] = sift['file'].apply(_merge_key)
features['merge_key'] = features['image_path'].apply(_merge_key)

# Merge, printing the row count before and after so dropped/duplicated rows are visible.
# TODO(review): an inner join drops feature rows without a SIFT match -- should it be inner join?
print(len(features))
features = features.merge(sift[['merge_key', 'norm_hist']], on='merge_key', how='inner')
print(len(features))

# Rows that matched but carry no SIFT histogram
num_null = int(features['norm_hist'].isnull().sum())
print(f"Null SIFT feature count: {num_null}")

# Save out merged features (pickle format despite the .csv suffix)
features.to_pickle(drive_save_dir+'fe_merged_all_112722.csv')
#features.to_pickle(bucket_save_dir+'fe_merged_all_112722.csv')

11584
11584
Null SIFT feature count: 138


## Save changes back to storage bucket

In [None]:
# after done working, copy files back to google bucket
# Copies everything under the local /content/w281FinalProjectLogo/ directory to
# the root of the bucket (-m: parallel transfer, -q: quiet, -r: recursive).
!gsutil -m -q cp -r /content/w281FinalProjectLogo/* gs://w281finalprojectlogo/