# Clustering

This notebook contains functions to support clustering runway photos iteratively using elcorto's imagecluster library.

## Set-up, Install, Import

In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under it.
# force_remount=True is passed so the mount is redone even when one already exists.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [None]:
def doMemoryHack():
  """Grow a list without bound; this function never returns normally.

  The loop has no exit condition, so a call only ends when the process
  runs out of memory or is interrupted.
  """
  hog = list()
  while True:
    hog.append('1')

In [None]:
#get 25GB in Colab since I need it
# doMemoryHack()

In [None]:
def runInstalls():
  #! git clone https://github.com/elcorto/imagecluster "./drive/My Drive/project_files/imagecluster"
  ! pip3 uninstall tensorflow-gpu
  ! pip3 uninstall tensorflow
  ! pip3 install -e "./drive/My Drive/project_files/imagecluster"

In [None]:
runInstalls()

In [None]:
base_url = "/content/drive/My Drive/project_files/dataset/test_images_out"

In [None]:
import os
import sys
from pathlib import Path
from PIL import Image 
import shutil
import glob
sys.path.append(base_url)
from imagecluster import calc, io as icio, postproc
import pickle
import itertools
import math
import json

## Calculate Fingerprints from Images

Use elcorto's imagecluster library to calculate fingerprints from all images.

In [None]:
def calc_fingerprints(in_folder):
  """Recompute imagecluster fingerprints for every sub-directory of in_folder.

  Sub-directories are visited in sorted order. For each one, any existing
  imagecluster/fingerprints.pk is deleted, icio.get_image_data is run on the
  directory, and imagecluster/images.pk is removed afterwards because it is
  too big to keep.

  Plain files in in_folder are skipped: the cache files this notebook writes
  (big_dict.pk, filtered_dict.pk, cluster_dct.json, ...) live in the same
  folder and are not image directories.

  Args:
    in_folder: path of the folder that holds one sub-directory per image batch.

  Returns:
    None. The work is done through files written and removed on disk.
  """
  for folder in sorted(os.listdir(in_folder)):
    folder_path = os.path.join(in_folder, folder)
    if not os.path.isdir(folder_path):
      continue
    cache_dir = os.path.join(folder_path, "imagecluster")
    fingerprints_path = os.path.join(cache_dir, "fingerprints.pk")
    if os.path.exists(fingerprints_path):
      os.remove(fingerprints_path)
    # The return value is dropped on purpose so the image arrays can be freed
    # right away; only the files written under imagecluster/ are used later.
    icio.get_image_data(folder_path)
    # Remove the images.pk files since they are too big.
    # Guarded so a run that did not produce the file does not abort the loop.
    images_path = os.path.join(cache_dir, "images.pk")
    if os.path.exists(images_path):
      os.remove(images_path)

In [None]:
calc_fingerprints(base_url)

## Combine Fingerprints in One File
Put all fingerprint data in a single file, running PCA on each batch to reduce the number of features. The value 200 is roughly the number of components (if memory serves) that `n_components=0.9` returns when applied to the batch files individually.

In [None]:
def populate_big_dict(in_folder):
  """Merge the per-batch fingerprint pickles into one dictionary.

  Every entry of in_folder whose name contains "batch" is expected to hold
  imagecluster/fingerprints.pk. Each file is loaded, passed through calc.pca
  with n_components=200, and merged into the result.

  NOTE(review): calc.pca is called once per batch, so the 200 components of
  different batches may not share a basis -- confirm this is intended.

  Args:
    in_folder: folder containing the batch sub-directories.

  Returns:
    dict combining the PCA output of all batches; later batches overwrite
    earlier ones on duplicate keys.
  """
  big_pickle = dict()
  for folder in sorted(os.listdir(in_folder)):
    if "batch" not in folder:
      continue
    fingerprints_path = os.path.join(in_folder, folder, "imagecluster", "fingerprints.pk")
    # Context manager so the handle is closed before the next batch is opened.
    with open(fingerprints_path, "rb") as fh:
      new_dict = pickle.load(fh)
    big_pickle.update(calc.pca(new_dict, n_components=200))
  return big_pickle


In [None]:
# Load the combined fingerprint dict from its cache file; build and cache it on a miss.
big_dict = dict()
big_dict_path = os.path.join(base_url, "big_dict.pk")
try:
  with open(big_dict_path, "rb") as f:
    big_dict = pickle.load(f)
except Exception as err:
  # `except Exception` rather than a bare except: interrupting the load must
  # stop the cell, not kick off a full rebuild that overwrites the cache.
  print("big_dict cache not usable, rebuilding:", repr(err))
  big_dict = populate_big_dict(base_url)
  with open(big_dict_path, "wb") as f:
    pickle.dump(big_dict, f)

In [None]:
len(big_dict.keys())

## Cluster the Image Features Iteratively

This method is probably quite bad. I wanted to reduce the number of photos in a non-arbitrary way so the data would fit within the notebook's memory limit. To this end, I clustered the features multiple times, discarding images from very large and very small clusters, until all the remaining images could be clustered together. Parameters were chosen based on how cohesive the mid-size clusters looked.

Clustering was performed using elcorto's imagecluster library (which uses scipy's hierarchical clustering algorithm).

In [None]:
def cluster_for_noise(num_groups, sim_index, dict_object, min_c, min_filter, max_filter):
  """Cluster dict_object in consecutive chunks and collect the out-of-range clusters.

  The items of dict_object are split into chunks of ceil(len / num_groups)
  entries. Each chunk is clustered with calc.cluster; every cluster whose key,
  read as an int, is below min_filter or above max_filter is added to the result.

  Args:
    num_groups: number of chunks to split the input into.
    sim_index: passed to calc.cluster as `sim`.
    dict_object: mapping to cluster (presumably image name -> fingerprint).
    min_c: passed to calc.cluster as `min_csize`.
    min_filter: smallest cluster key that is kept.
    max_filter: largest cluster key that is kept.

  Returns:
    list of rejected clusters, one element per cluster (not flattened).
  """
  total = len(dict_object)
  chunk_size = math.ceil(total / num_groups)
  rejected = []
  start = 0
  while start < total:
    chunk = dict(itertools.islice(dict_object.items(), start, start + chunk_size))
    grouped = calc.cluster(chunk, sim=sim_index, alpha=0, min_csize=min_c)
    # progress: index of the chunk that was just clustered
    print(start/chunk_size)
    for size_key, members in grouped.items():
      if not (min_filter <= int(size_key) <= max_filter):
        rejected.extend(members)
    start += chunk_size
  return rejected

In [None]:
#first round of clustering and discarding
filtered_dict = {}
filtered_dict_path = os.path.join(base_url, "filtered_dict.pk")
try:
  with open(filtered_dict_path, "rb") as f:
    filtered_dict = pickle.load(f)
except Exception as err:
  # `except Exception` rather than a bare except, so an interrupt stops the
  # cell instead of starting the expensive recomputation below.
  print("filtered_dict cache not usable, recomputing:", repr(err))
  #the num groups argument is usually chosen to give max ~50,000 images per group
  #which is about what the memory can handle given num features
  first_noise = cluster_for_noise(num_groups=6, sim_index=0.75, dict_object=big_dict,
                                  min_c=1, min_filter=4, max_filter=500)
  # cluster_for_noise returns one list per cluster; flatten into a set so the
  # membership test below is O(1) per image instead of a scan of the whole list.
  first_noise = {item for sublist in first_noise for item in sublist}
  print(len(first_noise))
  filtered_dict = {k: v for k, v in big_dict.items() if k not in first_noise}
  print(len(filtered_dict))
  with open(filtered_dict_path, "wb") as f:
    pickle.dump(filtered_dict, f)

In [None]:
#second round of clustering and discarding
double_filtered = {}
double_filtered_path = os.path.join(base_url, "double_filtered_dict.pk")
try:
  with open(double_filtered_path, "rb") as f:
    double_filtered = pickle.load(f)
except Exception as err:
  # `except Exception` rather than a bare except, so an interrupt stops the
  # cell instead of starting the expensive recomputation below.
  print("double_filtered cache not usable, recomputing:", repr(err))
  second_noise = cluster_for_noise(num_groups=2, sim_index=0.65, dict_object=filtered_dict,
                                   min_c=1, min_filter=4, max_filter=1000)
  # Flatten the list of clusters into a set for O(1) membership tests.
  second_noise = {item for sublist in second_noise for item in sublist}
  double_filtered = {k: v for k, v in filtered_dict.items() if k not in second_noise}
  with open(double_filtered_path, "wb") as f:
    pickle.dump(double_filtered, f)

In [None]:
#final clustering
# cluster_dct maps a cluster id to the list of images in that cluster.
cluster_dct = {}
cluster_dct_path = os.path.join(base_url, 'cluster_dct.json')
try:
  with open(cluster_dct_path, 'r') as f:
    cluster_dct = json.load(f)
except (OSError, ValueError) as err:
  # OSError: cache file missing/unreadable; ValueError: not valid JSON.
  print("cluster_dct cache not usable, recomputing:", repr(err))
  clusters = calc.cluster(double_filtered, sim=0.65, alpha=0, min_csize=10)
  all_clusters = (cluster for cluster_group in clusters for cluster in clusters[cluster_group])
  # Ids are stored as strings: JSON turns keys into strings on save, and
  # good_clusters further down lists ids as strings, so a freshly computed
  # dict must look the same as one loaded from the cache.
  cluster_dct = {str(idx): cluster for idx, cluster in enumerate(all_clusters)}
  with open(cluster_dct_path, 'w') as f:
    json.dump(cluster_dct, f)

In [None]:
#show 10 pictures each of the remaining clusters that have more
#than 14 pics total in each. I picked the groups that looked
#most cohesive here
import cv2
from google.colab.patches import cv2_imshow

for c in cluster_dct.keys():
  if len(cluster_dct[c]) > 14:
    try:
      print(c)
      print(len(cluster_dct[c]))
      # First ten images of the cluster, laid out as two rows of five.
      tiles = [cv2.imread(path) for path in cluster_dct[c][:10]]
      row_1 = cv2.hconcat(tiles[:5])
      row_2 = cv2.hconcat(tiles[5:])
      concat = cv2.vconcat((row_1, row_2))
      scale_percent = 50 # percent of original size
      width = int(concat.shape[1] * scale_percent / 100)
      height = int(concat.shape[0] * scale_percent / 100)
      shrunk = cv2.resize(concat, (width, height))
      # drop the full-size intermediates before displaying
      tiles = row_1 = row_2 = concat = None
      cv2_imshow(shrunk)
    except Exception as err:
      # Keep going with the next cluster, but say why this one failed
      # (e.g. an unreadable image or tiles of different sizes).
      print("ERROR", repr(err))


## Light Analysis
Let's just have a little peek at the data

In [None]:
import pandas as pd

# One record per clustered image. The image name is split on "~": the last
# four characters of the first part are taken as the year and the second part
# as the designer.
array_for_pandas = []
for cluster, images in cluster_dct.items():
  for image in images:
    split_name = image.split("~")
    array_for_pandas.append({
        "name": image,
        "year": split_name[0][-4:],
        "designer": split_name[1],
        "cluster": cluster,
    })

clothes_df = pd.DataFrame(data=array_for_pandas)

In [None]:
clothes_df.head()

In [None]:
clothes_df.shape

In [None]:
# Keep only the rows whose year string is numeric.
# (About 7 images in the original batch had a non-valid year.)
clothes_df = clothes_df[clothes_df["year"].str.isnumeric()]

In [None]:
clothes_df.shape

In [None]:
#most frequently recurring designers
clothes_df.groupby("designer").count().sort_values("name", ascending=False).head(10)

In [None]:
# Cast year to an integer so it can be compared numerically; this raises if a
# non-numeric year is still present.
clothes_df = clothes_df.astype({'year': 'int32'})
# Keep 1989-2014 inclusive. Both conditions go into one mask: chaining two []
# filters applied a mask built on the full frame to the already-filtered one.
just_years = clothes_df[(clothes_df.year > 1988) & (clothes_df.year < 2015)]

In [None]:
#most cohesive (to me) looking clusters
good_clusters = ["310","985","26","55","999","1009","1134","1194","808","840","511","112","350","373","481","483","923","892","531","535","549","43","49","50","74","75","76","130","277","289","295","297","316","405","425","430","435","494","501","787","886","901","943","968","993","1021","1025", "394", "397", "410"]

In [None]:
just_good_clusters = just_years[just_years["cluster"].isin(good_clusters)]

In [None]:
#split out the photo id so we're not saving out to folders
# .assign builds a new frame instead of writing a column into a filtered slice,
# which is what triggers pandas' SettingWithCopyWarning. The new column is
# still appended last, so the existing column order is unchanged.
just_good_clusters = just_good_clusters.assign(
    file_name=just_good_clusters["name"].str.split("~", n=-1, expand=True)[2]
)

In [None]:
#export cluster data to a csv
just_good_clusters[["year", "designer", "cluster", "file_name"]].to_csv("/content/drive/MyDrive/project_files/data.csv", index=False)

## Export Pics
Replace the yellow background with a transparent one. Export two versions of each picture: one very small, one slightly bigger for the timeline graphic.

Background colour removal once again lifted from https://stackoverflow.com/questions/58754961/how-to-remove-the-object-marked-by-the-biggest-contour-from-an-image-and-save-it

In [None]:
import numpy as np
import cv2
from google.colab.patches import cv2_imshow

# yellow color boundaries [B, G, R, A]; alpha is pinned to 255 because the
# alpha channel added below is fully opaque everywhere.
lower = np.array([0, 210, 210, 255], dtype="uint8")
upper = np.array([110, 255, 255, 255], dtype="uint8")

out_root = "/content/drive/MyDrive/project_files"
out_sm_dir = os.path.join(out_root, "out_sm")
out_md_dir = os.path.join(out_root, "out_md")
# cv2.imwrite does not create missing folders, so make sure they exist.
os.makedirs(out_sm_dir, exist_ok=True)
os.makedirs(out_md_dir, exist_ok=True)


def _save_scaled(img, scale_percent, out_dir, file_name):
  """Resize img to scale_percent of its size and write it to out_dir/file_name."""
  width = max(1, int(img.shape[1] * scale_percent / 100))
  height = max(1, int(img.shape[0] * scale_percent / 100))
  resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
  out_path = os.path.join(out_dir, file_name)
  if not cv2.imwrite(out_path, resized):
    print("could not write", out_path)


# Column is addressed by label; positional row[1][0] lookups on a labelled
# Series are deprecated in recent pandas.
for name in just_good_clusters["name"]:
  yellow_img = cv2.imread(name)
  if yellow_img is None:
    # imread signals an unreadable/missing file by returning None
    print("could not read", name)
    continue
  b_channel, g_channel, r_channel = cv2.split(yellow_img)
  alpha_channel = np.full_like(yellow_img[..., 0], 255)
  img_BGRA = cv2.merge((b_channel, g_channel, r_channel, alpha_channel))
  # find the colors within the specified boundaries
  mask = cv2.inRange(img_BGRA, lower, upper)
  # make every pixel in the yellow range white and fully transparent
  img_BGRA[mask > 0] = (255, 255, 255, 0)

  # Output name: the part after the last "~" of the file name, with its
  # extension (whatever its length) replaced by .png.
  im_name = name.split("/")[-1]
  png_name = os.path.splitext(im_name.split("~")[-1])[0] + ".png"

  _save_scaled(img_BGRA, 7, out_sm_dir, png_name)   # thumbnail, 7% of original size
  _save_scaled(img_BGRA, 55, out_md_dir, png_name)  # medium, 55% of original size


## Normalize Data
Export the number of pictures per year so results can be expressed as a percentage of each year's images, which accounts for years with very few pics.

In [None]:
# One row per image in the full (unfiltered) fingerprint set. Only the year is
# needed here, so the name is not required to contain a designer part.
# NOTE(review): years are not validated in this cell, so any non-numeric
# "year" strings will show up as their own groups in the export -- confirm
# whether they should be dropped first.
big_dict_array_for_pandas = [
    {"name": image, "year": image.split("~")[0][-4:]}
    for image in big_dict
]

all_clothes_df = pd.DataFrame(data=big_dict_array_for_pandas)

In [None]:
all_clothes_df.groupby("year", as_index = False).count().to_csv("/content/drive/MyDrive/project_files/fashion_years.csv", index=False)