# 1. Install and import libraries

In [34]:
import pandas as pd
from pathlib import Path
import re
import datetime
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from sklearn.cluster import MeanShift

from PIL import Image
from PIL import ImageFile
import matplotlib.pyplot as plt
%matplotlib inline
import random
import statistics

# 2. Read multiple rgba csv to form dataframe

In [2]:
#set instance of dataframe using rgba_0.csv
rgba = pd.read_csv('../data/rgba/rgba_0.csv')
rgba = rgba.T
image_list = list(rgba.index)
rgba_array = rgba.to_numpy()

rgba_df = pd.DataFrame(zip(image_list, rgba_array), columns = ['image_list', 'rgba_list'])
rgba_df

Unnamed: 0,image_list,rgba_list
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
195,../pictures/wo_background/ec24m/ec24m_2021-12-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
196,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
197,../pictures/wo_background/ec24m/ec24m_2021-08-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
198,../pictures/wo_background/ec24m/ec24m_2021-09-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [4]:
%%time
#loop through all 33 csvs to form complete dataframe of rgba for all 6000+ photos
for n in range(34):
    rgba = pd.read_csv('../data/rgba/rgba_'+str(n)+'.csv')
    rgba = rgba.T
    image_path = list(rgba.index)
    rgba_array = rgba.to_numpy()
    df = pd.DataFrame(zip(image_path, rgba_array), columns = ['image_list', 'rgba_list'])
    rgba_df = pd.concat([rgba_df,df])

CPU times: user 31.6 s, sys: 4.94 s, total: 36.5 s
Wall time: 37.6 s


In [5]:
#reset index of rgba_df
rgba_df.reset_index(drop=True, inplace = True)
rgba_df

Unnamed: 0,image_list,rgba_list
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
6834,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6835,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6836,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6837,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# 3. Add columns of information to rgba_df

In [6]:
#information such as instagram handle, date of post, and sequence were lost in the rgba csv
#hence need to add these back into the datafram

image_list = rgba_df['image_list']
instagram_handle = []
date_post = []
seq_post_samedate = []

for image in image_list:
    handle = re.search('wo_background/(.*)/', image).group(1)
    instagram_handle.append(handle)
    date_post.append(image[-16:-6])
    seq_post_samedate.append(image[-5:-4])

In [7]:
rgba_df['instagram_handle'] = instagram_handle
rgba_df['date_post'] = date_post
rgba_df['seq_post_samedate'] = seq_post_samedate

In [8]:
#delete corrupted posts that contains .DS
rgba_df = rgba_df[~rgba_df.date_post.str.contains('.DS')]
rgba_df.shape

(6837, 5)

In [9]:
rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate']\
.map({'C':'0','c':'0','0':'10','1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9'})
rgba_df['seq_post_samedate'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate']\


1     1933
0     1805
2     1680
3      777
4      320
5      125
6       78
7       43
8       34
9       30
10      12
Name: seq_post_samedate, dtype: int64

In [10]:
rgba_df['date_post'] = pd.to_datetime(rgba_df['date_post'])
rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate'].astype(str).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['date_post'] = pd.to_datetime(rgba_df['date_post'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate'].astype(str).astype(int)


In [11]:
#extract month
def month_extract(x):
    month = x.month
    return month

In [12]:
rgba_df['month'] = rgba_df['date_post'].apply(month_extract)
rgba_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['month'] = rgba_df['date_post'].apply(month_extract)


Unnamed: 0,image_list,rgba_list,instagram_handle,date_post,seq_post_samedate,month
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-01-31,0,1
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-11-09,2,11
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-03-03,0,3
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-06-29,1,6
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-02-03,0,2
...,...,...,...,...,...,...
6834,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-01-15,4,1
6835,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-03-03,0,3
6836,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-02-03,0,2
6837,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-08-18,0,8


# 4. Apply mean-shift clustering

In [40]:
#function for mean shift clustering
#returns the center of the clusters
def meanshift_clustering(x):
    x = x.reshape(-1, 4)
    x = np.array([f for f in x if f[3] >200], np.uint8)
    clt = MeanShift(bin_seeding = True, cluster_all = False).fit(x)
    centers = clt.cluster_centers_
    centers = np.uint8(centers)
    centers_ravel = centers.ravel()
    return centers_ravel

In [15]:
%%time
rgba_df['meanshift_centers'] = rgba_df['rgba_list'].apply(meanshift_clustering)

CPU times: user 13h 13min 31s, sys: 22min 44s, total: 13h 36min 16s
Wall time: 11h 22min 53s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
rgba_df.head()

Unnamed: 0,image_list,rgba_list,instagram_handle,date_post,seq_post_samedate,month,meanshift_centers
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-01-31,0,1,"[188, 145, 148, 253]"
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-11-09,2,11,"[152, 138, 130, 253, 43, 43, 47, 253]"
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-03-03,0,3,"[231, 219, 204, 253, 61, 44, 38, 253]"
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-06-29,1,6,"[207, 51, 23, 252, 156, 113, 97, 253, 61, 26, ..."
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-02-03,0,2,"[208, 173, 152, 251, 78, 38, 31, 249]"


In [47]:
#define function to calculate number of clusters for each photo
def number_clusters(x):
    return len(x)/4

In [48]:
rgba_df['meanshift_number_clusters'] = rgba_df['meanshift_centers'].apply(number_clusters)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['meanshift_number_clusters'] = rgba_df['meanshift_centers'].apply(number_clusters)


In [49]:
rgba_df['meanshift_number_clusters'].value_counts()

2.0     4305
3.0     1364
1.0      953
4.0      143
5.0       24
6.0       12
7.0       10
8.0        9
14.0       3
12.0       3
9.0        3
20.0       2
13.0       1
34.0       1
11.0       1
16.0       1
10.0       1
15.0       1
Name: meanshift_number_clusters, dtype: int64

Most photos (>80% of the photos) have 2-3 clusters after mean-shift clustering. There are some outliers (~1% of photos) that have >4 clusters, which is likely to have been clustered wrongly as it is unlikely for a photo (with background removed) to have so many color themes going on. We should remove these data points

In [35]:
#filter outliers that have >4 clusters
rgba_df_filtered = rgba_df.loc[rgba_df['meanshift_number_clusters']<5]
rgba_df_filtered

Unnamed: 0,image_list,rgba_list,instagram_handle,date_post,seq_post_samedate,month,meanshift_centers,meanshift_number_clusters
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-01-31,0,1,"[188, 145, 148, 253]",1.0
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-11-09,2,11,"[152, 138, 130, 253, 43, 43, 47, 253]",2.0
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-03-03,0,3,"[231, 219, 204, 253, 61, 44, 38, 253]",2.0
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-06-29,1,6,"[207, 51, 23, 252, 156, 113, 97, 253, 61, 26, ...",3.0
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-02-03,0,2,"[208, 173, 152, 251, 78, 38, 31, 249]",2.0
...,...,...,...,...,...,...,...,...
6834,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-01-15,4,1,"[153, 129, 124, 239, 37, 26, 25, 241]",2.0
6835,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-03-03,0,3,"[174, 159, 159, 253, 34, 24, 21, 253]",2.0
6836,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-02-03,0,2,"[186, 45, 33, 251, 198, 156, 134, 248]",2.0
6837,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-08-18,0,8,"[182, 141, 127, 254, 207, 189, 190, 253, 38, 3...",3.0


In [36]:
#save the dataframe to csv
rgba_df_filtered.to_csv('../data/meanshiftcluster_by_photo.csv',index = False)

In [37]:
#sample 100 photos to get silhouette scores from
#silhouette scores take long to calculate and hence the sampling
#to make it fair, I have used 100 random numbers indicating row number in rgba_df_filtered
random.seed(5)
list_sample100 = random.sample(range(len(rgba_df)), 100)

In [41]:
def meanshift_score(x):
    x = x.reshape(-1, 4)
    x = np.array([f for f in x if f[3] >200], np.uint8)
    clt = MeanShift(bin_seeding = True, cluster_all = False).fit(x)
    score = silhouette_score(x, clt.labels_, metric='euclidean')
    return score

In [42]:
%%time
meanshift_samplescores = []
image_list = []
for n in list_sample100:
    score = meanshift_score(rgba_df_filtered.iloc[n]['rgba_list'])
    image_path = rgba_df.iloc[n]['image_list']
    meanshift_samplescores.append(score)
    image_list.append(image_path)

CPU times: user 19min 10s, sys: 3min 30s, total: 22min 40s
Wall time: 15min 34s


In [43]:
statistics.median(meanshift_samplescores)

0.5076052144436581

In [44]:
statistics.mean(meanshift_samplescores)

0.5000211303013818

In [45]:
meanshift_sample100_score_df = pd.DataFrame(zip(image_list, meanshift_samplescores), columns = ['image_list', 'meanshift_samplescores'])
meanshift_sample100_score_df

Unnamed: 0,image_list,meanshift_samplescores
0,../pictures/wo_background/mirchelley/mirchelle...,0.414861
1,../pictures/wo_background/kimlimhl/kimlimhl_20...,0.499510
2,../pictures/wo_background/ohhowstrange/ohhowst...,0.345161
3,../pictures/wo_background/mongabong/mongabong_...,0.408515
4,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.555531
...,...,...
95,../pictures/wo_background/ohhowstrange/ohhowst...,0.574395
96,../pictures/wo_background/mongabong/mongabong_...,0.467553
97,../pictures/wo_background/aureliang_/aureliang...,0.566081
98,../pictures/wo_background/ec24m/ec24m_2021-03-...,0.547017


In [46]:
meanshift_sample100_score_df.to_csv('../data/meanshift_sample100_score.csv', index = False)