# 1. Install and import libraries

In [2]:
import pandas as pd
from pathlib import Path
import re
import datetime
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from PIL import Image
from PIL import ImageFile
import matplotlib.pyplot as plt
%matplotlib inline
import random
import statistics

# 2. Read multiple rgba csv to form dataframe

In [3]:
#set instance of dataframe using rgba_0.csv
rgba = pd.read_csv('../data/rgba/rgba_0.csv')
rgba = rgba.T
image_list = list(rgba.index)
rgba_array = rgba.to_numpy()

rgba_df = pd.DataFrame(zip(image_list, rgba_array), columns = ['image_list', 'rgba_list'])
rgba_df

Unnamed: 0,image_list,rgba_list
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
195,../pictures/wo_background/ec24m/ec24m_2021-12-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
196,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
197,../pictures/wo_background/ec24m/ec24m_2021-08-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
198,../pictures/wo_background/ec24m/ec24m_2021-09-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [4]:
%%time
#loop through all 33 csvs to form complete dataframe of rgba for all 6000+ photos
for n in range(34):
    rgba = pd.read_csv('../data/rgba/rgba_'+str(n)+'.csv')
    rgba = rgba.T
    image_path = list(rgba.index)
    rgba_array = rgba.to_numpy()
    df = pd.DataFrame(zip(image_path, rgba_array), columns = ['image_list', 'rgba_list'])
    rgba_df = pd.concat([rgba_df,df])

CPU times: user 32.8 s, sys: 5.04 s, total: 37.8 s
Wall time: 38.8 s


In [5]:
#reset index of rgba_df
rgba_df.reset_index(drop=True, inplace = True)
rgba_df

Unnamed: 0,image_list,rgba_list
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
6834,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6835,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6836,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6837,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# 3. Add columns of information to rgba_df

In [6]:
#information such as instagram handle, date of post, and sequence were lost in the rgba csv
#hence need to add these back into the datafram

image_list = rgba_df['image_list']
instagram_handle = []
date_post = []
seq_post_samedate = []

for image in image_list:
    handle = re.search('wo_background/(.*)/', image).group(1)
    instagram_handle.append(handle)
    date_post.append(image[-16:-6])
    seq_post_samedate.append(image[-5:-4])

In [7]:
rgba_df['instagram_handle'] = instagram_handle
rgba_df['date_post'] = date_post
rgba_df['seq_post_samedate'] = seq_post_samedate

In [8]:
#delete corrupted posts that contains .DS
rgba_df = rgba_df[~rgba_df.date_post.str.contains('.DS')]
rgba_df.shape

(6837, 5)

In [9]:
rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate']\
.map({'C':'0','c':'0','0':'10','1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9'})
rgba_df['seq_post_samedate'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate']\


1     1933
0     1805
2     1680
3      777
4      320
5      125
6       78
7       43
8       34
9       30
10      12
Name: seq_post_samedate, dtype: int64

In [10]:
rgba_df['date_post'] = pd.to_datetime(rgba_df['date_post'])
rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate'].astype(str).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['date_post'] = pd.to_datetime(rgba_df['date_post'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate'].astype(str).astype(int)


In [11]:
#extract month
def month_extract(x):
    month = x.month
    return month

In [12]:
rgba_df['month'] = rgba_df['date_post'].apply(month_extract)
rgba_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rgba_df['month'] = rgba_df['date_post'].apply(month_extract)


Unnamed: 0,image_list,rgba_list,instagram_handle,date_post,seq_post_samedate,month
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-01-31,0,1
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-11-09,2,11
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-03-03,0,3
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-06-29,1,6
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-02-03,0,2
...,...,...,...,...,...,...
6834,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-01-15,4,1
6835,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-03-03,0,3
6836,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-02-03,0,2
6837,../pictures/wo_background/xinlinnn/xinlinnn_20...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",xinlinnn,2021-08-18,0,8


# 4. Apply k-means clustering for each photo

In [12]:
#function for k-means clustering where k=2
#returns the center of the clusters
def cluster2(x):
    x = x.reshape(-1, 4)
    x = np.array([f for f in x if f[3] >200], np.uint8)
    clt = KMeans(n_clusters = 2, max_iter=500, random_state=5).fit(x)
    centers = clt.cluster_centers_
    centers = np.uint8(centers)
    centers_ravel = centers.ravel()
    return centers_ravel

In [13]:
#function for k-means clustering where k=3
#returns the center of the clusters
def cluster3(x):
    x = x.reshape(-1, 4)
    x = np.array([f for f in x if f[3] >200], np.uint8)
    clt = KMeans(n_clusters = 3, max_iter=500, random_state=5).fit(x)
    centers = clt.cluster_centers_
    centers = np.uint8(centers)
    centers_ravel = centers.ravel()
    return centers_ravel

In [14]:
#function for k-means clustering where k=4
#returns the center of the clusters
def cluster4(x):
    x = x.reshape(-1, 4)
    x = np.array([f for f in x if f[3] >200], np.uint8)
    clt = KMeans(n_clusters = 4, max_iter=500, random_state=5).fit(x)
    centers = clt.cluster_centers_
    centers = np.uint8(centers)
    centers_ravel = centers.ravel()
    return centers_ravel

Tried with 5 clusters and some photos could not be processed, an indication that 5 clusters is probably too many for each photo. Just a quick eyeball scan on the photos also reveals that most photos do not have that many color themes going on, hence stopping at 4 clusters seem appropriate

In [16]:
%%time
#creating a column to house the centers of the 2 clusters
rgba_df['rgba_centers_2'] = rgba_df['rgba_list'].apply(cluster2)

CPU times: user 33min 4s, sys: 9min 4s, total: 42min 9s
Wall time: 7min


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [17]:
%%time
#creating a column to house the centers of the 3 clusters
rgba_df['rgba_centers_3'] = rgba_df['rgba_list'].apply(cluster3)

CPU times: user 50min 17s, sys: 13min 34s, total: 1h 3min 52s
Wall time: 10min 25s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [18]:
%%time
#creating a column to house the centers of the 4 clusters
rgba_df['rgba_centers_4'] = rgba_df['rgba_list'].apply(cluster4)

CPU times: user 1h 7min 25s, sys: 18min 6s, total: 1h 25min 32s
Wall time: 13min 51s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
#check that the new columns were added to the dataframe and input looks ok
rgba_df.head()

Unnamed: 0,image_list,rgba_list,instagram_handle,date_post,seq_post_samedate,month,rgba_centers_2,rgba_centers_3,rgba_centers_4
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-01-31,0,1,"[152, 96, 96, 252, 193, 153, 158, 253]","[124, 75, 73, 250, 174, 119, 122, 253, 198, 16...","[164, 99, 99, 253, 179, 133, 138, 253, 102, 64..."
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-11-09,2,11,"[145, 132, 123, 253, 50, 48, 51, 251]","[152, 138, 129, 253, 45, 45, 48, 251, 115, 100...","[154, 141, 132, 253, 44, 44, 48, 251, 103, 89,..."
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-03-03,0,3,"[223, 208, 191, 253, 91, 72, 68, 253]","[75, 57, 58, 253, 193, 167, 141, 252, 238, 233...","[240, 237, 229, 253, 127, 108, 102, 253, 55, 3..."
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-06-29,1,6,"[184, 84, 61, 253, 64, 25, 20, 253]","[61, 25, 20, 253, 210, 51, 22, 252, 156, 114, ...","[38, 22, 19, 253, 219, 56, 23, 252, 159, 118, ..."
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ec24m,2021-02-03,0,2,"[133, 56, 51, 250, 208, 172, 150, 251]","[67, 44, 42, 249, 208, 179, 161, 251, 194, 86,...","[203, 46, 48, 252, 214, 188, 172, 252, 186, 13..."


In [59]:
#save the dataframe to csv
rgba_df.to_csv('../data/cluster_by_photo.csv',index = False)

In [31]:
#sample 100 photos to get silhouette scores from
#silhouette scores take long to calculate and hence the sampling
#to make it fair, I have used 100 random numbers indicating row number in rgba_df
random.seed(5)
list_sample100 = random.sample(range(len(rgba_df)), 100)

In [13]:
def cluster2_score(x_array):
    x = x_array.reshape(-1, 4)
    x = np.array([f for f in x if f[3] !=0], np.uint8)
    clt = KMeans(n_clusters = 2, max_iter=500, random_state=5).fit(x)
    score = silhouette_score(x, clt.labels_, metric='euclidean')
    return score

In [14]:
def cluster3_score(x_array):
    x = x_array.reshape(-1, 4)
    x = np.array([f for f in x if f[3] !=0], np.uint8)
    clt = KMeans(n_clusters = 3, max_iter=500, random_state=5).fit(x)
    score = silhouette_score(x, clt.labels_, metric='euclidean')
    return score

In [51]:
def cluster4_score(x_array):
    x = x_array.reshape(-1, 4)
    x = np.array([f for f in x if f[3] !=0], np.uint8)
    clt = KMeans(n_clusters = 4, max_iter=500, random_state=5).fit(x)
    score = silhouette_score(x, clt.labels_, metric='euclidean')
    return score

In [43]:
%%time
cluster2_samplescores = []
image_list = []
for n in list_sample100:
    score = cluster2_score(rgba_df.iloc[n]['rgba_list'])
    image_path = rgba_df.iloc[n]['image_list']
    cluster2_samplescores.append(score)
    image_list.append(image_path)

CPU times: user 9min 2s, sys: 4min 45s, total: 13min 48s
Wall time: 3min 51s


In [44]:
statistics.median(cluster2_samplescores)

0.6181558797572099

In [45]:
statistics.mean(cluster2_samplescores)

0.6209313356957802

In [46]:
cluster2score_df = pd.DataFrame(zip(image_list, cluster2_samplescores), columns = ['image_list', 'cluster2_samplescores'])
cluster2score_df

Unnamed: 0,image_list,cluster2_samplescores
0,../pictures/wo_background/mirchelley/mirchelle...,0.538705
1,../pictures/wo_background/kimlimhl/kimlimhl_20...,0.593360
2,../pictures/wo_background/ohhowstrange/ohhowst...,0.662018
3,../pictures/wo_background/mongabong/mongabong_...,0.484139
4,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.672311
...,...,...
95,../pictures/wo_background/ohhowstrange/ohhowst...,0.679931
96,../pictures/wo_background/mongabong/mongabong_...,0.672868
97,../pictures/wo_background/aureliang_/aureliang...,0.590585
98,../pictures/wo_background/ec24m/ec24m_2021-03-...,0.645308


In [47]:
%%time
cluster3_samplescores = []
image_list = []
for n in list_sample100:
    score = cluster3_score(rgba_df.iloc[n]['rgba_list'])
    image_path = rgba_df.iloc[n]['image_list']
    cluster3_samplescores.append(score)
    image_list.append(image_path)

CPU times: user 9min 1s, sys: 4min 37s, total: 13min 38s
Wall time: 3min 52s


In [48]:
statistics.median(cluster3_samplescores)

0.6174883036078588

In [49]:
statistics.mean(cluster3_samplescores)

0.6189258527385727

In [50]:
cluster3score_df = pd.DataFrame(zip(image_list, cluster3_samplescores), columns = ['image_list', 'cluster3_samplescores'])
cluster3score_df

Unnamed: 0,image_list,cluster3_samplescores
0,../pictures/wo_background/mirchelley/mirchelle...,0.573004
1,../pictures/wo_background/kimlimhl/kimlimhl_20...,0.565264
2,../pictures/wo_background/ohhowstrange/ohhowst...,0.628560
3,../pictures/wo_background/mongabong/mongabong_...,0.522076
4,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.701073
...,...,...
95,../pictures/wo_background/ohhowstrange/ohhowst...,0.633929
96,../pictures/wo_background/mongabong/mongabong_...,0.708039
97,../pictures/wo_background/aureliang_/aureliang...,0.593973
98,../pictures/wo_background/ec24m/ec24m_2021-03-...,0.628080


In [52]:
%%time
cluster4_samplescores = []
image_list = []
for n in list_sample100:
    score = cluster4_score(rgba_df.iloc[n]['rgba_list'])
    image_path = rgba_df.iloc[n]['image_list']
    cluster4_samplescores.append(score)
    image_list.append(image_path)

CPU times: user 9min 3s, sys: 4min 37s, total: 13min 40s
Wall time: 3min 56s


In [53]:
statistics.median(cluster4_samplescores)

0.5551220634248124

In [54]:
statistics.mean(cluster4_samplescores)

0.5704408248071771

In [55]:
cluster4score_df = pd.DataFrame(zip(image_list, cluster4_samplescores), columns = ['image_list', 'cluster4_samplescores'])
cluster4score_df

Unnamed: 0,image_list,cluster4_samplescores
0,../pictures/wo_background/mirchelley/mirchelle...,0.458616
1,../pictures/wo_background/kimlimhl/kimlimhl_20...,0.558163
2,../pictures/wo_background/ohhowstrange/ohhowst...,0.527730
3,../pictures/wo_background/mongabong/mongabong_...,0.513245
4,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.637278
...,...,...
95,../pictures/wo_background/ohhowstrange/ohhowst...,0.615704
96,../pictures/wo_background/mongabong/mongabong_...,0.636422
97,../pictures/wo_background/aureliang_/aureliang...,0.523820
98,../pictures/wo_background/ec24m/ec24m_2021-03-...,0.632259


The sample mean and median silhouette scores for 2 & 3 clusters were quite close at around 0.62. The scores dipped to 0.55 (mean) and 0.57 (median) when number of clusters increased to 4-- not that 4 clusters are not appropriate, but we might want to anchor at 3 clusters for our analysis since the silhouette scores peaked there. Additionally, 3 color cluster per photo also seems reasonable as most influencers do not wear too many colors in one photo

In [56]:
sample100_score_df = cluster2score_df.merge(cluster3score_df, how = 'left', on = 'image_list')
sample100_score_df

Unnamed: 0,image_list,cluster2_samplescores,cluster3_samplescores
0,../pictures/wo_background/mirchelley/mirchelle...,0.538705,0.573004
1,../pictures/wo_background/kimlimhl/kimlimhl_20...,0.593360,0.565264
2,../pictures/wo_background/ohhowstrange/ohhowst...,0.662018,0.628560
3,../pictures/wo_background/mongabong/mongabong_...,0.484139,0.522076
4,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.672311,0.701073
...,...,...,...
95,../pictures/wo_background/ohhowstrange/ohhowst...,0.679931,0.633929
96,../pictures/wo_background/mongabong/mongabong_...,0.672868,0.708039
97,../pictures/wo_background/aureliang_/aureliang...,0.590585,0.593973
98,../pictures/wo_background/ec24m/ec24m_2021-03-...,0.645308,0.628080


In [57]:
sample100_score_df = sample100_score_df.merge(cluster4score_df, how = 'left', on = 'image_list')
sample100_score_df

Unnamed: 0,image_list,cluster2_samplescores,cluster3_samplescores,cluster4_samplescores
0,../pictures/wo_background/mirchelley/mirchelle...,0.538705,0.573004,0.458616
1,../pictures/wo_background/kimlimhl/kimlimhl_20...,0.593360,0.565264,0.558163
2,../pictures/wo_background/ohhowstrange/ohhowst...,0.662018,0.628560,0.527730
3,../pictures/wo_background/mongabong/mongabong_...,0.484139,0.522076,0.513245
4,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.672311,0.701073,0.637278
...,...,...,...,...
95,../pictures/wo_background/ohhowstrange/ohhowst...,0.679931,0.633929,0.615704
96,../pictures/wo_background/mongabong/mongabong_...,0.672868,0.708039,0.636422
97,../pictures/wo_background/aureliang_/aureliang...,0.590585,0.593973,0.523820
98,../pictures/wo_background/ec24m/ec24m_2021-03-...,0.645308,0.628080,0.632259


In [60]:
sample100_score_df.to_csv('../data/sample100_score.csv', index = False)

In [16]:
%%time
#try running all scores for 3 clusters
cluster3_allscores = []
image_list = []
for n in range(len(rgba_df)):
    score = cluster3_score(rgba_df.iloc[n]['rgba_list'])
    image_path = rgba_df.iloc[n]['image_list']
    cluster3_allscores.append(score)
    image_list.append(image_path)

CPU times: user 9h 2min 56s, sys: 4h 52min 58s, total: 13h 55min 54s
Wall time: 4h 19min 51s


In [17]:
statistics.median(cluster3_allscores)

0.6034284128002082

In [18]:
statistics.mean(cluster3_allscores)

0.6084052841646067

The scores for each photos using k=3 seems reasonable at 0.61, and close to the sample scores as well. This provides comfort for using these clusters, and moving to the next stage of clustering across multiple photos

In [19]:
cluster3_allscores_df = pd.DataFrame(zip(image_list, cluster3_allscores), columns = ['image_list', 'cluster3_scores'])
cluster3_allscores_df

Unnamed: 0,image_list,cluster3_scores
0,../pictures/wo_background/ec24m/ec24m_2021-01-...,0.498040
1,../pictures/wo_background/ec24m/ec24m_2021-11-...,0.797430
2,../pictures/wo_background/ec24m/ec24m_2021-03-...,0.605453
3,../pictures/wo_background/ec24m/ec24m_2021-06-...,0.499142
4,../pictures/wo_background/ec24m/ec24m_2021-02-...,0.549464
...,...,...
6832,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.627131
6833,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.622005
6834,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.603268
6835,../pictures/wo_background/xinlinnn/xinlinnn_20...,0.540809


In [20]:
cluster3_allscores_df.to_csv('../data/cluster3_allscores.csv', index = False)