In [1]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import datetime

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from PIL import Image
from PIL import ImageFile
import os
import numpy as np
import pandas as pd

In [2]:
# Load list of top 34 insta accounts
final_acct_df = pd.read_csv('../data/final_acct_df.csv')

In [3]:
accounts = final_acct_df['handle'].tolist()
accounts

['ec24m',
 'jiaqiwoo',
 'limrebecca',
 'speishi',
 'dreachong',
 'carriewst',
 'kimlimhl',
 'novitalam',
 'julietan_cxq',
 'mongabong',
 'aureliang_',
 'xplacidacidx',
 'melissackoh',
 'nicolekittykatx',
 'zoetay10',
 'hayleywoojiayi',
 'yahuiyh',
 'denisesoongeelyn',
 'rchlwngxx',
 'saffronsharpe',
 'euniceannabel',
 'katepang311',
 'honglingg_',
 'elaineruimin',
 'mirchelley',
 'sheila_sim',
 'fionafussi',
 'cheyennechesney',
 'soniachew',
 'narellekheng',
 'ohhowstrange',
 'xoxoapo',
 'jacelyn_tay',
 'xinlinnn']

In [4]:
# #splits accounts list into smaller chunks to make processing more manageable

# accounts_list = list()
# chunk_size = 5

# for i in range(0, len(accounts), chunk_size):
#     accounts_list.append(accounts[i:i+chunk_size])
    
# len(accounts_list)


In [5]:
#instantiate empty list to append data from each photo
rgba_ravel_list = []
image_list = []
instagram_handle = []
date_post = []
seq_post_samedate = []

In [6]:
#set size for resizing each photo so that they are all the same size
new_size = (400,400)

In [7]:
%%time
#for loop to loop through all photos in the 34 accounts
#append rgba information, image path, instagram handle, date of post
#and also sequence of post if there are more than one post a day

for account in accounts:
    in_path ='../pictures/wo_background/' + account
    for image_path in os.listdir(in_path):
        # image_path contains name of the image 
        input_path = os.path.join(in_path, image_path)
        imgfile = Image.open(input_path)
        imgfile = imgfile.resize(new_size)
        na = np.array(imgfile.getdata(), np.uint8)
        row = na.ravel()

        rgba_ravel_list.append(row)
        image_list.append(input_path)
        instagram_handle.append(account)
        date_post.append(image_path[-16:-6])
        seq_post_samedate.append(image_path[-5:-4])

CPU times: user 16min 8s, sys: 5.39 s, total: 16min 14s
Wall time: 16min 28s


In [8]:
rgba_df = pd.DataFrame(zip(instagram_handle, image_list, date_post, seq_post_samedate, rgba_ravel_list), \
                       columns = ['instagram_handle', 'image_list', 'date_post', 'seq_post_samedate', 'rgba_ravel_list'])
rgba_df.shape

(6639, 5)

In [9]:
#delete corrupted posts that contains .DS
rgba_df = rgba_df[~rgba_df.date_post.str.contains('.DS')]
rgba_df.shape

(6638, 5)

In [10]:
#translate seq_post_samedate
rgba_df['seq_post_samedate'].value_counts()

1    1863
C    1754
2    1621
3     758
4     316
5     124
6      77
7      43
8      33
9      29
0      12
c       8
Name: seq_post_samedate, dtype: int64

In [11]:
rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate']\
.map({'C':'1','c':'1','0':'11','1':'2','2':'3','3':'4','4':'5','5':'6','6':'7','7':'8','8':'9','9':'10'})
rgba_df['seq_post_samedate'].value_counts()

2     1863
1     1762
3     1621
4      758
5      316
6      124
7       77
8       43
9       33
10      29
11      12
Name: seq_post_samedate, dtype: int64

In [12]:
rgba_df['date_post'] = pd.to_datetime(rgba_df['date_post'])
rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate'].astype(str).astype(int)
rgba_df.to_csv('../data/accounts_rgba.csv')

In [13]:
def cluster2(x):
    x = x.reshape(-1, 4)
    x = np.array([f for f in x if f[3] !=0], np.uint8)
    clt = KMeans(n_clusters = 2, max_iter=500, random_state=5).fit(x)
    centers = clt.cluster_centers_
    centers = np.uint8(centers)
    centers_ravel = centers.ravel()
    return centers_ravel

In [14]:
%%time
rgba_df['rgba_centers_2'] = rgba_df['rgba_ravel_list'].apply(cluster2)

CPU times: user 3h 21min 20s, sys: 34min 20s, total: 3h 55min 41s
Wall time: 34min 4s


In [15]:
rgba_df.head()

Unnamed: 0,instagram_handle,image_list,date_post,seq_post_samedate,rgba_ravel_list,rgba_centers_2
0,ec24m,../pictures/wo_background/ec24m/ec24m_2021-01-...,2021-01-31,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[177, 131, 134, 253, 13, 10, 9, 103]"
1,ec24m,../pictures/wo_background/ec24m/ec24m_2021-11-...,2021-11-09,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[111, 101, 97, 250, 5, 5, 4, 15]"
2,ec24m,../pictures/wo_background/ec24m/ec24m_2021-03-...,2021-03-03,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[66, 52, 49, 189, 221, 206, 188, 252]"
3,ec24m,../pictures/wo_background/ec24m/ec24m_2021-06-...,2021-06-29,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[177, 79, 58, 251, 41, 22, 19, 184]"
4,ec24m,../pictures/wo_background/ec24m/ec24m_2021-02-...,2021-02-03,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[191, 138, 123, 249, 25, 12, 11, 76]"


In [16]:
rgba_df.tail()

Unnamed: 0,instagram_handle,image_list,date_post,seq_post_samedate,rgba_ravel_list,rgba_centers_2
6634,xinlinnn,../pictures/wo_background/xinlinnn/xinlinnn_20...,2021-01-15,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 4, 4, 20, 112, 91, 87, 229]"
6635,xinlinnn,../pictures/wo_background/xinlinnn/xinlinnn_20...,2021-03-03,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[168, 148, 143, 251, 42, 30, 25, 215]"
6636,xinlinnn,../pictures/wo_background/xinlinnn/xinlinnn_20...,2021-02-03,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[17, 4, 4, 32, 170, 91, 78, 243]"
6637,xinlinnn,../pictures/wo_background/xinlinnn/xinlinnn_20...,2021-08-18,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[192, 164, 157, 253, 50, 38, 35, 210]"
6638,xinlinnn,../pictures/wo_background/xinlinnn/xinlinnn_20...,2021-01-31,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[36, 29, 28, 96, 193, 172, 169, 250]"


In [17]:
rgba_df.to_csv('../data/accounts_rgba_cluster.csv')

In [18]:
def cluster3(x):
    x = x.reshape(-1, 4)
    x = np.array([f for f in x if f[3] !=0], np.uint8)
    clt = KMeans(n_clusters = 3, max_iter=500, random_state=5).fit(x)
    centers = clt.cluster_centers_
    centers = np.uint8(centers)
    centers_ravel = centers.ravel()
    return centers_ravel

In [19]:
%%time
rgba_df['rgba_centers_3'] = rgba_df['rgba_ravel_list'].apply(cluster3)

CPU times: user 3h 44min 34s, sys: 39min 42s, total: 4h 24min 17s
Wall time: 38min 29s


In [20]:
rgba_df.to_csv('../data/accounts_rgba_cluster_3.csv')

In [21]:
def cluster4(x):
    x = x.reshape(-1, 4)
    x = np.array([f for f in x if f[3] !=0], np.uint8)
    clt = KMeans(n_clusters = 4, max_iter=500, random_state=5).fit(x)
    centers = clt.cluster_centers_
    centers = np.uint8(centers)
    centers_ravel = centers.ravel()
    return centers_ravel

In [22]:
%%time
rgba_df['rgba_centers_4'] = rgba_df['rgba_ravel_list'].apply(cluster4)
rgba_df.to_csv('../data/accounts_rgba_cluster_4.csv')

CPU times: user 4h 26min 41s, sys: 48min 17s, total: 5h 14min 58s
Wall time: 45min 57s


In [1]:
rgba_df

NameError: name 'rgba_df' is not defined