In [1]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import datetime

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from PIL import Image
from PIL import ImageFile
import os
import numpy as np
import pandas as pd

In [2]:
# Read the shortlist of the top 34 Instagram accounts into a DataFrame
final_acct_df = pd.read_csv(filepath_or_buffer='../data/final_acct_df.csv')

In [3]:
# Pull the values of the 'handle' column out as a plain Python list and display it
accounts = final_acct_df.loc[:, 'handle'].to_list()
accounts

['ec24m',
 'jiaqiwoo',
 'limrebecca',
 'speishi',
 'dreachong',
 'carriewst',
 'kimlimhl',
 'novitalam',
 'julietan_cxq',
 'mongabong',
 'aureliang_',
 'xplacidacidx',
 'melissackoh',
 'nicolekittykatx',
 'zoetay10',
 'hayleywoojiayi',
 'yahuiyh',
 'denisesoongeelyn',
 'rchlwngxx',
 'saffronsharpe',
 'euniceannabel',
 'katepang311',
 'honglingg_',
 'elaineruimin',
 'mirchelley',
 'sheila_sim',
 'fionafussi',
 'cheyennechesney',
 'soniachew',
 'narellekheng',
 'ohhowstrange',
 'xoxoapo',
 'jacelyn_tay',
 'xinlinnn']

In [4]:
# Break the account handles into consecutive groups of `chunk_size`
# so that each group can be processed on its own.
chunk_size = 5
accounts_list = [
    accounts[start:start + chunk_size]
    for start in range(0, len(accounts), chunk_size)
]

# Number of groups produced
len(accounts_list)


7

In [5]:
# Per-photo accumulators, filled in step by the image-loading loop.
# Each name is bound to its own, separate empty list.
rgba_ravel_list, image_list, instagram_handle, date_post, seq_post_samedate = (
    [] for _ in range(5)
)

In [6]:
# Target size handed to Image.resize so that every photo ends up with identical dimensions
new_size = (400, 400)

In [19]:
%%time
#for loop to loop through all photos within the 5 accounts (split 34 accounts into groups of 5)
#append rgba information, image path, instagram handle, date of post
#and also sequence of post if there are more than one post a day

for account in accounts_list[0]:
    in_path ='../pictures/wo_background/' + account
    for image_path in os.listdir(in_path):
        # image_path contains name of the image 
        input_path = os.path.join(in_path, image_path)
        imgfile = Image.open(input_path)
        imgfile = imgfile.resize(new_size)
        na = np.array(imgfile.getdata(), np.uint8)
        row = na.ravel()

        rgba_ravel_list.append(row)
        image_list.append(input_path)
        instagram_handle.append(account)
        date_post.append(image_path[-16:-6])
        seq_post_samedate.append(image_path[-5:-4])

CPU times: user 3min 36s, sys: 1.28 s, total: 3min 38s
Wall time: 3min 41s


In [20]:
# Combine the five parallel accumulator lists into one table with a row per photo,
# then display its (rows, columns) shape.
rgba_df = pd.DataFrame(
    zip(instagram_handle, image_list, date_post, seq_post_samedate, rgba_ravel_list),
    columns=[
        'instagram_handle',
        'image_list',
        'date_post',
        'seq_post_samedate',
        'rgba_ravel_list',
    ],
)
rgba_df.shape

(1671, 5)

In [21]:
# Drop rows whose extracted date_post contains the literal text '.DS'
# (presumably stray macOS .DS_Store files picked up by os.listdir — confirm).
# regex=False makes the '.' a literal dot; with the default regex=True the pattern '.DS'
# would match ANY character followed by 'DS'.
# .copy() yields an independent frame, so the column assignments made in later cells do
# not operate on a slice of the original (avoids pandas' SettingWithCopyWarning).
rgba_df = rgba_df[~rgba_df['date_post'].str.contains('.DS', regex=False)].copy()
rgba_df.shape

(1670, 5)

In [22]:
# Count how often each raw same-date sequence marker occurs, to see which codes need translating
rgba_df.loc[:, 'seq_post_samedate'].value_counts()

1    461
2    424
C    415
3    208
4     78
5     32
6     23
7     13
8      6
9      5
0      3
c      2
Name: seq_post_samedate, dtype: int64

In [23]:
# Translate the raw one-character markers into sequence numbers (still stored as strings):
# 'C' / 'c' become '1', the digits '1'..'9' each move up by one, and '0' becomes '11'.
# A marker that is not listed in the table is turned into NaN by Series.map.
rgba_df['seq_post_samedate'] = rgba_df['seq_post_samedate'].map({
    'C': '1',
    'c': '1',
    '0': '11',
    '1': '2',
    '2': '3',
    '3': '4',
    '4': '5',
    '5': '6',
    '6': '7',
    '7': '8',
    '8': '9',
    '9': '10',
})
# Show the distribution again after translation
rgba_df['seq_post_samedate'].value_counts()

2     461
3     424
1     417
4     208
5      78
6      32
7      23
8      13
9       6
10      5
11      3
Name: seq_post_samedate, dtype: int64

In [24]:
# Parse the date strings into datetimes
rgba_df['date_post'] = pd.to_datetime(rgba_df['date_post'])
# Turn the translated sequence markers into integers (via str first, as before)
rgba_df['seq_post_samedate'] = (
    rgba_df['seq_post_samedate']
    .astype(str)
    .astype(int)
)
# Persist the table for this group of accounts.
# NOTE(review): 'rgba_ravel_list' holds NumPy arrays, which CSV stores as their text form —
# confirm the file is not expected to round-trip the pixel data.
rgba_df.to_csv(path_or_buf='../data/accounts_0.csv')

In [25]:
# Preview the first five rows after the type conversions
rgba_df.head(5)

Unnamed: 0,instagram_handle,image_list,date_post,seq_post_samedate,rgba_ravel_list
0,ec24m,../pictures/wo_background/ec24m/ec24m_2021-01-...,2021-01-31,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,ec24m,../pictures/wo_background/ec24m/ec24m_2021-11-...,2021-11-09,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,ec24m,../pictures/wo_background/ec24m/ec24m_2021-03-...,2021-03-03,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,ec24m,../pictures/wo_background/ec24m/ec24m_2021-06-...,2021-06-29,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,ec24m,../pictures/wo_background/ec24m/ec24m_2021-02-...,2021-02-03,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [26]:
# View the flat pixel row stored at index label 0 as one (R, G, B, A) quadruple per line
rgba_df.loc[0, 'rgba_ravel_list'].reshape(-1, 4)

array([[  0,   0,   0,   0],
       [  0,   0,   0,   0],
       [  0,   0,   0,   0],
       ...,
       [172, 120, 140, 230],
       [155, 102, 121, 206],
       [135,  80,  98, 181]], dtype=uint8)

In [27]:
def cluster2(x):
    """Return the two K-Means colour centres of one image as a flat uint8 vector.

    Parameters
    ----------
    x : numpy.ndarray
        Flat array of pixel values laid out as R, G, B, A repeated for every pixel;
        its length must be a multiple of 4 so it can be viewed as rows of four values.

    Returns
    -------
    numpy.ndarray
        Eight uint8 values: the first cluster centre (R, G, B, A) followed by the
        second. Centre coordinates are truncated (not rounded) when cast to uint8.

    Raises
    ------
    ValueError
        If fewer than two pixels have a non-zero alpha value, since two clusters
        cannot be fitted to fewer than two samples.
    """
    pixels = np.asarray(x).reshape(-1, 4)
    # Keep only pixels whose alpha channel is non-zero. A boolean mask does this in one
    # vectorised step instead of looping over every pixel in Python.
    visible = pixels[pixels[:, 3] != 0].astype(np.uint8)
    if len(visible) < 2:
        raise ValueError(
            'cluster2 needs at least 2 pixels with non-zero alpha, got %d' % len(visible)
        )
    # n_init is pinned to 10 (the long-standing scikit-learn default) so results do not
    # change with library versions that switched the default to 'auto'.
    clt = KMeans(n_clusters=2, n_init=10, max_iter=500, random_state=5).fit(visible)
    centers = np.uint8(clt.cluster_centers_)
    return centers.ravel()

In [28]:
%%time
rgba_df['rgba_centers_2'] = rgba_df['rgba_ravel_list'].apply(cluster2)

In [29]:
# Preview the first five rows, now including the rgba_centers_2 column
rgba_df.head(5)

Unnamed: 0,instagram_handle,image_list,date_post,seq_post_samedate,rgba_ravel_list,rgba_centers_2
0,ec24m,../pictures/wo_background/ec24m/ec24m_2021-01-...,2021-01-31,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[177, 131, 134, 253, 13, 10, 9, 103]"
1,ec24m,../pictures/wo_background/ec24m/ec24m_2021-11-...,2021-11-09,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[111, 101, 97, 250, 5, 5, 4, 15]"
2,ec24m,../pictures/wo_background/ec24m/ec24m_2021-03-...,2021-03-03,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[66, 52, 49, 189, 221, 206, 188, 252]"
3,ec24m,../pictures/wo_background/ec24m/ec24m_2021-06-...,2021-06-29,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[177, 79, 58, 251, 41, 22, 19, 184]"
4,ec24m,../pictures/wo_background/ec24m/ec24m_2021-02-...,2021-02-03,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[191, 138, 123, 249, 25, 12, 11, 76]"


In [30]:
# Preview the last five rows, now including the rgba_centers_2 column
rgba_df.tail(5)

Unnamed: 0,instagram_handle,image_list,date_post,seq_post_samedate,rgba_ravel_list,rgba_centers_2
1666,dreachong,../pictures/wo_background/dreachong/dreachong_...,2021-11-26,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[129, 95, 76, 214, 216, 208, 200, 253]"
1667,dreachong,../pictures/wo_background/dreachong/dreachong_...,2021-07-19,8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[23, 20, 17, 159, 172, 148, 126, 248]"
1668,dreachong,../pictures/wo_background/dreachong/dreachong_...,2021-10-19,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[149, 118, 90, 241, 14, 11, 8, 25]"
1669,dreachong,../pictures/wo_background/dreachong/dreachong_...,2021-07-26,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[30, 22, 17, 248, 10, 9, 8, 17]"
1670,dreachong,../pictures/wo_background/dreachong/dreachong_...,2021-04-14,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[162, 128, 106, 252, 14, 11, 9, 44]"


In [32]:
# Keep the finished frame for the first group of accounts under its own name.
# .copy() makes it an independent snapshot: a plain assignment would only create a second
# name for the same object, so any later in-place edit of rgba_df would change it too.
rgba_df_0 = rgba_df.copy()

In [None]:
%%time
#for loop to loop through all photos within the 5 accounts (split 34 accounts into groups of 5)
#append rgba information, image path, instagram handle, date of post
#and also sequence of post if there are more than one post a day

for account in accounts_list[0]:
    in_path ='../pictures/wo_background/' + account
    for image_path in os.listdir(in_path):
        # image_path contains name of the image 
        input_path = os.path.join(in_path, image_path)
        imgfile = Image.open(input_path)
        imgfile = imgfile.resize(new_size)
        na = np.array(imgfile.getdata(), np.uint8)
        row = na.ravel()

        rgba_ravel_list.append(row)
        image_list.append(input_path)
        instagram_handle.append(account)
        date_post.append(image_path[-16:-6])
        seq_post_samedate.append(image_path[-5:-4])

In [None]:
# Combine the five parallel accumulator lists into one table with a row per photo,
# then display its (rows, columns) shape.
rgba_df = pd.DataFrame(
    zip(instagram_handle, image_list, date_post, seq_post_samedate, rgba_ravel_list),
    columns=[
        'instagram_handle',
        'image_list',
        'date_post',
        'seq_post_samedate',
        'rgba_ravel_list',
    ],
)
rgba_df.shape

In [None]:
# Drop rows whose extracted date_post contains the literal text '.DS'
# (presumably stray macOS .DS_Store files picked up by os.listdir — confirm).
# regex=False makes the '.' a literal dot; with the default regex=True the pattern '.DS'
# would match ANY character followed by 'DS'.
# .copy() yields an independent frame, so later column assignments do not operate on a
# slice of the original (avoids pandas' SettingWithCopyWarning).
rgba_df = rgba_df[~rgba_df['date_post'].str.contains('.DS', regex=False)].copy()
rgba_df.shape

In [None]:
# Count how often each raw same-date sequence marker occurs, to see which codes need translating
rgba_df.loc[:, 'seq_post_samedate'].value_counts()