In [1]:
%%file ClusteringTrainKM.py

import skimage
import os
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from skimage import data
from skimage import io
import glob
from skimage.viewer import ImageViewer
import cv2
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib

class ClusteringKM:
    """Training-side helpers for clustering images and users.

    The methods cover one workflow: load images into ``self.train_imgs``,
    turn every image into one row of colour / edge / keypoint features, fit a
    Gaussian mixture over the image rows and a K-Means model over user rows,
    and optionally copy the images into one folder per predicted cluster.
    """

    #Initialization
    def __init__(self, train_path, save_path):
        """Remember the folder images are read from and the folder output goes to."""
        self.train_path = train_path
        self.save_path = save_path

    def get_train_images(self, train_path, user_list):
        """Load ``<train_path><entry>.jpg`` for every entry of ``user_list``.

        The images are stored in ``self.train_imgs`` keyed by ``(entry, entry)``;
        both key elements later become the first two columns of the feature
        frame. The ``train_path`` argument is used, not ``self.train_path``.
        """
        self.train_imgs = {}
        print(len(user_list))
        for entry in user_list:
            img_path = train_path + str(entry) + '.jpg'
            img = io.imread(img_path)
            print(img_path)
            self.train_imgs[(entry, entry)] = img
        print("Number of images loaded:", len(self.train_imgs))

    @staticmethod
    def _channel_stats(img, channel):
        """Return (mean, std, median) of one colour channel of ``img``."""
        values = img[:, :, channel].ravel()
        return np.mean(values), np.std(values), np.median(values)

    #Helper function to convert image to d-dimension vector for each image and
    #return dataframe of all images
    def convert_to_features(self, columns):
        """Build one feature row per loaded image and return them as a DataFrame.

        Each row is ``[key[0], key[1], 9 colour statistics, mean Canny response,
        ORB_X, ORB_Y]`` (14 values), so ``columns`` must name 14 columns in that
        order. Images for which the keypoint step raises ``ValueError`` are
        skipped. Because identifiers and numbers share one ``np.array``, every
        value in a row is stored with a common (string) dtype.
        """
        features = []
        for key, img in self.train_imgs.items():
            r_mean, r_std, r_med = self._channel_stats(img, 0)
            g_mean, g_std, g_med = self._channel_stats(img, 1)
            b_mean, b_std, b_med = self._channel_stats(img, 2)
            # NOTE(review): the image comes from skimage's io.imread but is
            # converted with a BGR->HSV code; confirm the channel order is
            # intended. Left unchanged to stay consistent with saved models.
            canny = np.mean(np.ravel(cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2HSV),100,200,L2gradient = True)))
            try:
                orb = cv2.ORB_create(100)
                kp = orb.detect(img, None)
                kp, des = orb.compute(img, kp)
                # A single-cluster K-Means gives the centroid of the keypoints.
                orb_centers = list(KMeans(1).fit([point.pt for point in kp]).cluster_centers_)
                # NOTE(review): first coordinate is scaled by shape[0] and the
                # second by shape[1]; verify against the (x, y) keypoint
                # convention. Left unchanged so features match trained models.
                orbx1 = orb_centers[0][0]*255/np.shape(img)[0]
                orby1 = orb_centers[0][1]*255/np.shape(img)[1]
            except ValueError:
                continue
            features.append(np.array([key[0], key[1], r_mean, r_std, r_med, g_mean, g_std, g_med, b_mean, b_std, b_med, canny, orbx1, orby1]))
        df = pd.DataFrame(features, columns = columns)
        return df

    def model_images_fit(self, df, k, extra_cols, rand_state):
        """Fit a ``k``-component Gaussian mixture on ``df`` minus ``extra_cols``.

        Adds ``Prob_0`` .. ``Prob_{k-1}`` (component probabilities) and
        ``Prediction`` (hard label) to ``df`` in place.

        ``rand_state`` seeds the mixture model; it was previously accepted but
        ignored in favour of a hard-coded seed.

        Returns:
            ``(df, model)`` - the augmented frame and the fitted model.
        """
        #Delete reference columns
        data = df.drop(columns=list(extra_cols))

        #Implement Gaussian Mixture Model Algortihm
        model = GaussianMixture(n_components=k, random_state=rand_state)

        #Fit Model and Predict
        model.fit(data)
        y_pred = model.predict_proba(data)

        #Add prediction to dataframe and return
        for component in range(k):
            df["Prob_" + str(component)] = y_pred[:, component]
        df["Prediction"] = model.predict(data)
        return df, model

    def model_users_fit(self, df, k, extra_cols, rand_state):
        """Fit K-Means with ``k`` clusters on ``df`` minus ``extra_cols``.

        Adds a ``Prediction`` column (the fitted ``labels_``) to ``df`` in place.

        Returns:
            ``(df, model)`` - the augmented frame and the fitted model.
        """
        #Delete reference columns
        data = df.drop(columns=list(extra_cols))

        #Implement K-Means Algortihm
        model = KMeans(n_clusters=k, random_state=rand_state)

        #Fit Model and Return
        model.fit(data)
        df['Prediction'] = model.labels_
        return df, model

    #Helper function to create folders for Image Clustering
    def save_clusters(self, df, label):
        """Write every loaded image into ``<save_path><label>/Cluster<prediction>/``.

        ``df`` needs ``URL`` and ``Prediction`` columns; the image's name (second
        key element) is matched against ``URL``. Images without a matching row
        are skipped.
        """
        # NOTE(review): this appends to self.save_path on every call, so a
        # second call nests its output under the first label. Kept as-is in
        # case callers read self.save_path afterwards - confirm intent.
        self.save_path += label + "/"
        for (_, name), img in self.train_imgs.items():
            matches = df[df["URL"] == name]
            try:
                pred_folder = str(matches['Prediction'].values[0])
            except IndexError:
                continue
            cluster_dir = self.save_path + "Cluster" + pred_folder + "/"
            os.makedirs(cluster_dir, exist_ok=True)
            # NOTE(review): names produced by get_train_images carry no file
            # extension; confirm io.imsave can infer the output format.
            io.imsave(cluster_dir + name, img)
        print("All Images Saved.")

    #Helper function to obtain percentage of Cluster Presence
    def get_cluster_presence(self, df, k):
        """Average ``Prob_0`` .. ``Prob_{k-1}`` per ``User_Handle``.

        Returns a frame with one row per distinct user holding ``User_Handle``
        and ``Cluster_0`` .. ``Cluster_{k-1}``; missing values are filled with 0.
        """
        cluster_presence = []
        for handle in list(df['User_Handle'].unique()):
            user_rows = df[df['User_Handle'] == handle]
            post_count = len(user_rows)
            user_dict = {'User_Handle': handle}
            for cluster in range(k):
                user_dict["Cluster_" + str(cluster)] = sum(user_rows['Prob_' + str(cluster)]) / post_count
            cluster_presence.append(user_dict)
        df_presence = pd.DataFrame(cluster_presence)
        df_presence = df_presence.fillna(0)
        return df_presence

    #Helper function to save model
    def save_model(self, model, path):
        """Serialise ``model`` to ``path`` with joblib."""
        joblib.dump(model, path)
        print("Model Saved.")


import skimage
import os
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from skimage import data
from skimage import io
import glob
from skimage.viewer import ImageViewer
import cv2
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib

class Ranking:
    """Scoring-side helpers: load target images, featurise them, apply saved models."""

    #Initialization
    def __init__(self, target_path):
        """Remember the folder that holds the target ``.jpg`` images."""
        self.target_path = target_path

    #Helper Function to get images from target path
    def get_images_target(self):
        """Read every ``*.jpg`` under ``self.target_path`` into ``self.target_imgs``.

        Keys are ``("Input/Target", name)`` where ``name`` is the globbed path
        with ``self.target_path`` removed.
        """
        # NOTE(review): the name presumably keeps a leading "/" when
        # target_path has no trailing separator - verify against callers.
        loaded = {}
        self.target_imgs = loaded
        for img_path in glob.glob(self.target_path + '/*.jpg'):
            relative_name = img_path.replace(self.target_path, '')
            pixels = io.imread(img_path)
            loaded[("Input/Target", relative_name)] = pixels
        print("Number of images loaded:", len(self.target_imgs))

    #Helper function to load saved model
    def load_model(self, path):
        """Deserialise and return the model stored at ``path`` (joblib)."""
        # joblib.load unpickles the file: only load model files you trust.
        loaded_model = joblib.load(path)
        print ("Model loaded.")
        return loaded_model

    #Helper function to convert image to d-dimension vector for each image and
    #return dataframe of all images
    def convert_to_features(self, columns):
        """Return a DataFrame with one 14-value feature row per target image.

        Row layout: both key elements, then mean/std/median for channels 0, 1
        and 2, the mean Canny response, and the scaled centroid of the ORB
        keypoints. Images whose keypoint step raises ``ValueError`` are skipped.
        """
        rows = []
        for key, img in self.target_imgs.items():
            colour_stats = []
            for channel in range(3):
                flat = img[:, :, channel].ravel()
                colour_stats.extend([np.mean(flat), np.std(flat), np.median(flat)])
            # NOTE(review): BGR->HSV code applied to an io.imread image;
            # confirm the channel order is intended.
            edges = cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2HSV),100,200,L2gradient = True)
            canny = np.mean(np.ravel(edges))
            try:
                detector = cv2.ORB_create(100)
                keypoints = detector.detect(img, None)
                keypoints, _descriptors = detector.compute(img, keypoints)
                locations = [point.pt for point in keypoints]
                # One-cluster K-Means yields the centroid of the keypoints.
                centroid = list(KMeans(1).fit(locations).cluster_centers_)[0]
                height_width = np.shape(img)
                orb_x = centroid[0]*255/height_width[0]
                orb_y = centroid[1]*255/height_width[1]
            except ValueError:
                continue
            rows.append(np.array([key[0], key[1], *colour_stats, canny, orb_x, orb_y]))
        return pd.DataFrame(rows, columns = columns)

    #Helper function to make prediction for target images using image model
    def predict(self, df, model, k, cluster_names, extra_cols):
        """Add per-cluster probabilities and the hard label to ``df`` in place.

        Probability columns are named ``"<cluster_names[i]> (<i>)"`` for
        ``i`` in ``0..k-1``; the label goes into ``"Prediction"``. ``extra_cols``
        are removed from the copy handed to ``model``. Returns ``df``.
        """
        model_input = df.copy(deep=True)

        #Delete reference columns
        for column in extra_cols:
            del model_input[column]

        #Make Prediction
        probabilities = model.predict_proba(model_input)

        #Add prediction to dataframe and return
        for idx in range(k):
            df[cluster_names[idx] + " (" + str(idx) + ")"] = probabilities[:, idx]
        df["Prediction"] = model.predict(model_input)
        return df

    #Helper function to generate distance dictionary
    def get_result(self, df, k, model):
        """Map each row's second column to its distances from the ``k`` centroids.

        For every row, columns ``14 .. 14+k-1`` are read as floats and the
        Euclidean norm of their difference to each ``model.cluster_centers_[j]``
        is collected into a list.
        """
        distances = {}
        for row in range(len(df)):
            file_key = df.iloc[row, 1]
            row_probs = df.iloc[row, 14:14 + k].astype(float)
            distances[file_key] = [
                np.linalg.norm(row_probs - model.cluster_centers_[centre])
                for centre in range(k)
            ]
        return distances

Overwriting ClusteringTrainKM.py


In [3]:
import cv2 ## issues 
import skimage
from skimage import io
import pickle
#define File-Path to Users Folder
#import scikit-image
import skimage
import os
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from skimage import data
from skimage import io
from sklearn.mixture import GaussianMixture
import glob
from skimage.viewer import ImageViewer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
%matplotlib inline

#img = io.imread('~/Users/kimia/Desktop/Capstone/imgcluster/Cluster4/11117159_1591691794419786_1496739027_n.jpg')

  warn('Viewer requires Qt')


In [5]:


import ClusteringTrainKM

# All three paths point at the same local folder of numbered .jpg images.
# NOTE(review): absolute, machine-specific paths; `dest_img` is not used in the visible cells.
train_path = '/Users/kimia/Desktop/Capstone/imgcluster/Cluster4/'
save_path = '/Users/kimia/Desktop/Capstone/imgcluster/Cluster4/'
dest_img =  '/Users/kimia/Desktop/Capstone/imgcluster/Cluster4/'
# NOTE(review): `userlist` is not referenced again in the visible cells; the call below passes its own list of strings.
userlist= [1,2,3]
# Each "user" entry is really a picture number: get_train_images reads <train_path><entry>.jpg.

# Create the clustering object, then load images 1.jpg, 2.jpg and 3.jpg from train_path.


c = ClusteringTrainKM.ClusteringKM(train_path, save_path )
c.get_train_images(train_path, user_list = ['1','2','3'])
# `c` now holds the loaded images in c.train_imgs.


#obj_train = ClusteringTrain.Clustering(train_path, dest_path)

3
/Users/kimia/Desktop/Capstone/imgcluster/Cluster4/1.jpg
/Users/kimia/Desktop/Capstone/imgcluster/Cluster4/2.jpg
/Users/kimia/Desktop/Capstone/imgcluster/Cluster4/3.jpg
Number of images loaded: 3


In [6]:
# Build the training feature frame.
# The 14 names must follow the row layout produced by convert_to_features:
# two identifier columns, nine colour statistics (mean/std/median per channel),
# the mean Canny response, and the two ORB centroid coordinates.
train_df_cols = ["User_Handle","URL","R_Mean", "R_STD", "R_MED", "G_Mean", "G_STD", 
                "G_MED", "B_Mean", "B_STD", "B_MED", "Canny", "ORB_X", "ORB_Y"]

# One row per loaded image (images whose keypoint step fails are skipped).
train_df = c.convert_to_features(train_df_cols)
display(train_df.head())

Unnamed: 0,User_Handle,URL,R_Mean,R_STD,R_MED,G_Mean,G_STD,G_MED,B_Mean,B_STD,B_MED,Canny,ORB_X,ORB_Y
0,1,1,135.0129197103678,87.63115693479644,153.0,115.81277500106796,85.89295297982054,109.0,58.67071158528771,67.8223132968387,33.0,22.53199891067538,146.9227747599284,130.7175958633423
1,2,2,182.6249466017344,71.07541423145153,214.0,114.09603945063864,61.1908801925312,116.0,59.164076850783886,49.10236823158424,42.0,15.82652505446623,158.0485753377279,124.39350624084474
2,3,3,224.64943771626295,37.1131291445464,242.0,177.51967726088256,68.5927534247089,184.0,103.62004998077664,101.47531061170602,66.0,22.36791938997821,127.65924498240156,189.51394405364988


In [21]:
# Keep only the numeric feature columns for the model.
# `drop` returns a new frame, so `train_df` is left intact and this cell can be
# re-run safely (the previous alias + `del` mutated `train_df` and raised
# KeyError on a second run).
df = train_df.drop(columns=['User_Handle', 'URL'])

In [23]:
# Model

In [24]:
# Load the previously saved image-level model (used below via predict_proba / predict).
# NOTE(review): `sklearn.externals.joblib` is deprecated and absent from newer
# scikit-learn releases - confirm the pinned version or switch to the standalone
# `joblib` package. joblib.load unpickles the file, so only load trusted model files.
from sklearn.externals import joblib
model = joblib.load('model_images.plk')

In [62]:
# Load the saved user-level model and show its centroids:
# the output below is a 5 x 5 array, one row per user cluster.
model_user = joblib.load('model_users.plk')
model_user.cluster_centers_

array([[9.48407625e-06, 1.73430122e-02, 2.90112949e-02, 3.71834975e-02,
        9.16452711e-01],
       [4.49303540e-02, 8.15995366e-01, 7.48122668e-02, 2.66606310e-04,
        6.39954070e-02],
       [2.39535693e-01, 3.06436341e-01, 2.56485322e-01, 8.08224675e-02,
        1.16720177e-01],
       [0.00000000e+00, 6.35781539e-10, 0.00000000e+00, 9.99971800e-01,
        2.81989199e-05],
       [8.88222290e-02, 1.17511792e-01, 7.88032046e-01, 9.71445147e-17,
        5.63393249e-03]])

In [25]:
# Define the dataset: work on a deep copy so `df` itself is not touched here.
data = df.copy(deep=True)
# The model is loaded already fitted, so it is not re-fitted on these rows.
#model.fit(data)
# Per-row probabilities, one column per cluster of the loaded model.
y_pred = model.predict_proba(data)

In [29]:
cluster_count = 5
# Add one probability column per cluster, then the hard cluster label.
for i in range(cluster_count):
    df["Prob_" + str(i)] = y_pred[:, i]
df["Prediction"] = model.predict(data)


display(df.head())

Unnamed: 0,R_Mean,R_STD,R_MED,G_Mean,G_STD,G_MED,B_Mean,B_STD,B_MED,Canny,ORB_X,ORB_Y,Prob_0,Prob_1,Prob_2,Prob_3,Prob_4,Prediction
0,135.0129197103678,87.63115693479644,153.0,115.81277500106796,85.89295297982054,109.0,58.67071158528771,67.8223132968387,33.0,22.53199891067538,146.9227747599284,130.7175958633423,5.078126e-18,1.441767e-08,0.3988279,2.107406e-20,0.601172,4
1,182.6249466017344,71.07541423145153,214.0,114.09603945063864,61.1908801925312,116.0,59.164076850783886,49.10236823158424,42.0,15.82652505446623,158.0485753377279,124.39350624084474,3.710005e-15,4.197616e-06,0.8920294,1.499184e-14,0.107966,2
2,224.64943771626295,37.1131291445464,242.0,177.51967726088256,68.5927534247089,184.0,103.62004998077664,101.47531061170602,66.0,22.36791938997821,127.65924498240156,189.51394405364988,4.270073e-62,0.0004177991,9.59186e-07,1.6200369999999998e-87,0.999581,4


SyntaxError: invalid syntax (<ipython-input-74-ac259ba0006b>, line 1)

In [None]:
# Targeting PEOPLE
# Take the distribution representing an image and compare it to the distribution representing a person.
## Distribution over images: the same clusters as the images.


## Now we need to compare it to the files. So is the 'presence DF' the communities? Like the centroids?

## Or should I take a summary statistic of each cluster across all users and compare to that?

In [40]:
# Load the saved per-image frame (features, Prob_* columns and Prediction, per the output below).
# NOTE(review): absolute, machine-specific path.
imgdf = pd.read_csv('/Users/kimia/Desktop/Capstone/imgcluster/ImageDF.csv')

In [41]:
# Load the saved per-user cluster-presence frame (Cluster_* columns per User_Handle, per the output below).
# NOTE(review): absolute, machine-specific path.
Presdf = pd.read_csv('/Users/kimia/Desktop/Capstone/imgcluster/PresenceDF.csv')

In [50]:
# Show the per-image frame.
# NOTE(review): the output lists real user handles - clear it before sharing.
imgdf

Unnamed: 0.1,Unnamed: 0,User_Handle,URL,R_Mean,R_STD,R_MED,G_Mean,G_STD,G_MED,B_Mean,...,B_MED,Canny,ORB_X,ORB_Y,Prob_0,Prob_1,Prob_2,Prob_3,Prob_4,Prediction
0,0,vaso1977,21827077_506375753029628_3465546456159485952_n...,172.259250,56.227099,183.0,146.721663,64.759587,138.0,137.339684,...,121.0,9.192101,155.486795,149.935692,6.693617e-01,1.060961e-02,4.961585e-06,2.070243e-04,3.198167e-01,0
1,1,nsb.koc,15046992_217693265334940_4394742452892729344_a...,98.425589,72.363563,81.0,64.076227,54.718698,46.0,55.611240,...,35.0,32.337149,126.328119,107.812731,1.248927e-11,5.308764e-86,9.996695e-01,3.414626e-88,3.304517e-04,2
2,2,vivpeng,11939385_396871077175871_118051651_a.jpg,201.325401,87.815538,252.0,62.262006,54.399321,50.0,47.814006,...,34.0,12.118515,122.266867,170.398630,3.166672e-106,0.000000e+00,9.195696e-24,0.000000e+00,1.000000e+00,4
3,3,amrynevillek,12716665_662455757228752_1642125713_n.jpg,114.095361,94.209438,87.0,109.390010,93.569121,72.0,99.461399,...,54.0,39.386169,117.797005,118.173865,1.235896e-02,8.060160e-05,1.735250e-06,9.875587e-01,6.245210e-09,3
4,4,amrynevillek,10852606_628314927290546_2119210677_a.jpg,190.652311,67.111186,214.0,161.254044,84.122425,177.0,154.163689,...,156.0,46.353333,129.648461,106.242975,1.858675e-01,8.141261e-01,2.777746e-15,5.852880e-08,6.394297e-06,1
5,5,amrynevillek,12728646_487448638113445_821296255_n.jpg,177.049531,67.542306,178.0,160.916072,75.275741,156.0,145.451311,...,125.0,36.691113,116.628728,120.460276,2.089845e-01,5.156290e-01,7.709886e-09,2.753433e-01,4.309334e-05,1
6,6,danalev7,14733313_245212735896813_2838680762239156224_a...,110.036148,48.540876,103.0,137.933204,58.143363,137.0,137.343076,...,147.0,24.255086,133.770323,54.008643,4.253952e-08,2.365576e-52,7.066871e-04,8.467038e-67,9.992933e-01,4
7,7,irienyree,26430822_1424168774377458_8929179722110205952_...,155.126029,99.483736,207.0,147.763209,99.284687,173.0,150.273795,...,151.0,5.037474,122.337758,77.339165,8.560446e-08,6.225785e-04,1.897532e-43,3.858472e-07,9.993770e-01,4
8,8,lilachturgeman,13391159_1546421489000056_888671953_n.jpg,135.342278,63.211233,154.0,118.202515,71.151759,134.0,111.811640,...,118.0,7.419560,86.428868,219.068884,3.688045e-01,8.456452e-32,3.437180e-07,4.383313e-27,6.311951e-01,4
9,9,lilachturgeman,13249781_994608703990052_1289056129_n.jpg,141.409714,79.199018,162.0,125.051600,78.147231,129.0,113.513634,...,111.0,17.268318,99.684687,210.029382,6.758963e-01,4.595316e-16,5.150035e-05,1.245375e-06,3.240509e-01,0


In [39]:
# Show the per-user presence frame.
# NOTE(review): the output lists real user handles - clear it before sharing.
Presdf

Unnamed: 0.1,Unnamed: 0,Cluster_0,Cluster_1,Cluster_2,Cluster_3,Cluster_4,User_Handle,Prediction
0,0,0.6693617,0.01060961,4.961585e-06,0.0002070243,0.319817,vaso1977,2
1,1,1.248927e-11,5.308764e-86,0.9996695,3.4146259999999997e-88,0.00033,nsb.koc,3
2,2,3.166672e-106,0.0,9.195696e-24,0.0,1.0,vivpeng,0
3,3,0.135737,0.4432786,5.809865e-07,0.4209674,1.6e-05,amrynevillek,1
4,4,4.253952e-08,2.3655760000000003e-52,0.0007066871,8.467038e-67,0.999293,danalev7,0
5,5,8.560446e-08,0.0006225785,1.897532e-43,3.858472e-07,0.999377,irienyree,0
6,6,0.3081923,0.07207013,0.1721948,0.009413517,0.438129,lilachturgeman,2
7,7,0.1988187,0.2620269,0.09654036,0.3277307,0.114883,thiswhomustbekept,1
8,8,0.9986956,4.64769e-06,0.000196375,1.493064e-11,0.001103,j_f_lil,4


In [None]:
## Treating Presdf as the centroids

In [None]:
## KL Divergence

In [63]:
from scipy import stats

# First centroid of the user model, copied from the model_user.cluster_centers_ output above.
row0_vaso = ([9.48407625e-06, 1.73430122e-02, 2.90112949e-02, 3.71834975e-02,
        9.16452711e-01])
# Prob_0..Prob_4 of the first scored image (row 0 of the df output above).
myown4 = [5.078126e-18, 1.441767e-08, 3.988279e-01, 2.107406e-20, 0.601172]
# With both pk and qk given, stats.entropy returns the KL divergence of pk from qk.
print(stats.entropy(pk=row0_vaso, qk=myown4))

# Prob_0..Prob_4 of the second scored image (row 1 of the df output above).
myown2 = [3.710005e-15, 4.197616e-06, 8.920294e-01, 1.499184e-14, 0.107966]
#print(stats.entropy(pk=row0_vaso, qk=myown2))




2.1156842395594326


In [72]:
## This is the score depending on which user cluster you'd like to optimize for:
## one KL divergence per user-cluster centroid, each measured against the
## first image's probability vector (myown4).
for i in range(model_user.n_clusters):
    user_centroid = model_user.cluster_centers_[i]
    print(stats.entropy(pk=user_centroid, qk=myown4))
    

2.11568423826652
15.957836424717797
17.520506566471457
45.304657081334575
5.702374000473236


In [64]:
# rank from lowest to highest 

In [None]:
# We know the user type: are the images relevant to that user type?
# Mathematical and qualitative analysis.


In [None]:
# Two groups of images, for two types of users --> 3
# Know how the ranking should come out.
# Selfie person --> don't target them with landscape images.
#

In [None]:
# People to test the analysis on:
# Go grab painters and illustrators on IG.
# All their artwork.

# Selfie people --> beauty. Ask Ale and Andreea --> cosmetics.

# Retrain the model.
# See if the ranking does exactly this.

# https://www.instagram.com/explore/tags/mountaineering/

In [None]:
# Next steps: engagement.
# Predicted engagement value --> how to combine the scores (KL, engagement).
# Predict the number of shares, likes and comments.
# Predictive model: image with features --> engagement (likes, shares, etc.).
# Could be linear, could be something else.