In [None]:
import skimage
import os
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from skimage import data
from skimage import io
from sklearn.mixture import GaussianMixture
import glob
from skimage.viewer import ImageViewer
import cv2
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
%matplotlib inline

In [None]:
#Define List of Users and define File-Path to Users Folder
list_users = ['ana_brandine', 'vicky_regouli', 'luismiguelpss', 'ilariabiagini', 
              'emnegg', 'kerendhahn', 'agiorgina', 'roulamatta', 'jussbieber9827', 
              'eremiaheidr', 'eunhuiheo', 'anastasiakaps', 'achaelilsone', 'orit_talbi',
              'sorayaalassmi', 'altonolnlis', 'vaso1977', 'theunrealobserver', 'nsb.koc',
              'vivpeng', 'amrynevillek', 'danalev7', 'irienyree', 'lilachturgeman', 
              'emel_karakoc', 'thiswhomustbekept', 'j_f_lil', 'ulietteearneye', 
              'gilanaz', 'sarrahdolly', 'alexchahine97', 'photographerarson', 
              'angecanindo', 'fiona_smithson', 'chelsea_xu620']

#Both locations default to the original author's machine. Set the environment
#variables named below to run the notebook elsewhere without editing this cell.
#`path` is the folder holding one sub-folder per user handle (keep its trailing "/").
#`dest_path` is a prefix: a label such as "Competitors" is appended directly to it.
path = os.environ.get(
    "CAPSTONE_USERS_MEDIA_PATH",
    "/Users/kmotwani/Dropbox/Harvard/Capstone_EmpSirenuse/Datasets/sample_users_media/")
dest_path = os.environ.get(
    "CAPSTONE_CLUSTER_DEST_PATH",
    "/Users/kmotwani/Desktop/Me - Local/Education/Courses/Capstone Project/Clustering_kNN_")

In [None]:
#Helper Function to get images from path
def get_images(path, list_users):
    """Load every ``*.jpg`` image found in each listed user's folder.

    Parameters
    ----------
    path : str
        Folder that contains one sub-folder per user handle. A trailing
        path separator is optional.
    list_users : iterable of str
        User handles; each one is used as a sub-folder name of ``path``.

    Returns
    -------
    dict
        Maps ``(user_handle, file_name)`` to whatever ``io.imread`` returns
        for that file. Users without a folder or without ``.jpg`` files
        simply contribute no entries.
    """
    user_imgs = {}
    for user in list_users:
        # os.path.join works whether or not `path` ends with a separator,
        # unlike plain string concatenation.
        user_dir = os.path.join(path, user)
        for img_path in glob.glob(os.path.join(user_dir, '*.jpg')):
            # basename yields the bare file name regardless of how the
            # directory part of the match is spelled.
            user_imgs[(user, os.path.basename(img_path))] = io.imread(img_path)
    return user_imgs


#Get Images from User List and Path
#user_imgs maps (user handle, file name) -> image array, as built by get_images
user_imgs = get_images(path, list_users)
print("Number of images loaded:", len(user_imgs))

In [None]:
#Helper function to convert image to d-dimension vector
def convert(user_imgs):
    """Turn each image into one feature row of colour, edge and keypoint statistics.

    Parameters
    ----------
    user_imgs : dict
        Maps ``(user_handle, file_name)`` to an image array that is indexed
        as ``img[:, :, channel]`` for channels 0, 1 and 2.

    Returns
    -------
    list of list
        One row per image, in this order: ``user_handle, file_name, r_mean,
        r_std, r_med, g_mean, g_std, g_med, b_mean, b_std, b_med, canny,
        orbx1, orby1``. Rows are plain lists so the two identifiers stay
        strings and the twelve features stay numeric (packing them into a
        single ``np.array`` would coerce every value to a string).
        Images for which the keypoint step raises ``ValueError`` are skipped.
    """
    user_features = []
    for (handle, file_name), img in user_imgs.items():
        # mean / standard deviation / median of channels 0, 1 and 2, in that order
        colour_stats = []
        for channel in range(3):
            values = img[:, :, channel].ravel()
            colour_stats.extend([np.mean(values), np.std(values), np.median(values)])

        # Mean value of the Canny edge map computed on the colour-converted image.
        # NOTE(review): the images come from skimage's io.imread, which normally
        # yields RGB, while COLOR_BGR2HSV assumes BGR - confirm this is intended.
        canny = np.mean(np.ravel(cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2HSV), 100, 200, L2gradient=True)))

        try:
            orb = cv2.ORB_create(100)
            kp = orb.detect(img, None)
            kp, des = orb.compute(img, kp)
            # A one-cluster K-Means gives the centroid of the keypoint coordinates.
            orb_centers = list(KMeans(1).fit([point.pt for point in kp]).cluster_centers_)
            # Rescale the centroid to a 0-255 range.
            # NOTE(review): OpenCV keypoints are (x, y) while shape is (rows, cols),
            # so x is divided by the height here. Left unchanged because already
            # fitted models depend on these values - confirm intent.
            orbx1 = orb_centers[0][0] * 255 / np.shape(img)[0]
            orby1 = orb_centers[0][1] * 255 / np.shape(img)[1]
        except ValueError:
            # Presumably raised when no keypoints were found; such images are
            # deliberately left out of the result.
            continue

        user_features.append([handle, file_name] + colour_stats + [canny, orbx1, orby1])
    return user_features
    
#Convert Images
#One feature row per image; images skipped inside convert() are absent from the result
user_features = convert(user_imgs)
print("Feature Vectors Created.")

In [None]:
#Create Dataframe of Features
#The column names below must follow the order in which convert() assembles each row
df = pd.DataFrame(user_features)
df.columns = ["User_Handle","URL","R_Mean", "R_STD", "R_MED", "G_Mean", "G_STD", "G_MED", "B_Mean", "B_STD", "B_MED", "Canny", "ORB_X", "ORB_Y"]
display(df.head())

In [None]:
#Number of image-level mixture components
cluster_count = 4

#Model input: every column of df except the two identifier columns
data = df.drop(["User_Handle", "URL"], axis=1)
display(data.head())

#Gaussian Mixture Model with a fixed seed so the fit is repeatable
model_gaussian = GaussianMixture(n_components=cluster_count, random_state=9001)

#Fit on the features and get the per-component membership probabilities
model_gaussian.fit(data)
y_pred = model_gaussian.predict_proba(data)

#One "Prob_<i>" column per component, followed by the hard assignment
for cluster_idx in range(cluster_count):
    df["Prob_" + str(cluster_idx)] = y_pred[:, cluster_idx]
df["Prediction"] = model_gaussian.predict(data)


display(df.head())

In [None]:
#Helper function to create folders for Image Clustering
def save_clusters(df, user_imgs, dest_path, label):
    """Write every image into a folder named after its predicted cluster.

    Images are saved as ``<dest_path><label>/Cluster<prediction>/<file_name>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``User_Handle``, ``URL`` (the file name) and
        ``Prediction``.
    user_imgs : dict
        Maps ``(user_handle, file_name)`` to the image to save.
    dest_path : str
        Prefix of the output location; ``label + "/"`` is appended to it.
    label : str
        Name of the sub-folder that groups this run's cluster folders.

    Images that have no row in ``df`` are skipped.
    """
    dest_path += label + "/"

    # Build the (user, file) -> prediction table once instead of filtering the
    # whole frame for every image. Keying on both fields keeps images of
    # different users that share a file name apart. The first row wins when a
    # pair occurs more than once.
    predictions = {}
    for handle, url, pred in zip(df["User_Handle"], df["URL"], df["Prediction"]):
        predictions.setdefault((handle, url), pred)

    for key, img in user_imgs.items():
        if key not in predictions:
            continue
        temp_path = dest_path + "Cluster" + str(predictions[key]) + "/"
        os.makedirs(temp_path, exist_ok=True)
        io.imsave(temp_path + key[1], img)

#Write each image into a "Cluster<prediction>" folder under dest_path + "Competitors/"
save_clusters(df, user_imgs, dest_path, "Competitors")
print("All Images Saved.")

In [None]:
#Helper function to obtain percentage of Cluster Presence
def cluster_presence(df, k=None):
    """Average each user's cluster-membership probabilities over their images.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``User_Handle`` column and the columns ``Prob_0`` ...
        ``Prob_<k-1>``.
    k : int, optional
        Number of ``Prob_`` columns to aggregate. When omitted, the notebook
        level ``cluster_count`` is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    list of dict
        One dict per user, in order of first appearance, with the keys
        ``User_Handle`` and ``Cluster_0`` ... ``Cluster_<k-1>``.
    """
    if k is None:
        k = cluster_count

    presence = []
    for handle in df['User_Handle'].unique():
        user_rows = df[df['User_Handle'] == handle]
        post_count = len(user_rows)
        user_dict = {'User_Handle': handle}
        for j in range(k):
            # sum / count: the mean probability of cluster j over this user's posts
            user_dict["Cluster_" + str(j)] = sum(user_rows['Prob_' + str(j)]) / post_count
        presence.append(user_dict)
    return presence

#Create Cluster Presence Dataframe
#One row per user; any missing value is replaced with 0
presence_list = cluster_presence(df)
df_presence = pd.DataFrame(presence_list)
df_presence = df_presence.fillna(0)
display(df_presence)

In [None]:
#Number of user-level clusters
cluster_count = 4

#K-Means input: the per-user presence table without its identifier column
data_presence = df_presence.drop("User_Handle", axis=1)

#K-Means with a fixed seed; fit() returns the estimator itself
model_kmeans_users = KMeans(n_clusters=cluster_count, random_state=9001).fit(data_presence)

#Each centre is one representative user vector of cluster-presence values
print("Representative User Vectors:\n\n", model_kmeans_users.cluster_centers_)

In [None]:
#Helper Function to get images from target path 
def get_images_test(path):
    """Load every ``*.jpg`` image found directly inside ``path``.

    Parameters
    ----------
    path : str
        Folder holding the target images, with or without a trailing
        path separator.

    Returns
    -------
    dict
        Maps ``("Target", file_name)`` to whatever ``io.imread`` returns for
        that file.
    """
    user_imgs = {}
    for img_path in glob.glob(path + '/*.jpg'):
        # basename gives the bare file name. Stripping `path` and then one more
        # character cut off the first letter of the name whenever `path`
        # already ended with a separator.
        user_imgs[("Target", os.path.basename(img_path))] = io.imread(img_path)
    return user_imgs

In [None]:
#Folder holding the target images to score
target_path = "/Users/kmotwani/Dropbox/Harvard/Capstone_EmpSirenuse/Images/Test/"
target_dict = get_images_test(target_path)

#Feature rows for the target images, named like the training feature table
target_df = pd.DataFrame(convert(target_dict))
target_df.columns = ["User_Handle","URL","R_Mean", "R_STD", "R_MED", "G_Mean", "G_STD", 
                "G_MED", "B_Mean", "B_STD", "B_MED", "Canny", "ORB_X", "ORB_Y"]

#Model input: every column except the two identifier columns
data = target_df.drop(["User_Handle", "URL"], axis=1)

#Membership probabilities from the already fitted image model
y_pred = model_gaussian.predict_proba(data)

#One "Prob_<i>" column per component, followed by the hard assignment
for cluster_idx in range(cluster_count):
    target_df["Prob_" + str(cluster_idx)] = y_pred[:, cluster_idx]
target_df["Prediction"] = model_gaussian.predict(data)

display(target_df.head())

In [None]:
#Helper function to generate distance dictionary
def get_dist_dict(df, k, model, first_prob_col=14):
    """Distance from each row's probability vector to the model's first k centres.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 1 (by position) is used as the key of the result; the ``k``
        columns starting at position ``first_prob_col`` form the vector that
        is compared with the cluster centres.
    k : int
        Both the number of probability columns read from ``df`` and the number
        of centres taken from ``model.cluster_centers_``.
    model : object
        Anything exposing ``cluster_centers_`` whose rows have length ``k``.
    first_prob_col : int, optional
        Position of the first probability column. The default, 14, matches a
        frame with two identifier columns and twelve feature columns ahead of
        the probabilities, as built in this notebook.

    Returns
    -------
    dict
        Maps the value in column 1 to a list of ``k`` Euclidean distances,
        one per centre. Rows with the same value in column 1 overwrite
        each other.
    """
    final_dict = {}
    for i in range(0, len(df)):
        temp_file = df.iloc[i, 1]
        # The row's vector does not depend on the centre, so slice it once.
        row_vector = df.iloc[i, first_prob_col:first_prob_col + k].astype(float)
        final_dict[temp_file] = [
            np.linalg.norm(row_vector - model.cluster_centers_[j]) for j in range(0, k)
        ]
    return final_dict


#For every target image: distance from its cluster-probability columns to each user-cluster centre
final = get_dist_dict(target_df, cluster_count, model_kmeans_users)
print(final)

In [None]:
#Current scikit-learn releases no longer ship sklearn.externals.joblib, so import
#the standalone joblib package first and fall back to the bundled copy on old installs
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#Persist the user-level K-Means model, then reload it from disk to check the round trip
joblib.dump(model_kmeans_users, 'model_kmeans.pkl')
#NOTE: joblib.load unpickles the file, so only load model files you created yourself
model_kmeans_users = joblib.load('model_kmeans.pkl')
print(model_kmeans_users.cluster_centers_)

In [None]:
%%file ClusteringTrain.py

import skimage
import os
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from skimage import data
from skimage import io
import glob
from skimage.viewer import ImageViewer
import cv2
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib

class Clustering:
    """Training-side pipeline.

    Loads the users' images, turns them into feature rows, fits the
    image-level and user-level clustering models, and saves the results.
    """

    #Initialization
    def __init__(self, train_path, save_path):
        """Remember where the per-user image folders live and where output goes.

        ``train_path`` is the folder containing one sub-folder per user.
        ``save_path`` is a prefix to which ``save_clusters`` appends a label.
        """
        self.train_path = train_path
        self.save_path = save_path

    #Helper Function to get images from train path
    def get_train_images(self, user_list):
        """Load every ``*.jpg`` of each user in ``user_list`` into ``self.train_imgs``.

        ``self.train_imgs`` maps ``(user_handle, file_name)`` to whatever
        ``io.imread`` returns. The number of loaded images is printed.
        """
        self.train_imgs = {}
        for handle in user_list:
            # os.path.join works with or without a trailing separator on train_path
            user_dir = os.path.join(self.train_path, handle)
            for img_path in glob.glob(os.path.join(user_dir, '*.jpg')):
                self.train_imgs[(handle, os.path.basename(img_path))] = io.imread(img_path)
        print("Number of images loaded:", len(self.train_imgs))

    #Helper function to convert image to d-dimension vector for each image and
    #return dataframe of all images
    def convert_to_features(self, columns):
        """Build a DataFrame with one feature row per image in ``self.train_imgs``.

        Each row holds, in order: user handle, file name, then mean / std /
        median of channels 0, 1 and 2, the mean of the Canny edge map, and the
        rescaled centroid of the ORB keypoints (x, y). ``columns`` supplies the
        fourteen column names. Images for which the keypoint step raises
        ``ValueError`` are skipped.

        Rows are assembled as plain lists so that the numeric features keep a
        numeric dtype (a single ``np.array`` over mixed values would turn every
        entry into a string).
        """
        features = []
        for (handle, file_name), img in self.train_imgs.items():
            colour_stats = []
            for channel in range(3):
                values = img[:, :, channel].ravel()
                colour_stats.extend([np.mean(values), np.std(values), np.median(values)])

            # NOTE(review): io.imread normally yields RGB while COLOR_BGR2HSV
            # assumes BGR - confirm this is intended.
            canny = np.mean(np.ravel(cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2HSV), 100, 200, L2gradient=True)))

            try:
                orb = cv2.ORB_create(100)
                kp = orb.detect(img, None)
                kp, des = orb.compute(img, kp)
                # A one-cluster K-Means gives the centroid of the keypoint coordinates.
                orb_centers = list(KMeans(1).fit([point.pt for point in kp]).cluster_centers_)
                # NOTE(review): keypoints are (x, y) but shape is (rows, cols), so x is
                # divided by the height. Left as is because fitted models depend on it.
                orbx1 = orb_centers[0][0] * 255 / np.shape(img)[0]
                orby1 = orb_centers[0][1] * 255 / np.shape(img)[1]
            except ValueError:
                # Presumably raised when no keypoints were found; skip the image.
                continue

            features.append([handle, file_name] + colour_stats + [canny, orbx1, orby1])
        df = pd.DataFrame(features, columns=columns)
        return df

    def model_images_fit(self, df, k, extra_cols, rand_state):
        """Fit a ``k``-component Gaussian mixture on the image features.

        ``extra_cols`` names the columns of ``df`` that are not features and
        are left out of the fit. ``rand_state`` seeds the mixture model.
        ``df`` itself gains the columns ``Prob_0`` ... ``Prob_<k-1>`` and
        ``Prediction``; the same frame and the fitted model are returned.
        """
        #Drop reference columns from a copy; df keeps them
        data = df.drop(list(extra_cols), axis=1)

        #Implement Gaussian Mixture Model Algorithm (seed taken from rand_state)
        model = GaussianMixture(n_components=k, random_state=rand_state)

        #Fit Model and Predict
        model.fit(data)
        y_pred = model.predict_proba(data)

        #Add prediction to dataframe and return
        for idx in range(k):
            df["Prob_" + str(idx)] = y_pred[:, idx]
        df["Prediction"] = model.predict(data)
        return df, model

    def model_users_fit(self, df, k, extra_cols, rand_state):
        """Fit a ``k``-cluster K-Means on the per-user presence vectors.

        ``extra_cols`` names the non-feature columns of ``df``. ``df`` itself
        gains a ``Prediction`` column holding the fitted labels; the same
        frame and the fitted model are returned.
        """
        #Drop reference columns from a copy; df keeps them
        data = df.drop(list(extra_cols), axis=1)

        #Implement K-Means Algorithm
        model = KMeans(n_clusters=k, random_state=rand_state)

        #Fit Model and Return
        model.fit(data)
        df['Prediction'] = model.labels_
        return df, model

    #Helper function to create folders for Image Clustering
    def save_clusters(self, df, label):
        """Write each loaded image to ``<save_path><label>/Cluster<prediction>/``.

        ``df`` must contain the columns ``User_Handle``, ``URL`` (file name)
        and ``Prediction``. Images without a row in ``df`` are skipped.
        ``self.save_path`` is not modified, so repeated calls do not
        accumulate labels.
        """
        out_root = self.save_path + label + "/"

        # One pass over df instead of one frame scan per image. Keying on
        # (user, file) keeps same-named files of different users apart.
        # The first row wins on duplicates.
        predictions = {}
        for handle, url, pred in zip(df["User_Handle"], df["URL"], df["Prediction"]):
            predictions.setdefault((handle, url), pred)

        for key, img in self.train_imgs.items():
            if key not in predictions:
                continue
            temp_path = out_root + "Cluster" + str(predictions[key]) + "/"
            os.makedirs(temp_path, exist_ok=True)
            io.imsave(temp_path + key[1], img)
        print("All Images Saved.")

    #Helper function to obtain percentage of Cluster Presence
    def get_cluster_presence(self, df, k):
        """Average ``Prob_0`` ... ``Prob_<k-1>`` per user.

        Returns a DataFrame with one row per distinct ``User_Handle`` and the
        columns ``Cluster_0`` ... ``Cluster_<k-1>``; missing values are
        replaced with 0.
        """
        cluster_presence = []
        for handle in df['User_Handle'].unique():
            user_rows = df[df['User_Handle'] == handle]
            post_count = len(user_rows)
            user_dict = {'User_Handle': handle}
            for j in range(k):
                user_dict["Cluster_" + str(j)] = sum(user_rows['Prob_' + str(j)]) / post_count
            cluster_presence.append(user_dict)
        df_presence = pd.DataFrame(cluster_presence)
        df_presence = df_presence.fillna(0)
        return df_presence

    #Helper function to save model
    def save_model(self, model, path):
        """Serialise ``model`` to ``path`` with joblib and print a confirmation."""
        joblib.dump(model, path)
        print("Model Saved.")

In [None]:
%%file ClusteringTest.py

import skimage
import os
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from skimage import data
from skimage import io
import glob
from skimage.viewer import ImageViewer
import cv2
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib

class Ranking:
    """Scoring-side pipeline.

    Loads the target images, builds their feature rows, applies the saved
    image model, and measures the distance of each image to the user-cluster
    centres.
    """

    #Initialization
    def __init__(self, target_path):
        """Remember the folder that holds the target images."""
        self.target_path = target_path

    #Helper Function to get images from target path
    def get_images_target(self):
        """Load every ``*.jpg`` directly inside ``target_path`` into ``self.target_imgs``.

        ``self.target_imgs`` maps ``("Input/Target", file_name)`` to whatever
        ``io.imread`` returns. The number of loaded images is printed.
        """
        self.target_imgs = {}
        for img_path in glob.glob(self.target_path + '/*.jpg'):
            # basename gives the bare file name whether or not target_path
            # ends with a separator (a leading "/" used to remain without one)
            file_name = os.path.basename(img_path)
            self.target_imgs[("Input/Target", file_name)] = io.imread(img_path)
        print("Number of images loaded:", len(self.target_imgs))

    #Helper function to load saved model
    def load_model(self, path):
        """Load and return a model previously written with joblib."""
        # NOTE: joblib.load unpickles the file and can therefore execute
        # arbitrary code - only load model files from a trusted source.
        model = joblib.load(path)
        print ("Model loaded.")
        return model

    #Helper function to convert image to d-dimension vector for each image and
    #return dataframe of all images
    def convert_to_features(self, columns):
        """Build a DataFrame with one feature row per image in ``self.target_imgs``.

        Each row holds, in order: the key's first element, file name, then
        mean / std / median of channels 0, 1 and 2, the mean of the Canny edge
        map, and the rescaled centroid of the ORB keypoints (x, y).
        ``columns`` supplies the fourteen column names. Images for which the
        keypoint step raises ``ValueError`` are skipped.

        Rows are assembled as plain lists so that the numeric features keep a
        numeric dtype (a single ``np.array`` over mixed values would turn every
        entry into a string).
        """
        features = []
        for (handle, file_name), img in self.target_imgs.items():
            colour_stats = []
            for channel in range(3):
                values = img[:, :, channel].ravel()
                colour_stats.extend([np.mean(values), np.std(values), np.median(values)])

            # NOTE(review): io.imread normally yields RGB while COLOR_BGR2HSV
            # assumes BGR - confirm this is intended.
            canny = np.mean(np.ravel(cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2HSV), 100, 200, L2gradient=True)))

            try:
                orb = cv2.ORB_create(100)
                kp = orb.detect(img, None)
                kp, des = orb.compute(img, kp)
                # A one-cluster K-Means gives the centroid of the keypoint coordinates.
                orb_centers = list(KMeans(1).fit([point.pt for point in kp]).cluster_centers_)
                # NOTE(review): keypoints are (x, y) but shape is (rows, cols), so x is
                # divided by the height. Left as is because fitted models depend on it.
                orbx1 = orb_centers[0][0] * 255 / np.shape(img)[0]
                orby1 = orb_centers[0][1] * 255 / np.shape(img)[1]
            except ValueError:
                # Presumably raised when no keypoints were found; skip the image.
                continue

            features.append([handle, file_name] + colour_stats + [canny, orbx1, orby1])
        df = pd.DataFrame(features, columns=columns)
        return df

    #Helper function to make prediction for target images using image model
    def predict(self, df, model, k, cluster_names, extra_cols):
        """Add the image model's probabilities and hard prediction to ``df``.

        ``extra_cols`` names the non-feature columns left out of the model
        input. For each ``i < k`` a column ``"<cluster_names[i]> (<i>)"`` is
        added with the probability of component ``i``, then a ``Prediction``
        column. ``df`` is modified in place and returned.
        """
        #Drop reference columns from a copy; df keeps them
        data = df.drop(list(extra_cols), axis=1)

        #Make Prediction
        y_pred = model.predict_proba(data)

        #Add prediction to dataframe and return
        for idx in range(k):
            df[cluster_names[idx] + " (" + str(idx) + ")"] = y_pred[:, idx]
        df["Prediction"] = model.predict(data)
        return df

    #Helper function to generate distance dictionary
    def get_result(self, df, k, model, first_prob_col=14):
        """Distance from each row's probability vector to the model's first k centres.

        Column 1 of ``df`` (by position) is the key of the returned dict. The
        ``k`` columns starting at position ``first_prob_col`` are compared
        with ``model.cluster_centers_[0..k-1]`` by Euclidean norm. The default
        of 14 matches a frame with two identifier and twelve feature columns
        ahead of the probabilities. Rows with the same key overwrite each other.
        """
        final_dict = {}
        for i in range(0, len(df)):
            temp_file = df.iloc[i, 1]
            # The row's vector does not depend on the centre, so slice it once.
            row_vector = df.iloc[i, first_prob_col:first_prob_col + k].astype(float)
            final_dict[temp_file] = [
                np.linalg.norm(row_vector - model.cluster_centers_[j]) for j in range(0, k)
            ]
        return final_dict

In [1]:
import ClusteringTrain

  warn("Recommended matplotlib backend is `Agg` for full "


In [2]:
#Define paths
train_path = "/Users/kmotwani/Dropbox/Harvard/Capstone_EmpSirenuse/Datasets/sample_users_media/"
dest_path = "/Users/kmotwani/Desktop/Me - Local/Education/Courses/Capstone Project/Clustering_"

#Create Object
obj_train = ClusteringTrain.Clustering(train_path, dest_path)

In [3]:
#Define User List
user_list = ['ana_brandine', 'vicky_regouli', 'luismiguelpss', 'ilariabiagini', 
              'emnegg', 'kerendhahn', 'agiorgina', 'roulamatta', 'jussbieber9827', 
              'eremiaheidr', 'eunhuiheo', 'anastasiakaps', 'achaelilsone', 'orit_talbi',
              'sorayaalassmi', 'altonolnlis', 'vaso1977', 'theunrealobserver', 'nsb.koc',
              'vivpeng', 'amrynevillek', 'danalev7', 'irienyree', 'lilachturgeman', 
              'emel_karakoc', 'thiswhomustbekept', 'j_f_lil', 'ulietteearneye', 
              'gilanaz', 'sarrahdolly', 'alexchahine97', 'photographerarson', 
              'angecanindo', 'fiona_smithson', 'chelsea_xu620']


#Get Train Images
obj_train.get_train_images(user_list)

Number of images loaded: 4204


In [4]:
#Get train dataframe
train_df_cols = ["User_Handle","URL","R_Mean", "R_STD", "R_MED", "G_Mean", "G_STD", 
                "G_MED", "B_Mean", "B_STD", "B_MED", "Canny", "ORB_X", "ORB_Y"]

train_df = obj_train.convert_to_features(train_df_cols)
display(train_df.head())

Unnamed: 0,User_Handle,URL,R_Mean,R_STD,R_MED,G_Mean,G_STD,G_MED,B_Mean,B_STD,B_MED,Canny,ORB_X,ORB_Y
0,ana_brandine,26372498_389847204770398_6966953664447512576_n...,124.28693364197532,54.982721115324914,129.0,110.94144135802468,64.97294688605628,103.0,116.41365123456792,52.208674194760576,112.0,11.173171296296296,58.24248780059813,223.83257664362588
1,ana_brandine,26863920_144401739575154_4418153808522117120_n...,217.50125578703705,21.809475460762837,221.0,226.17765432098764,19.62935641804588,229.0,221.7790027006173,17.751962291788974,225.0,10.415943287037036,109.63589888572696,126.32398951848349
2,ana_brandine,26321037_1877332548975256_4414982808397676544_...,187.1023487654321,78.30435860920738,240.0,177.43719598765432,99.80029637985992,253.0,213.90927469135804,56.55086173846352,253.0,16.77451388888889,81.69721168136597,163.06849195480342
3,ana_brandine,26867951_1602223876551766_4978939201505460224_...,141.8696871570967,72.02035276135676,164.0,141.8696871570967,72.02035276135676,164.0,141.8696871570967,72.02035276135676,164.0,8.377151418420668,135.22561952797,123.46301435275092
4,ana_brandine,26864593_177802629495404_5286553074301665280_n...,125.94650634430728,49.70044722234354,129.0,112.06623113854596,48.545398914361904,111.0,111.26442386831276,45.41810073747563,108.0,3.833526234567901,165.4252676010132,113.71512344360352


In [5]:
#Define cluster count for users and images 
k = 5

In [6]:
#Fit Model
train_df, model_images = obj_train.model_images_fit(train_df, k, ["User_Handle","URL"], 9001)
display(train_df.head())

Unnamed: 0,User_Handle,URL,R_Mean,R_STD,R_MED,G_Mean,G_STD,G_MED,B_Mean,B_STD,B_MED,Canny,ORB_X,ORB_Y,Prob_0,Prob_1,Prob_2,Prob_3,Prob_4,Prediction
0,ana_brandine,26372498_389847204770398_6966953664447512576_n...,124.28693364197532,54.982721115324914,129.0,110.94144135802468,64.97294688605628,103.0,116.41365123456792,52.208674194760576,112.0,11.173171296296296,58.24248780059813,223.83257664362588,0.0001768754,0.4455754,0.4874286,2.504207e-25,0.06681919,2
1,ana_brandine,26863920_144401739575154_4418153808522117120_n...,217.50125578703705,21.809475460762837,221.0,226.17765432098764,19.62935641804588,229.0,221.7790027006173,17.751962291788974,225.0,10.415943287037036,109.63589888572696,126.32398951848349,1.792537e-29,1.47862e-10,2.181585e-33,1.0,1.430933e-08,3
2,ana_brandine,26321037_1877332548975256_4414982808397676544_...,187.1023487654321,78.30435860920738,240.0,177.43719598765432,99.80029637985992,253.0,213.90927469135804,56.55086173846352,253.0,16.77451388888889,81.69721168136597,163.06849195480342,7.135394e-226,5.460699e-08,3.0544630000000002e-27,4.3785240000000005e-55,0.9999999,4
3,ana_brandine,26867951_1602223876551766_4978939201505460224_...,141.8696871570967,72.02035276135676,164.0,141.8696871570967,72.02035276135676,164.0,141.8696871570967,72.02035276135676,164.0,8.377151418420668,135.22561952797,123.46301435275092,6.596532e-07,0.02628881,4.374024e-07,0.9736655,4.456139e-05,3
4,ana_brandine,26864593_177802629495404_5286553074301665280_n...,125.94650634430728,49.70044722234354,129.0,112.06623113854596,48.545398914361904,111.0,111.26442386831276,45.41810073747563,108.0,3.833526234567901,165.4252676010132,113.71512344360352,0.9064867,0.01260213,0.004301548,0.07642772,0.000181896,0


In [8]:
train_df.to_csv("/Users/kmotwani/Desktop/Me/Education/Courses/Capstone Project/ImageDF.csv")

In [None]:
#Save Clusters to Local Directory
obj_train.save_clusters(train_df, "Users")

In [9]:
#Get each user's presence in each cluster 
presence_df = obj_train.get_cluster_presence(train_df, k)
display(presence_df.head())

Unnamed: 0,Cluster_0,Cluster_1,Cluster_2,Cluster_3,Cluster_4,User_Handle
0,0.1824389,0.28706,0.172939,0.2372551,0.120307,ana_brandine
1,9.43959e-06,0.351061,0.648356,3.381998e-29,0.000574,vicky_regouli
2,0.3083974,0.168561,0.252934,0.0518019,0.218305,luismiguelpss
3,0.1230662,0.001473,0.859725,2.866358e-16,0.015735,ilariabiagini
4,4.836457e-09,0.706811,0.280734,9.070713e-35,0.012455,emnegg


In [10]:
#Fit Presence Model
presence_df, model_users = obj_train.model_users_fit(presence_df, k, ["User_Handle"], 9001)
display(presence_df.head())

Unnamed: 0,Cluster_0,Cluster_1,Cluster_2,Cluster_3,Cluster_4,User_Handle,Prediction
0,0.1824389,0.28706,0.172939,0.2372551,0.120307,ana_brandine,2
1,9.43959e-06,0.351061,0.648356,3.381998e-29,0.000574,vicky_regouli,4
2,0.3083974,0.168561,0.252934,0.0518019,0.218305,luismiguelpss,2
3,0.1230662,0.001473,0.859725,2.866358e-16,0.015735,ilariabiagini,4
4,4.836457e-09,0.706811,0.280734,9.070713e-35,0.012455,emnegg,1


In [12]:
presence_df.to_csv("/Users/kmotwani/Desktop/Me/Education/Courses/Capstone Project/PresenceDF.csv")

In [13]:
#Save Model
obj_train.save_model(model_images, "/Users/kmotwani/Dropbox/Harvard/Capstone_EmpSirenuse/model_images.plk")
obj_train.save_model(model_users, "/Users/kmotwani/Dropbox/Harvard/Capstone_EmpSirenuse/model_users.plk")

Model Saved.
Model Saved.


In [14]:
import ClusteringTest

In [15]:
#Define target path and create test object
target_path = "/Users/kmotwani/Dropbox/Harvard/Capstone_EmpSirenuse/Images/Test/"

obj_test = ClusteringTest.Ranking(target_path)

In [16]:
#Get Target Images
obj_test.get_images_target()

Number of images loaded: 10


In [17]:
#Get target dataframe
target_df_cols = ["User_Handle","URL","R_Mean", "R_STD", "R_MED", "G_Mean", "G_STD", 
                "G_MED", "B_Mean", "B_STD", "B_MED", "Canny", "ORB_X", "ORB_Y"]
target_df = obj_test.convert_to_features(target_df_cols)

In [18]:
#Load Model
model_images = obj_test.load_model("/Users/kmotwani/Dropbox/Harvard/Capstone_EmpSirenuse/model_images.plk")
model_users = obj_test.load_model("/Users/kmotwani/Dropbox/Harvard/Capstone_EmpSirenuse/model_users.plk")

Model loaded.
Model loaded.


In [19]:
#Get Prediction Dataframe
#Probability columns are labelled "<name> (<index>)" from the names passed here, so k must not exceed their count
target_df = obj_test.predict(target_df, model_images, k, ["1","2","3","4","5"], ["User_Handle","URL"])
display(target_df.head())

Unnamed: 0,User_Handle,URL,R_Mean,R_STD,R_MED,G_Mean,G_STD,G_MED,B_Mean,B_STD,B_MED,Canny,ORB_X,ORB_Y,1 (0),2 (1),3 (2),4 (3),5 (4),Prediction
0,Input/Target,1172274_162179640810532_621976427_n.jpg,159.61961158435273,102.3060750415978,217.0,131.6118496691072,89.23390943789735,173.0,97.3636791290096,71.19959468838788,113.0,24.103654027446133,214.3498250451108,122.67581192456156,2.117123e-28,0.171226,6.43401e-09,5.181239e-06,0.828769,4
1,Input/Target,1168618_1019574711417428_341248175_n.jpg,173.1028212945591,66.4166436304917,175.0,170.8887734521576,70.64238352796758,171.0,166.7540712945591,75.53735142901485,159.0,58.18174249530957,130.0917470871172,81.74434149885178,0.9391978,0.059649,0.001124692,2.222381e-10,2.8e-05,0
2,Input/Target,10296630_1212452475434819_2130836158_n.jpg,139.7794558213978,59.67489351378914,152.0,151.58723449161337,53.16628069270577,164.0,160.48265511318002,47.47821543673484,166.0,19.93278380010436,153.01452244165156,161.17794589211232,5.831663e-07,0.054752,9.299633e-05,0.9415012,0.003654,3
3,Input/Target,1169155_1017226601649759_282002397_n.jpg,192.29888887885983,47.22027688245933,218.0,183.80816687577288,54.15898470105433,217.0,178.0515394127576,58.14286471163767,214.0,16.357526469234312,107.22949209407884,132.57832536510958,1.053317e-11,0.01008,1.194264e-10,0.9898964,2.4e-05,3
4,Input/Target,1390240_1642982635970351_50212024_n.jpg,155.77006768718903,38.78898824961655,164.0,155.77006768718903,38.78898824961655,164.0,155.77006768718903,38.78898824961655,164.0,8.292014514208969,213.23410027461816,105.24644761665088,0.0002068892,0.000413,1.823587e-08,0.9991556,0.000225,3


In [20]:
#Get Distance Dictionary
dist_dict = obj_test.get_result(target_df, k, model_users)
print("Distance to Cluster Dictionary:\n\n", dist_dict)

Distance to Cluster Dictionary:

 {'1172274_162179640810532_621976427_n.jpg': [0.18328240980559246, 1.004102026059796, 0.8093133975647372, 1.3099855958692677, 1.1442551071557596], '1168618_1019574711417428_341248175_n.jpg': [1.3137194445089824, 1.175285199767597, 0.7973639305352125, 1.3731698461725224, 1.1600608991740877], '10296630_1212452475434819_2130836158_n.jpg': [1.2857792418076797, 1.2151779799420146, 0.9695417789519185, 0.08018536937471418, 1.2325195843589591], '1169155_1017226601649759_282002397_n.jpg': [1.3222699759433139, 1.2808494147864344, 1.0251940110432503, 0.014251772302872622, 1.272930434920745], '1390240_1642982635970351_50212024_n.jpg': [1.3289063831793841, 1.2940703604063437, 1.036156539796962, 0.0009580981552399447, 1.280976394118665], '928206_1544976622463081_2106404359_n.jpg': [0.6598689631566814, 0.5133444021304655, 0.4506096748825946, 1.1728586408150223, 0.8868144104607348], '928197_1004690526279413_524829661_n.jpg': [1.3260290358915126, 1.2911982268270747, 1.0

In [21]:
#For each target image, report the position of its smallest distance (first one on ties)
nearest_cluster = []
for file_name, distances in dist_dict.items():
    distances = list(distances)
    nearest_cluster.append((file_name, distances.index(min(distances))))
print(nearest_cluster)

[('1172274_162179640810532_621976427_n.jpg', 0), ('1168618_1019574711417428_341248175_n.jpg', 2), ('10296630_1212452475434819_2130836158_n.jpg', 3), ('1169155_1017226601649759_282002397_n.jpg', 3), ('1390240_1642982635970351_50212024_n.jpg', 3), ('928206_1544976622463081_2106404359_n.jpg', 2), ('928197_1004690526279413_524829661_n.jpg', 3), ('1170193_779361642192757_292653038_n.jpg', 1), ('1941288_777364669073740_2047165296_n.jpg', 1), ('1516202_199562453718582_1688658549_n.jpg', 3)]
