In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from datetime import datetime

In [3]:
# Read the feature table from CSV and report how long the read took.
# header=None: the file has no header row, so columns get integer labels.
time_start = datetime.now()

vgg19_model_fc1_features = pd.read_csv(
    'vgg19_model_fc1_features.csv',
    sep=',',
    header=None,
)

elapsed = datetime.now() - time_start
print('This step took time:', elapsed)

This step took time: 0:00:52.566453


In [4]:
# Preview the first five rows of the feature table.
vgg19_model_fc1_features.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,3.094828,2.550586,1.647557,0.0,0.0,1.284944,4.473051,0.0,0.0,0.0,...,0.0,2.053498,2.249888,0.0,1.680549,0.0,6.533272,0.557587,2.23029,1.856253
1,4.302448,3.643159,2.365661,0.0,0.0,1.405371,3.830365,0.0,0.0,0.0,...,0.0,0.0,1.833241,0.0,0.651541,0.877186,6.996201,0.0,1.502485,2.581533
2,1.310563,2.67615,0.0,0.0,0.0,3.034392,2.832518,0.0,0.0,0.0,...,0.107027,1.493688,4.27433,0.0,1.228995,0.0,4.333321,0.0,4.63923,0.888631
3,5.516655,0.0,2.521152,0.0,0.0,0.0,6.429861,0.0,2.919165,0.0,...,0.0,0.0,0.0,3.407895,0.0,0.0,5.7678,1.286931,0.0,4.303179
4,5.193856,1.0227,0.761596,0.0,0.0,2.82331,1.901202,0.0,0.0,0.0,...,0.0,0.0,2.264707,0.0,2.462982,0.0,3.978319,0.0,2.107209,0.0


In [5]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column (StandardScaler defaults: remove the mean,
# scale to unit variance) to produce the matrix that is clustered below.
feature_matrix = vgg19_model_fc1_features.values
X = StandardScaler().fit_transform(feature_matrix)

In [6]:
# Configure (but do not fit yet) a 20-cluster k-means estimator.
# random_state is fixed so the centroid initialisation is repeatable.
time_start = datetime.now()

kmeans = KMeans(
    n_clusters=20,
    max_iter=30000,
    random_state=200,
)

elapsed = datetime.now() - time_start
print('This step took time:', elapsed)

This step took time: 0:00:00.003557


In [7]:
# Fit the k-means model on the standardized features; this is the slow step
# of the notebook, so its wall-clock duration is printed.
time_start = datetime.now()

kmeans.fit(X)

elapsed = datetime.now() - time_start
print('This step took time:', elapsed)

This step took time: 0:14:25.072024


In [9]:
# Assign every row of X to its nearest fitted centroid:
# output[i] is the cluster label of row i.
time_start = datetime.now()

output = kmeans.predict(X)

elapsed = datetime.now() - time_start
print('This step took time:', elapsed)

This step took time: 0:00:07.284817


In [10]:
# Group row indices by cluster label.
# cluster_dict maps each cluster number to the list of indices i for which
# output[i] equals that cluster number.
labels = list(range(kmeans.n_clusters))

# Start every cluster with an empty list, so a cluster that receives no rows
# reports size 0 instead of holding None and breaking len() further down.
cluster_dict = {label: [] for label in labels}

for patch_index, cluster_label in enumerate(output):
    # int(): use plain Python ints to look up the plain-int dictionary keys
    cluster_dict[int(cluster_label)].append(patch_index)

# Number of rows assigned to each cluster, in cluster-number order
[len(members) for members in cluster_dict.values()]

[2396,
 906,
 1960,
 3530,
 1084,
 2992,
 2900,
 624,
 1547,
 680,
 2531,
 292,
 997,
 1954,
 1592,
 52,
 3722,
 3150,
 3370,
 3721]

In [39]:
# Load the ground-label table and drop the 'Unnamed: 0' column.
lfdp_ground_labels = pd.read_csv("LFDP_ground_labels.csv").drop(columns='Unnamed: 0')

# Keep only stems with SPECIES 'PREMON', ALIVE 'A' and DIAM of at least 20.
is_live_premon = lfdp_ground_labels['SPECIES'].eq('PREMON') & lfdp_ground_labels['ALIVE'].eq('A')
is_large_enough = lfdp_ground_labels['DIAM'].ge(20)

lfdp_palm = lfdp_ground_labels[is_live_premon & is_large_enough]
lfdp_palm

Unnamed: 0,Unnamed: 0.1,STEM.TAG,QUAD,SUBQUAD,X,Y,SPECIES,CENSUS,DIAM,ALIVE,coords.x1,coords.x2,pix_1,pix_2
765,484772,6518.0,108,32,11.52,145.91,PREMON,6,20.0,A,265155.806401,254323.465978,8489,11719
2314,486384,17926.0,123,33,14.08,452.41,PREMON,6,20.7,A,265159.451067,254629.114280,8629,2547
3909,488055,171526.0,214,42,35.19,269.20,PREMON,6,20.2,A,265179.882966,254446.367808,9224,8033
4259,488423,14337.0,217,31,30.54,321.37,PREMON,6,20.0,A,265175.424270,254498.403076,9095,6471
4989,489173,17942.0,223,23,26.53,450.48,PREMON,6,20.8,A,265171.877860,254627.164549,9002,2606
6047,490268,6579.0,308,42,55.78,149.65,PREMON,6,20.9,A,265200.021532,254327.106555,9816,11614
6118,490341,7597.0,309,34,50.68,177.72,PREMON,6,21.0,A,265195.027876,254355.109293,9669,10773
7095,491364,16366.0,320,32,51.90,386.27,PREMON,6,20.0,A,265196.986595,254563.080827,9749,4532
8369,492676,5610.0,407,14,63.57,136.94,PREMON,6,20.2,A,265207.756179,254314.415979,10047,11996
8737,493062,9507.0,411,34,74.60,217.18,PREMON,6,21.0,A,265219.056529,254394.412156,10394,9596


In [18]:
from collections import defaultdict

# For each cluster that contains at least one selected palm, compute
#   (number of palms whose patch belongs to the cluster) / (cluster size).
PATCH_SIZE_PX = 100     # pixel coordinates are floor-divided by this to get patch grid coordinates
PATCHES_PER_ROW = 200   # stride used to flatten (row, column) into an index into `output`

# pix_2 selects the row of the patch grid and pix_1 the column.
patch_rows = lfdp_palm['pix_2'].to_numpy() // PATCH_SIZE_PX
patch_cols = lfdp_palm['pix_1'].to_numpy() // PATCH_SIZE_PX
palm_cluster_labels = output[patch_rows * PATCHES_PER_ROW + patch_cols]

# Count palms per cluster, in the order the palms appear in lfdp_palm.
palm_counts = defaultdict(int)
for cluster_i in palm_cluster_labels:
    palm_counts[cluster_i] += 1

# Build the ratios as a separate mapping so the raw counts are not overwritten.
palm_relevance = defaultdict(
    int,
    {
        cluster_i: count / len(cluster_dict[cluster_i])
        for cluster_i, count in palm_counts.items()
    },
)

palm_relevance

defaultdict(int,
            {10: 0.0007902015013828526,
             16: 0.0021493820526598604,
             14: 0.005653266331658292,
             6: 0.001379310344827586,
             5: 0.002005347593582888,
             12: 0.0010030090270812437,
             4: 0.0018450184501845018,
             18: 0.0008902077151335311,
             1: 0.006622516556291391,
             2: 0.0020408163265306124,
             11: 0.0136986301369863,
             19: 0.0018812147272238647,
             17: 0.0012698412698412698,
             3: 0.0022662889518413596,
             0: 0.0012520868113522537,
             8: 0.0019392372333548805})

In [31]:
# List the pix_2 value of every selected palm.
lfdp_palm.loc[:, 'pix_2'].tolist()

[11719,
 2547,
 8033,
 6471,
 2606,
 11614,
 10773,
 4532,
 11996,
 9596,
 9168,
 9034,
 9457,
 10119,
 9814,
 8889,
 3456,
 10220,
 10263,
 10700,
 10396,
 8895,
 7773,
 6424,
 5707,
 3119,
 10260,
 10073,
 9579,
 9370,
 7419,
 6765,
 6815,
 5361,
 15933,
 14616,
 9765,
 13075,
 14457,
 13087,
 12877,
 9632,
 9519,
 8803,
 8469,
 8302,
 7741,
 12472,
 9686,
 9960,
 9794,
 8862,
 8569,
 8033,
 8210,
 9772,
 9724,
 9632,
 9213,
 8163,
 8051,
 12161,
 10606,
 5461,
 5934,
 9980,
 6175,
 5789,
 5704,
 5471,
 5692,
 5080,
 4705,
 2934]

In [36]:
# Per-cluster palm statistics, stored as float arrays indexed by cluster label:
#   num_palm[c]     - number of selected palms whose patch is in cluster c
#   size_cluster[c] - number of entries of `output` assigned to cluster c
#   ratio_palm[c]   - num_palm[c] / (size_cluster[c] + 1)
PATCH_SIZE_PX = 100     # pixel coordinates are floor-divided by this to get patch grid coordinates
PATCHES_PER_ROW = 200   # stride used to flatten (row, column) into an index into `output`
n_clusters = kmeans.n_clusters

# Extract the pixel columns once and map every palm to its patch index.
patch_rows = lfdp_palm['pix_2'].to_numpy() // PATCH_SIZE_PX
patch_cols = lfdp_palm['pix_1'].to_numpy() // PATCH_SIZE_PX
palm_cluster_labels = output[patch_rows * PATCHES_PER_ROW + patch_cols].astype(int)

num_palm = np.bincount(palm_cluster_labels, minlength=n_clusters).astype(float)

## size of each cluster, counted over every entry of `output`
size_cluster = np.bincount(output, minlength=n_clusters).astype(float)

# The +1 keeps the denominator non-zero for an empty cluster, at the cost of
# slightly understating each ratio.
# NOTE(review): the palm_relevance cell divides by the exact cluster size;
# confirm which of the two definitions is intended.
ratio_palm = num_palm / (size_cluster + 1)

In [37]:
# Display the per-cluster ratio: palm count divided by (cluster size + 1),
# indexed by cluster label.
ratio_palm

array([0.00125156, 0.00661521, 0.00203978, 0.00226565, 0.00184332,
       0.00200468, 0.00137883, 0.        , 0.00193798, 0.        ,
       0.00078989, 0.01365188, 0.001002  , 0.        , 0.00564972,
       0.        , 0.0021488 , 0.00126944, 0.00088994, 0.00188071])

In [40]:
# Per-cluster palm statistics, stored as float arrays indexed by cluster label:
#   num_palm[c]     - number of selected palms whose patch is in cluster c
#   size_cluster[c] - number of entries of `output` assigned to cluster c
#   ratio_palm[c]   - num_palm[c] / (size_cluster[c] + 1)
PATCH_SIZE_PX = 100     # pixel coordinates are floor-divided by this to get patch grid coordinates
PATCHES_PER_ROW = 200   # stride used to flatten (row, column) into an index into `output`
n_clusters = kmeans.n_clusters

# Extract the pixel columns once and map every palm to its patch index.
patch_rows = lfdp_palm['pix_2'].to_numpy() // PATCH_SIZE_PX
patch_cols = lfdp_palm['pix_1'].to_numpy() // PATCH_SIZE_PX
palm_cluster_labels = output[patch_rows * PATCHES_PER_ROW + patch_cols].astype(int)

num_palm = np.bincount(palm_cluster_labels, minlength=n_clusters).astype(float)

## size of each cluster, counted over every entry of `output`
size_cluster = np.bincount(output, minlength=n_clusters).astype(float)

# The +1 keeps the denominator non-zero for an empty cluster, at the cost of
# slightly understating each ratio.
# NOTE(review): the palm_relevance cell divides by the exact cluster size;
# confirm which of the two definitions is intended.
ratio_palm = num_palm / (size_cluster + 1)

In [41]:
# Display the per-cluster ratio: palm count divided by (cluster size + 1),
# indexed by cluster label.
ratio_palm

array([0.00125156, 0.00661521, 0.00203978, 0.00226565, 0.00184332,
       0.00200468, 0.00137883, 0.        , 0.00193798, 0.        ,
       0.00078989, 0.01365188, 0.001002  , 0.        , 0.00564972,
       0.        , 0.0021488 , 0.00126944, 0.00088994, 0.00188071])