# KD-Tree

In [None]:
from scipy import spatial
import numpy as np

In [None]:
spatial.KDTree

In [None]:
x, y = np.mgrid[0:5, 2:8]

In [None]:
x

In [None]:
y

In [None]:
# zip() is lazy in Python 3, so materialise it to print the actual (x, y)
# coordinate pairs rather than the repr of a zip object.
print(list(zip(x.ravel(), y.ravel())))

In [None]:
x.ravel()

In [None]:
np.sqrt(np.square(0.1) + np.square(0.1) ) 

In [None]:
tree = spatial.KDTree(list(zip(x.ravel(), y.ravel())))
pts = np.array([[0, 0], [2.1, 2.9]])
tree.query(pts)

# Calculating Euclidean Distance

### reading csv from one hot encoded csv dataset

In [None]:
import pandas as pd
from scipy.spatial.distance import pdist, squareform

test_df = pd.read_csv("../herbarium-berlin-oneh.csv.gz",compression='gzip', encoding='utf-8', sep='|')

In [None]:
dist = pdist(test_df, 'euclidean')
# dist_df = pd.DataFrame(squareform(dist))

In [None]:
from sklearn.neighbors import DistanceMetric

dist = DistanceMetric.get_metric('euclidean')

X = [[0, 1, 2],[3, 4, 5]]
dist.pairwise(X)

In [None]:
test_df.values

In [None]:
dist.pairwise(test_df.values)

### reading csv from clean csv dataset

In [None]:
import pandas as pd

# Load the tab-separated clean dataset and discard its first column.
item_df_dr_0 = pd.read_csv("../herbarium-berlin-clean.csv", sep = '\t', index_col=False)
item_df_dr_0 = item_df_dr_0.drop(item_df_dr_0.columns[0], axis=1)

# Keep the seven taxonomy columns plus the two coordinate columns, then
# one-hot encode the taxonomy columns (each dummy column is prefixed with the
# name of the column it came from); the coordinate columns are not listed in
# `prefix`.
cal_btc_df = item_df_dr_0[["kingdom","phylum","class","order","family","genus","species","decimalLatitude","decimalLongitude"]]
oneh_cal_btc_df = pd.get_dummies(cal_btc_df,prefix=['kingdom','phylum','class','order','family','genus','species'])
oneh_cal_btc_df

In [None]:
item_df_dr_0

In [None]:
oneh_cal_btc_df.shape

In [None]:
from sklearn.neighbors import DistanceMetric

dist = DistanceMetric.get_metric('euclidean')
dist.pairwise(oneh_cal_btc_df.values)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy import sparse

sparse_df = oneh_cal_btc_df.to_sparse(fill_value=0)
X = sparse.csr_matrix(sparse_df.to_coo())

res = euclidean_distances(X, X)

In [None]:
import numpy as np

X = oneh_cal_btc_df.to_sparse(fill_value=0)

In [None]:
X.density

In [None]:
X

In [None]:
sparse_df

### dot product method derived from the relation between euclidean distance and dot product

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy import sparse

sparse_df = oneh_cal_btc_df.to_sparse(fill_value=0)
X = sparse.csr_matrix(sparse_df.to_coo())

In [None]:
type(X)

In [None]:
X.shape

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)
X_norm = scaler.fit_transform(X.T)

In [None]:
X_norm

In [None]:
X_norm.dot(X_norm[0,:].T).toarray().flatten().argsort()

In [None]:
similarities = X_norm.T.dot(X_norm)

In [None]:
neighbors = similarities.argsort(axis=1)[:,-10:-1]

In [None]:
X[0]

In [None]:
X[0,:]

In [None]:
#X.dot(X[0,:].T).toarray().flatten().argsort()[1:5]
res0 = X.dot(X[0,:].T).toarray().flatten().argsort()
res0

In [None]:
np.where(res0==0)

In [None]:
res1 = X.dot(X[1,:].T).toarray().flatten().argsort()

In [None]:
np.where(res1==1)

In [None]:
res[173283]

In [None]:
item_df_dr_0.iloc[0]

In [None]:
item_df_dr_0.iloc[88]

In [None]:
from sklearn.preprocessing import StandardScaler

def sparse_similarity(X, top_k=5):
    """Return, for every row of ``X``, the indices of its most similar rows.

    Similarity is the raw dot product between two rows, so a larger score
    means a closer match.  Rows are ranked from highest to lowest score and
    the row's own index is removed explicitly, because without normalisation
    a row is not guaranteed to score highest against itself.

    Parameters
    ----------
    X : scipy.sparse matrix
        Matrix with one item per row; it must support ``dot``, row slicing
        and ``.T`` (for example a CSR matrix).
    top_k : int, default 5
        Maximum number of neighbours returned per row.

    Returns
    -------
    list of numpy.ndarray
        One index array per row of ``X``, ordered from most to least
        similar, never containing the row's own index.
    """
    neighbours = []
    for i in range(X.shape[0]):
        # Score one row at a time so the full row-by-row similarity matrix
        # never has to be held in memory.
        scores = X.dot(X[i, :].T).toarray().flatten()
        # Negate (as float, so unsigned/bool data cannot wrap around) to get
        # a descending ranking; the stable sort keeps ties in index order.
        ranked = (-scores.astype(float)).argsort(kind="stable")
        neighbours.append(ranked[ranked != i][:top_k])
    return neighbours

In [None]:
smlrt = sparse_similarity(X)

In [None]:
','.join(str(e) for e in X.dot(X[1,:].T).toarray().flatten().argsort()[1:20])

In [None]:
range(X.shape[0])

In [None]:
# Append one comma-separated line per item to the result file, listing the
# positions of its 19 most similar items (highest dot product first, the item
# itself excluded).
with open("../similar-result.txt","a") as hs:

    for i in range(X.shape[0]):

        scores = X.dot(X[i,:].T).toarray().flatten()
        # A larger dot product means a closer match, so rank in descending
        # order; the stable sort keeps ties in index order.
        ranked = (-scores.astype(float)).argsort(kind="stable")
        hs.write(','.join(str(e) for e in ranked[ranked != i][:19]) + "\n")

### Test Normalization for dot product

In [None]:
import numpy as np

x = np.random.randn(100) * 5 + 2 

In [None]:
x

In [None]:
from sklearn.preprocessing import normalize

x_norm = normalize([x])

In [None]:
x_norm[0]

In [None]:
x_norm[0] @ x_norm[0].T 

In [None]:
big_x_norm = normalize(X)

In [None]:
big_x_norm

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy import sparse
from sklearn.preprocessing import normalize

sparse_df = test_df.to_sparse(fill_value=0)
X = sparse.csr_matrix(sparse_df.to_coo())

x_norm = normalize(X)

In [None]:
x_norm.shape[0]

In [None]:
# For every row of x_norm, print the 20 row positions with the highest dot
# product against it, as one comma-separated line per row.
for i in range(x_norm.shape[0]):
    
        # argsort() is ascending, so the ranking is reversed before slicing
        # to put the highest scores first.
        print(','.join(str(e) for e in x_norm.dot(x_norm[i,:].T).toarray().flatten().argsort()[::-1][0:20]))

In [None]:
item_df_dr_0.iloc[0]

In [None]:
item_df_dr_0.iloc[4]

In [None]:
item_df_dr_0.iloc[2]

### calculating only on the taxonomy

In [None]:
tax_oneh_cal_btc_df = oneh_cal_btc_df.drop(columns=['decimalLatitude', 'decimalLongitude'])
tax_oneh_cal_btc_df

In [None]:
sparse_df = tax_oneh_cal_btc_df.to_sparse(fill_value=0)
X = sparse.csr_matrix(sparse_df.to_coo())

scaler = StandardScaler(with_mean=False)
X_norm = scaler.fit_transform(X.T)

In [None]:
similarities = X_norm.T.dot(X_norm)

In [None]:
X_norm

### chunk calculating euclidean distance

In [None]:
from numpy import linalg as LA
import numpy as np

In [None]:
test_df = oneh_cal_btc_df.iloc[:10]

test_df

In [None]:
oneh_cal_btc_df.iloc[0].values - oneh_cal_btc_df.values

In [None]:
LA.norm(oneh_cal_btc_df.iloc[0].values - oneh_cal_btc_df.values, axis=1)

In [None]:
def calculate_ed(val_array, main_df):
    """Return the positions of the rows of ``main_df`` closest to ``val_array``.

    Closeness is the Euclidean distance between ``val_array`` and each row.

    Parameters
    ----------
    val_array : array-like
        Feature vector with one value per column of ``main_df``.
    main_df : pandas.DataFrame
        Frame whose rows are compared against ``val_array``.

    Returns
    -------
    numpy.ndarray
        Up to 20 positional row indices, ordered from the smallest distance
        to the largest; equal distances keep their original row order.
    """
    rows = np.asarray(main_df.values, dtype=float)
    target = np.asarray(val_array, dtype=float)
    # Broadcasting subtracts the target from every row at once, replacing the
    # per-row loop that grew the result array with np.append.
    distances = LA.norm(rows - target, axis=1)
    return np.argsort(distances, kind="stable")[:20]

In [None]:
# Collect, for every row of the working frame, the row positions that
# calculate_ed ranks as closest to it.
similar_item_array = []
#used_df = oneh_cal_btc_df
# Work on the small test frame; the line above switches to the full dataset.
used_df = test_df

for index, row in used_df.iterrows():
    #print("row : " + str(index), end='\r')
    # Rank the rows of used_df against the current row.
    res = calculate_ed(row.values, used_df)
    similar_item_array.append(res)
    
    
# Stack the per-row results into a single array for inspection.
end_result = np.array(similar_item_array)
#df1 = pd.DataFrame(end_result)
#df1.to_csv("../result-similarity.csv", encoding='utf-8', header=False, index=False)
end_result

In [None]:
similar_item_array

In [None]:
end_result.shape

In [None]:
df1 = pd.DataFrame(end_result)
df1.to_csv("../test-similarity.csv", encoding='utf-8', header=False, index=False)

In [None]:
# Same ranking as computed in memory, but written straight to disk: one
# comma-separated line of row positions per row of the working frame.
used_df = test_df

# Opened in append mode, so running this cell again adds lines to the file
# rather than replacing it.
with open("../test-similar-result.csv","a") as hs:

    for index, row in used_df.iterrows():
        #print("row : " + str(index), end='\r')
        # Rank the rows of used_df against the current row.
        res = calculate_ed(row.values, used_df)
        #similar_item_array.append(res)
        
        hs.write(','.join(str(e) for e in res) + "\n")

In [None]:
test = pd.read_csv("../caled-similar-result.csv", header=None, index_col=False)
test

In [None]:
import pandas as pd
from numpy import linalg as LA
import numpy as np


# reading the dataset
# Load the tab-separated clean dataset and discard its first column.
item_df_dr_0 = pd.read_csv("../herbarium-berlin-clean.csv", sep = '\t', index_col=False)
item_df_dr_0 = item_df_dr_0.drop(item_df_dr_0.columns[0], axis=1)

# Keep the seven taxonomy columns plus the two coordinate columns, then
# one-hot encode the taxonomy columns (each dummy column is prefixed with the
# name of the column it came from).
cal_btc_df = item_df_dr_0[["kingdom","phylum","class","order","family","genus","species","decimalLatitude","decimalLongitude"]]
oneh_cal_btc_df = pd.get_dummies(cal_btc_df,prefix=['kingdom','phylum','class','order','family','genus','species'])


# method to calculate the euclidean distance for each row
def calculate_ed(val_array, main_df):
    """Return the positions of the rows of ``main_df`` closest to ``val_array``.

    Closeness is the Euclidean distance between ``val_array`` and each row.

    Parameters
    ----------
    val_array : array-like
        Feature vector with one value per column of ``main_df``.
    main_df : pandas.DataFrame
        Frame whose rows are compared against ``val_array``.

    Returns
    -------
    numpy.ndarray
        Up to 21 positional row indices, ordered from the smallest distance
        to the largest; equal distances keep their original row order.
    """
    rows = np.asarray(main_df.values, dtype=float)
    target = np.asarray(val_array, dtype=float)
    # Broadcasting subtracts the target from every row at once, replacing the
    # per-row .iloc lookup and the np.append that re-allocated the result on
    # every iteration.
    distances = LA.norm(rows - target, axis=1)
    return np.argsort(distances, kind="stable")[:21]

# test dataset: the first ten rows of the one-hot encoded frame
test_df = oneh_cal_btc_df.iloc[:10]

# loop through each dataset row
similar_item_array = []
# Point used_df at oneh_cal_btc_df instead to process the full dataset.
used_df = test_df

# Rank every row of used_df against all rows of used_df and append the
# row positions returned by calculate_ed as one comma-separated line per row.
with open("../caled-similar-result.csv","a") as hs:

    for i in range(used_df.shape[0]):
        # Read the query row from used_df itself, so the positions always
        # refer to the frame being ranked, whichever subset used_df holds.
        res = calculate_ed(used_df.iloc[i].values, used_df)
        hs.write(','.join(str(e) for e in res) + "\n")

In [None]:
oneh_cal_btc_df.iloc[0].values