# KD-Tree

In [None]:
from scipy import spatial
import numpy as np

In [None]:
spatial.KDTree

In [None]:
x, y = np.mgrid[0:5, 2:8]

In [None]:
x

In [None]:
y

In [None]:
# Pair every x coordinate with the matching y coordinate. zip() is lazy on
# Python 3, so it is materialised into a list; leaving the list as the cell's
# last expression displays the actual (x, y) pairs.
list(zip(x.ravel(), y.ravel()))

In [None]:
x.ravel()

In [None]:
np.sqrt(np.square(0.1) + np.square(0.1) ) 

In [None]:
# Flatten the coordinate grids into a list of (x, y) points and index them
# with a KD-tree.
grid_points = list(zip(x.ravel(), y.ravel()))
tree = spatial.KDTree(grid_points)

# Two probe points; the query result is displayed as the cell output.
pts = np.array([[0, 0], [2.1, 2.9]])
tree.query(pts)

# Calculating Euclidean Distance

### Reading the one-hot encoded CSV dataset

In [None]:
import pandas as pd
from scipy.spatial.distance import pdist, squareform

test_df = pd.read_csv("../herbarium-berlin-oneh.csv.gz",compression='gzip', encoding='utf-8', sep='|')

In [None]:
dist = pdist(test_df, 'euclidean')
# dist_df = pd.DataFrame(squareform(dist))

In [None]:
from sklearn.neighbors import DistanceMetric

dist = DistanceMetric.get_metric('euclidean')

X = [[0, 1, 2],[3, 4, 5]]
dist.pairwise(X)

In [None]:
test_df.values

In [None]:
dist.pairwise(test_df.values)

### Reading the clean CSV dataset

In [1]:
import pandas as pd

# Columns that are one-hot encoded, and the numeric coordinates kept as-is.
taxonomy_cols = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
coordinate_cols = ["decimalLatitude", "decimalLongitude"]

# Load the tab-separated clean dataset, then discard its first column.
raw_item_df = pd.read_csv("../herbarium-berlin-clean.csv", sep='\t', index_col=False)
item_df_dr_0 = raw_item_df.drop(raw_item_df.columns[0], axis=1)

# Keep taxonomy + coordinates and expand each taxonomy column into dummies,
# using the column name itself as the dummy prefix.
cal_btc_df = item_df_dr_0[taxonomy_cols + coordinate_cols]
oneh_cal_btc_df = pd.get_dummies(cal_btc_df, prefix=taxonomy_cols)
oneh_cal_btc_df

  return f(*args, **kwds)
  return f(*args, **kwds)


Unnamed: 0,decimalLatitude,decimalLongitude,kingdom_Chromista,kingdom_Fungi,kingdom_Plantae,phylum_Ascomycota,phylum_Bryophyta,phylum_Charophyta,phylum_Chlorophyta,phylum_Marchantiophyta,...,species_Ziziphus lotus,species_Ziziphus mauritiana,species_Ziziphus spina-christi,species_Zornia contorta,species_Zosima absinthifolia,species_Zoysia matrella,species_Zuvanda meyeri,species_Zygophyllum fabago,species_Zygophyllum macropterum,species_Zygophyllum pterocarpum
0,43.000278,16.003611,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,43.000557,-3.003333,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,36.009998,15.007500,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,43.000557,-6.000278,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,36.009998,15.007222,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,43.000557,-3.003333,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,43.000557,-6.002222,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,-16.912500,-70.908058,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,51.425835,13.265833,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,52.406944,14.554723,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
item_df_dr_0

In [None]:
oneh_cal_btc_df.shape

In [None]:
from sklearn.neighbors import DistanceMetric

dist = DistanceMetric.get_metric('euclidean')
dist.pairwise(oneh_cal_btc_df.values)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy import sparse

sparse_df = oneh_cal_btc_df.to_sparse(fill_value=0)
X = sparse.csr_matrix(sparse_df.to_coo())

res = euclidean_distances(X, X)

In [None]:
import numpy as np

X = oneh_cal_btc_df.to_sparse(fill_value=0)

In [None]:
X.density

In [None]:
X

In [None]:
sparse_df

### Bießmann's method

In [2]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy import sparse

# Convert the one-hot frame into a SciPy CSR matrix via its COO form.
# The method is `to_coo()` (as in the other conversion cells); `to_co()` does
# not exist and raised AttributeError.
# NOTE(review): DataFrame.to_sparse is only available in older pandas releases
# (it was removed in pandas 1.0) - confirm the pinned pandas version.
sparse_df = oneh_cal_btc_df.to_sparse(fill_value=0)
X = sparse.csr_matrix(sparse_df.to_coo())

In [6]:
type(X)

scipy.sparse.csr.csr_matrix

In [7]:
X.shape

(184981, 13834)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)
X_norm = scaler.fit_transform(X.T)

In [None]:
X_norm

In [None]:
similarities = X_norm.T.dot(X_norm)

In [None]:
neighbors = similarities.argsort(axis=1)[:,-10:-1]

In [9]:
X[0]

<1x13834 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [12]:
X.dot(X[0,:].T).toarray().flatten().argsort()[1:5]

array([121810, 121818,  39900,    833])

In [16]:
from sklearn.preprocessing import StandardScaler

def sparse_similarity(X,top_k=5):
    """Return, for every row of ``X``, the indices of its most similar rows.

    The rows of ``X`` are rescaled with ``StandardScaler(with_mean=False)``
    fitted on ``X.T`` and the similarity between two rows is the dot product
    of their rescaled versions.

    Parameters
    ----------
    X : scipy.sparse matrix
        One sample per row. A sparse matrix is expected because the dot
        product result is converted with ``.toarray()``.
    top_k : int, default 5
        Size of the neighbourhood *including* the row itself. The row itself
        is removed from its own result, so every returned array holds at most
        ``top_k - 1`` indices (the same length the previous implementation
        returned).

    Returns
    -------
    list of numpy.ndarray
        ``result[i]`` holds row indices ordered from most to least similar to
        row ``i``.
    """
    scaler = StandardScaler(with_mean=False)
    # Make sure the matrix is CSR before it is sliced row by row in the loop.
    X_norm = scaler.fit_transform(X.T).T.tocsr()
    n_neighbors = max(top_k - 1, 0)

    neighbors = []
    for i in range(X_norm.shape[0]):
        scores = X_norm.dot(X_norm[i, :].T).toarray().ravel()
        # Sort on the negated scores so the HIGHEST similarity comes first
        # (a plain argsort is ascending and returned the least similar rows).
        # The stable sort breaks ties by the lowest row index.
        ranked = (-scores).argsort(kind="stable")
        # Drop the row itself explicitly: with duplicated rows it is not
        # guaranteed to be ranked first.
        neighbors.append(ranked[ranked != i][:n_neighbors])

    return neighbors

In [None]:
smlrt = sparse_similarity(X)

### Calculating similarity on the taxonomy columns only

In [None]:
tax_oneh_cal_btc_df = oneh_cal_btc_df.drop(columns=['decimalLatitude', 'decimalLongitude'])
tax_oneh_cal_btc_df

In [None]:
sparse_df = tax_oneh_cal_btc_df.to_sparse(fill_value=0)
X = sparse.csr_matrix(sparse_df.to_coo())

scaler = StandardScaler(with_mean=False)
X_norm = scaler.fit_transform(X.T)

In [None]:
similarities = X_norm.T.dot(X_norm)

In [None]:
X_norm

### Calculating Euclidean distance in chunks

In [2]:
from numpy import linalg as LA
import numpy as np

In [None]:
test_df = oneh_cal_btc_df.iloc[:10]

test_df

In [3]:
def calculate_ed(val_array, main_df, top_n=20, chunk_size=1024):
    """Return the positions of the rows of ``main_df`` closest to ``val_array``.

    Closeness is the Euclidean distance between ``val_array`` and each row.

    Parameters
    ----------
    val_array : array-like
        Reference vector with one value per column of ``main_df``.
    main_df : pandas.DataFrame
        Rows to compare against the reference vector.
    top_n : int, default 20
        Maximum number of row positions to return.
    chunk_size : int, default 1024
        Number of rows converted to a float array at a time; bounds the
        temporary memory used on wide frames.

    Returns
    -------
    numpy.ndarray
        Positional row indices ordered by increasing distance (ties keep the
        lower position first); empty when ``main_df`` has no rows.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Work in float: subtracting unsigned-integer dummies wraps around
    # (0 - 1 == 255) and boolean dummies do not support subtraction at all.
    reference = np.asarray(val_array, dtype=float)

    chunk_distances = []
    for start in range(0, len(main_df), chunk_size):
        chunk = np.asarray(main_df.iloc[start:start + chunk_size].values, dtype=float)
        # One vectorised norm per chunk replaces the per-row iterrows loop
        # and the quadratic np.append accumulation.
        chunk_distances.append(LA.norm(chunk - reference, axis=1))

    sim_res = np.concatenate(chunk_distances) if chunk_distances else np.array([])

    return np.argsort(sim_res, kind="stable")[:top_n]

In [None]:
# Rank the nearest rows for every item of the one-hot frame, one
# calculate_ed() call per row, collecting the index arrays in row order.
used_df = oneh_cal_btc_df
similar_item_array = [
    calculate_ed(item_row.values, used_df)
    for row_label, item_row in used_df.iterrows()
]

# Stack the per-item results into one array and write it out without
# header or index.
end_result = np.array(similar_item_array)
df1 = pd.DataFrame(end_result)
df1.to_csv("../result-similarity.csv", encoding='utf-8', header=False, index=False)

In [None]:
end_result.shape

In [None]:
df1 = pd.DataFrame(end_result)
df1.to_csv("../test-similarity.csv", encoding='utf-8', header=False, index=False)