# Inspecting Dataset

In [None]:
import pandas as pd

test_df = pd.read_csv("../herbarium-berlin.csv", sep = '\t')

In [None]:
test_df.shape

In [None]:
print(len(test_df.columns))
list(test_df.columns)

In [None]:
test_df.iloc[:,0:3].head(10)

In [None]:
test_df.iloc[:,3:10].head(10)

In [None]:
test_df.iloc[:,10:16].head(10)

In [None]:
test_df.iloc[:,16:21].head(10)

In [None]:
test_df.iloc[:,21:31].head(10)

In [None]:
test_df.iloc[:,31:41].head(10)

In [None]:
test_df.iloc[:,41:46].head(10)

# Select Features for Item Similarity

In [None]:
# Identifier, taxonomy and location columns kept for the item-similarity work.
item_feature_cols = [
    'catalogNumber', 'occurrenceID', 'scientificName',
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
    'countryCode', 'locality', 'decimalLatitude', 'decimalLongitude',
]
item_df = test_df[item_feature_cols]

In [None]:
item_df.head(10)

# Number of unique values

In [None]:
# Print the row count of the raw table (shape[0]) rather than the whole
# (rows, columns) tuple, so the number matches the "total rows" label and can
# be compared directly with the per-column unique counts displayed below.
print("total rows ", test_df.shape[0])
item_df.nunique()

# Missing Values

### Percentage of missing values

In [None]:
item_df.isna().sum() * 100 / len(item_df)

In [None]:
item_df.isna().sum()

### missing values in taxonomy kingdom

In [None]:
item_df[item_df['kingdom'].isna()]

### inspecting missing values in dataset 

- when dropping every row that has a missing value in any of the selected features
  
  this shows how many data points would be dropped in total

In [None]:
# Keep only the rows in which every selected feature is present, then report
# the surviving row count and the kept / dropped share in percent.
item_df_dr_0 = item_df.dropna()
remain_dt_0 = 100 * len(item_df_dr_0) / len(item_df)
drp_dt_0 = 100 - remain_dt_0
for label, value in (
    ("remaining data size ", len(item_df_dr_0)),
    ("remaining data ", remain_dt_0),
    ("dropped data ", drp_dt_0),
):
    print(label, value)

In [None]:
type(item_df_dr_0)

In [None]:
item_df_dr_0.to_csv("../herbarium-berlin-clean.csv", sep='\t', encoding='utf-8')

- when dropping only the rows in which all of the taxonomy features are missing
  
  this shows the number of data points that have to be dropped altogether

In [None]:
taxonomy_cols = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# how='all': a row is removed only when every taxonomy rank is missing.
item_df_dr_1 = item_df.dropna(subset=taxonomy_cols, how='all')
remain_dt_1 = 100 * len(item_df_dr_1) / len(item_df)
drp_dt_1 = 100 - remain_dt_1
for label, value in (
    ("remaining data size ", len(item_df_dr_1)),
    ("remaining data ", remain_dt_1),
    ("dropped data ", drp_dt_1),
):
    print(label, value)

- when dropping rows with a missing value in any of the taxonomy features

  this shows the number of data points that have to be imputed, set to zero, or dropped altogether

In [None]:
# Default how='any': a row is removed as soon as one taxonomy rank is missing.
# Percentages are relative to item_df_dr_1, not to the full item_df.
rows_before = len(item_df_dr_1)
item_df_dr_2 = item_df_dr_1.dropna(
    subset=['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
)
rows_after = len(item_df_dr_2)
remain_dt_2 = rows_after * 100 / rows_before
drp_dt_2 = 100 - remain_dt_2
print("remaining data ", remain_dt_2, rows_after)
print("dropped data ", drp_dt_2, rows_before - rows_after)

- when dropping rows in which the country code, latitude and longitude are all missing

  this shows the number of data points that have to be imputed, set to zero, or dropped altogether

In [None]:
location_cols = ['countryCode', 'decimalLatitude', 'decimalLongitude']

# how='all': only rows lacking the country code and both coordinates are removed.
# Percentages are relative to item_df_dr_1.
n_with_taxonomy = len(item_df_dr_1)
item_df_dr_3 = item_df_dr_1.dropna(subset=location_cols, how='all')
n_with_location = len(item_df_dr_3)
remain_dt_3 = n_with_location * 100 / n_with_taxonomy
drp_dt_3 = 100 - remain_dt_3
print("remaining data ", remain_dt_3, n_with_location)
print("dropped data ", drp_dt_3, n_with_taxonomy - n_with_location)

    - checking whether the number is the same as when dropping rows on a missing country code alone

In [None]:
item_df_dr_3_check = item_df_dr_1.dropna(how='all', subset=['countryCode'])
len(item_df_dr_3_check)

    - checking rows without a country code that still have longitude and latitude values

In [None]:
item_df_dr_3[item_df_dr_3['countryCode'].isna()]

- when dropping the rows with missing longitude and latitude
  
  this shows the number of data points to be completed by geocoding after the last drop operation

In [None]:
len(item_df_dr_3[item_df_dr_3['decimalLatitude'].isna() & item_df_dr_3['decimalLongitude'].isna() & item_df_dr_3['countryCode'].notna()]) * 100 / len(item_df_dr_1)

In [None]:
len(item_df_dr_3[item_df_dr_3['decimalLatitude'].isna() & item_df_dr_3['decimalLongitude'].isna() & item_df_dr_3['countryCode'].notna()])

In [None]:
len(item_df_dr_3[item_df_dr_3['decimalLatitude'].isna()])

# Imputing missing values

### inspecting missing values in other column to find input column for imputation

In [None]:
# Among the rows that lack a phylum: percentage of missing values per column.
no_phylum = item_df_dr_1[item_df_dr_1['phylum'].isna()]
no_phylum.isna().sum() * 100 / len(no_phylum)

In [None]:
# Among the rows that lack a class: percentage of missing values per column.
no_class = item_df_dr_1[item_df_dr_1['class'].isna()]
no_class.isna().sum() * 100 / len(no_class)

In [None]:
# Among the rows that lack an order: percentage of missing values per column.
no_order = item_df_dr_1[item_df_dr_1['order'].isna()]
no_order.isna().sum() * 100 / len(no_order)

In [None]:
# Among the rows that lack a family: percentage of missing values per column.
no_family = item_df_dr_1[item_df_dr_1['family'].isna()]
no_family.isna().sum() * 100 / len(no_family)

In [None]:
# Rows that lack a genus: how many there are, then the percentage of missing
# values per column within that subset.
no_genus = item_df_dr_1[item_df_dr_1['genus'].isna()]
print(len(no_genus))
no_genus.isna().sum() * 100 / len(no_genus)

In [None]:
# Rows that lack a species: how many there are, then the percentage of missing
# values per column within that subset.
no_species = item_df_dr_1[item_df_dr_1['species'].isna()]
print(len(no_species))
no_species.isna().sum() * 100 / len(no_species)

In [None]:
# Rows of item_df_dr_1 that are absent from item_df_dr_3, i.e. the rows removed
# by the location dropna. item_df_dr_3 is a row subset of item_df_dr_1 and keeps
# its index labels, so membership is decided on the index.
# DataFrame.isin(DataFrame) compares cell values instead, which reports a kept
# row as "missing" whenever its catalogNumber is NaN (NaN never equals NaN),
# and it evaluates every column of the frame just to read one of them.
no_location = item_df_dr_1[~item_df_dr_1.index.isin(item_df_dr_3.index)]
print(len(no_location))
no_location.isna().sum() * 100 / len(no_location)

### genus and species seem to have enough input columns to support imputation

here is the number of data points in each column that can still be imputed

### Genus

test set

In [None]:
# Rows with no genus whose higher taxonomy ranks and country code are all present.
genus_input_cols = ['kingdom', 'phylum', 'class', 'order', 'family', 'countryCode']
genus_missing = item_df_dr_1[item_df_dr_1['genus'].isna()]
g_test = genus_missing.dropna(subset=genus_input_cols)
len(g_test)

training set

In [None]:
# Rows with a known genus: total count first, then the subset whose higher
# taxonomy ranks and country code are all present.
genus_known = item_df_dr_1[item_df_dr_1['genus'].notna()]
print(len(genus_known))
g_train = genus_known.dropna(
    subset=['kingdom', 'phylum', 'class', 'order', 'family', 'countryCode']
)
len(g_train)

### Species

test set

In [None]:
# Rows with no species whose other taxonomy ranks and country code are all present.
species_input_cols = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'countryCode']
species_missing = item_df_dr_1[item_df_dr_1['species'].isna()]
s_test = species_missing.dropna(subset=species_input_cols)
len(s_test)

training set

In [None]:
# Rows with a known species: total count first, then the subset whose other
# taxonomy ranks and country code are all present.
species_known = item_df_dr_1[item_df_dr_1['species'].notna()]
print(len(species_known))
s_train = species_known.dropna(
    subset=['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'countryCode']
)
len(s_train)

### Location

test set

In [None]:
# Rows of item_df_dr_1 that were removed by the location dropna (absent from
# item_df_dr_3), restricted to those with a complete taxonomy.
# Membership is decided on the row index, which item_df_dr_3 inherits from
# item_df_dr_1. DataFrame.isin(DataFrame) compares cell values, so it would
# also select kept rows whose catalogNumber is NaN.
removed_by_location_drop = ~item_df_dr_1.index.isin(item_df_dr_3.index)
l_test = item_df_dr_1[removed_by_location_drop].dropna(
    subset=['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
)
len(l_test)

training set

In [None]:
# Rows that kept some location information: total count first, then the subset
# with a complete taxonomy.
print(len(item_df_dr_3))
taxonomy_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
l_train = item_df_dr_3.dropna(subset=taxonomy_ranks)
len(l_train)

In [None]:
import datawig

# Crawling for Image URLs

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib import parse
from urllib.parse import urlencode

def image_crawler(url, timeout=30, img_index=4):
    """Return the 1000px-wide media-storage image URL found on a page.

    The page at ``url`` is downloaded and the ``<img>`` tag at position
    ``img_index`` is inspected. When that tag's ``src`` contains
    ``"mediastorage"``, the URL is returned with its ``width`` query
    parameter set to ``1000`` (other query parameters are kept).

    Parameters
    ----------
    url : str
        Address of the page to scan.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises, so a
        stalled connection cannot block a long crawl forever.
    img_index : int, optional
        Position of the ``<img>`` tag to inspect. Defaults to 4 (the fifth
        image on the page).

    Returns
    -------
    str
        The rewritten image URL, or ``""`` when the page has no ``<img>`` at
        ``img_index``, the tag has no ``src``, or the ``src`` does not
        contain ``"mediastorage"``.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    images = soup.find_all('img')
    try:
        candidate = images[img_index]
    except IndexError:
        # Pages with fewer images than expected simply have no result.
        return ""

    imgurl = candidate.get('src') or ""
    if "mediastorage" not in imgurl:
        return ""

    # Rebuild the URL with the width parameter forced to 1000.
    urlcmp = parse.urlsplit(imgurl)
    qs = parse.parse_qs(urlcmp.query)
    qs['width'] = ['1000']
    qsret = urlencode(qs, doseq=True)
    return urlcmp.scheme + "://" + urlcmp.netloc + urlcmp.path + "?" + qsret

In [None]:
image_crawler("http://herbarium.bgbm.org/object/BW19937010")

In [None]:
test_df.iloc[0]

In [None]:
# Restrict to specimens for which every selected feature is present, then keep
# only the identifier columns needed to look the specimen up online.
complete_items = test_df[[
    'gbifID', 'catalogNumber', 'occurrenceID', 'scientificName',
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
    'countryCode', 'locality', 'decimalLatitude', 'decimalLongitude',
]].dropna()
item_df = complete_items[['gbifID', 'catalogNumber', 'occurrenceID', 'scientificName']]
item_df.shape

In [None]:
item_df.iloc[0]["gbifID"]

In [None]:
# Fetch the GBIF occurrence page of the first item and parse its HTML.
# A timeout is passed because requests waits indefinitely by default, which
# would hang the kernel if the server stops responding.
url = 'https://www.gbif.org/occurrence/' + str(item_df.iloc[0]["gbifID"])
page = requests.get(url, timeout=30)
soup = BeautifulSoup(page.content, 'html.parser')
print(url)

In [None]:
soup

In [None]:
import re
soup.find_all('a')

In [None]:
# `timer` is not imported anywhere else in the notebook, so import it here to
# keep this cell runnable on a fresh kernel.
from timeit import default_timer as timer

start = timer()

# One request per specimen, in row order, so image_links[i] belongs to row i of
# test_df. Iterating over the column avoids building a row Series per step.
image_links = [image_crawler(seed_link) for seed_link in test_df["occurrenceID"]]

end = timer()

print("time :", end - start)

# Preview only: the full list has one entry per row of test_df.
image_links[:10]

# Similarity Modelling

In [None]:
btc_df = item_df_dr_0.copy()

In [None]:
btc_df.shape[0]

In [None]:
import numpy as np

btc_df['index_col'] = np.arange(btc_df.shape[0])
btc_df

In [None]:
cal_btc_df = btc_df[["kingdom","phylum","class","order","family","genus","species","decimalLatitude","decimalLongitude"]]
cal_btc_df

In [None]:
oneh_cal_btc_df = pd.get_dummies(cal_btc_df,prefix=['kingdom','phylum','class','order','family','genus','species'])

In [None]:
oneh_cal_btc_df

In [None]:
# Pipe-separated, gzip-compressed dump of the one-hot matrix, written in
# chunks of 100000 rows, with a header row and without the index column.
oneh_cal_btc_df.to_csv(
    '../herbarium-berlin-oneh.csv.gz',
    sep='|',
    header=True,
    index=False,
    chunksize=100000,
    compression='gzip',
    encoding='utf-8',
)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
euclidean_distances(oneh_cal_btc_df, oneh_cal_btc_df)

In [None]:
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances between the rows of the one-hot matrix.
# pdist returns the condensed form (one entry per unordered row pair);
# squareform expands it into the full square matrix, which is then wrapped in
# a DataFrame with default integer labels (positions, not the original index).
# NOTE(review): the square matrix grows quadratically with the number of rows;
# confirm it fits in memory for the full dataset.
dist = pdist(oneh_cal_btc_df, 'euclidean')
dist_df = pd.DataFrame(squareform(dist))

In [None]:
from sklearn.neighbors import DistanceMetric


In [None]:
DistanceMetric.get_metric

In [None]:
import numpy as np
a = np.random.rand(800,2048).astype(np.float32)
a.shape

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# StandardScaler works on 2-D input of shape (n_samples, n_features).
# Transposing a 1-D array is a no-op, so the five values are reshaped into a
# single feature column before scaling; X_norm therefore has shape (5, 1).
X = np.array([2,3,0,1,0])

scaler = StandardScaler()
X_norm = scaler.fit_transform(X.reshape(-1, 1))
X_norm

# Inserting to mongoDB

In [None]:
# Columns stored for each item; only rows with all of them present are kept.
mongo_item_cols = [
    'gbifID', 'catalogNumber', 'occurrenceID', 'scientificName',
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
    'countryCode', 'locality', 'decimalLatitude', 'decimalLongitude',
]
item_df = test_df[mongo_item_cols].dropna()
item_df.shape

In [None]:
import numpy as np

# Add a 0..n-1 positional key used later to join with the similarity results.
# assign() builds a new frame instead of writing a column into the dropna()
# result, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
item_df = item_df.assign(index_col=np.arange(item_df.shape[0]))
item_df

In [None]:
similar_item_df = pd.read_csv("../norm-simdotp-similar-result.csv", header=None, index_col=False)
similar_item_df

In [None]:
# Concatenate the values of columns 1..19 of every row into one string.
# The columns are selected by their integer labels (the file is read with
# header=None) and cast to str first, because str.join only accepts strings.
col = np.arange(1,20)
similar_item_df[col].astype(str).apply(lambda x: ''.join(x), axis=1)

In [None]:
similar_item_df['sim_item'] = similar_item_df[np.arange(1,20)].apply(lambda x: np.array(x), axis=1)

In [None]:
similar_item_df['sim_item']

In [None]:
similar_item_df['index_col'] = np.arange(item_df.shape[0])

In [None]:
similar_item_df.shape

In [None]:
item_df.shape

In [None]:
len(similar_item_df.iloc[0]['sim_item'])

In [None]:
sim_item = similar_item_df[['index_col','sim_item']]
sim_item

In [None]:
result_df = pd.merge(item_df, sim_item, on='index_col')
result_df

In [None]:
result_df.iloc[0].values

In [None]:
from pymongo import MongoClient
import json

client = MongoClient('localhost', 27017)
db = client.botanic_dataset

In [None]:
# One document (dict) per row of result_df, as a real list.
# orient="records" emits the rows directly, so the frame does not have to be
# transposed and the result is not a dict view keyed by row label. The JSON
# round-trip leaves only plain Python types in each document.
records = json.loads(result_df.to_json(orient="records"))

In [None]:
# Collection.insert() was deprecated in PyMongo 3 and removed in PyMongo 4.
# insert_many() requires a non-empty list of documents, so the records are
# materialised into a list and the call is skipped when there is nothing to store.
docs = list(records)
if docs:
    db.item.insert_many(docs)