In [114]:
#Ecopy installation to python enviroment doesn't work with new pip versions
#To make it work, you need to install an older version of pip by "pip install pip==21.3.1"
#After that you can update your pip back by "pip install --upgrade pip"

from scipy.special import comb
import pandas as pd
import numpy as np
import scipy.misc
scipy.misc.comb = comb
import ecopy as ep
import h3

import sys
sys.path.append('../')
from helper_functions.gridding import h3_grid

In [115]:
# Read and concatnate data into a single dataframe
df = pd.DataFrame({})

for year in range(2000, 2024, 2):
  df_year = pd.read_csv(f'../../data/birds/{year}-{year+1}.csv')
  df = pd.concat([df, df_year])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11398117 entries, 0 to 987437
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ScientificName  object 
 1   TaxonID         object 
 2   Date            object 
 3   Latitude        float64
 4   Longitude       float64
 5   Municipality    object 
dtypes: float64(2), object(4)
memory usage: 608.7+ MB


In [116]:
#Doing the gridding
#Drop NAs from Lat and Lon. Otherwise h3 cannot work with these.
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)
df.head()
grid_object = h3_grid()
grid_object.fit(df)
grid_object.grid_info()

Unnamed: 0,h3_cell,observations_id,count,neighbors
0,85012603fffffff,"[83371, 83379, 83388, 83391, 83392, 83393, 833...",1450,"{85012603fffffff, 85012613fffffff, 85012607fff..."
1,85012613fffffff,"[83372, 83373, 83374, 83375, 83376, 83377, 833...",1165,"{85012603fffffff, 85012613fffffff, 850126c7fff..."
2,85012617fffffff,"[1012023, 1106452, 1106453, 1106454, 1106455, ...",52,"{85012603fffffff, 85012613fffffff, 850126bbfff..."
3,8501261bfffffff,"[4865, 4866, 5194, 5195, 653607, 2681, 4502, 4...",1365,"{85012603fffffff, 85012613fffffff, 850126c7fff..."
4,85012643fffffff,"[709475, 709476, 709477, 709478, 709479, 70948...",447,"{8501264bfffffff, 85012643fffffff, 8501265bfff..."
...,...,...,...,...
2063,85112ed3fffffff,"[196349, 530183, 537374, 569130, 582496, 61454...",2895,"{851123a7fffffff, 85112edbfffffff, 8511216ffff..."
2064,85112ed7fffffff,"[197417, 205764, 403088, 530176, 530177, 70028...",2062,"{85112ec7fffffff, 8511216ffffffff, 85112ed3fff..."
2065,85112edbfffffff,"[196347, 196348, 268174, 268175, 268176, 66772...",3742,"{851123a7fffffff, 85112edbfffffff, 85112ecbfff..."
2066,851135a7fffffff,[647965],1,"{851122d3fffffff, 8511266bfffffff, 851135a3fff..."


In [118]:
#Gettings cells for every row
cell_species = df[["ScientificName", "Latitude", "Longitude"]]
h3cell = []


cell_species["h3cell"] = cell_species.apply(lambda x: h3_grid().row_to_h3cell(x), axis=1)

cell_species = cell_species[["ScientificName", "h3cell"]]

cell_species.to_csv('cell_species.csv')

print(cell_species)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cell_species["h3cell"] = cell_species.apply(lambda x: h3_grid().row_to_h3cell(x), axis=1)


              ScientificName           h3cell
0                Strix aluco  851126c7fffffff
1          Fringilla coelebs  85089947fffffff
2         Certhia familiaris  85089977fffffff
3        Cyanistes caeruleus  8511202bfffffff
4            Corvus monedula  85089947fffffff
...                      ...              ...
987433  Picoides tridactylus  8508cd47fffffff
987434  Picoides tridactylus  8508cd47fffffff
987435       Motacilla flava  8511234ffffffff
987436      Delichon urbicum  8511234ffffffff
987437   Gallinago gallinago  8511206bfffffff

[11393012 rows x 2 columns]


In [119]:
#Adjusting the array to get freqs to columns and cells to rows
#cell_species = cell_species.set_index('h3cell')

cell_species_freq = cell_species.groupby(["ScientificName","h3cell"]).size().unstack(level=0).fillna(0)

cell_species_freq.to_csv('cell_species_freq.csv')


In [128]:
#Calculate diversity indicatiors by using ecopy

shannon = ep.diversity(cell_species_freq, 'shannon')

simpson = ep.diversity(cell_species_freq, 'simpson')

richness = ep.diversity(cell_species_freq, 'spRich')

shannon_alpha, shannon_beta, shannon_gamma = ep.div_partition(cell_species_freq, 'shannon')

def shannon_entropy(area_row):
    numpy_row = area_row.to_numpy()
    probabilities = numpy_row / np.sum(numpy_row)
    probabilities = probabilities[np.where(probabilities > 0)]
    return -np.sum(probabilities*np.log2(probabilities))

shannon_entropies = cell_species_freq.apply(shannon_entropy, axis=1)
shannon_entropies

def simpson_index(area_row):
    numpy_row = area_row.to_numpy()
    observed_species = numpy_row[np.where(numpy_row > 0.0)]
    species_index = observed_species * (observed_species - 1)
    total_observations = np.sum(observed_species)
    total_index = total_observations * (total_observations - 1)
    return 1 - (np.sum(species_index) / total_index)

simpson_indices = cell_species_freq.apply(simpson_index, axis=1)
simpson_indices

print(simpson)


[ 7.64573257 12.57283532 18.02666667 ... 37.93170293  1.
 14.51066856]


  return 1 - (np.sum(species_index) / total_index)
