In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from shapely.wkt import loads
from sklearn.neighbors import KDTree
from tqdm import tqdm

# Count Birds at Each Site

In [2]:
temp = pd.read_csv('gen_data/temp_covariates.csv', index_col=0)
temp.geometry = temp.geometry.apply(loads)
temp['longitude'] = temp.geometry.apply(lambda x: x.x)
temp['latitude'] = temp.geometry.apply(lambda x: x.y)

In [3]:
birds = pd.read_csv('gen_data/birds_top.csv')
birds.head()

Unnamed: 0,species,decimalLatitude,decimalLongitude,countryCode,recordedBy
0,Phasianus colchicus,52.177017,-2.380047,GB,59cfea51d60c1c90ed3b5816f56548a1848258f52ca84b...
1,Fringilla coelebs,54.742872,-2.210718,GB,4caf8d34231fae9768d7167598f7475d6a058cad144374...
2,Chroicocephalus ridibundus,52.539587,-0.230492,GB,ca5b0617f17e9db5c8508351a180286822d3c48345e069...
3,Columba palumbus,52.040072,-2.07731,GB,86df0b11fc5449ff84c0fd778a6ab87d9d9f33a2c86a74...
4,Turdus merula,49.225067,-2.2067,JE,368d15087cc3724f2b5bfc7affecda93d056a99ba8b900...


In [4]:
station_data = pd.DataFrame(temp.values, columns=temp.columns)
tree = KDTree(station_data[['longitude', 'latitude']])

for spec in tqdm(np.unique(birds.species)):
# spec = 'Accipiter nisus'
    spec_locations = birds[birds.species == spec][['decimalLongitude', 'decimalLatitude']]
    closest_station = tree.query(spec_locations, k=1, return_distance=False)
    app = np.hstack([closest_station.reshape(-1), station_data.index.values.reshape(-1)])
    station_ids, counts = np.unique(app, return_counts=True)
    counts -= 1

    assert ((station_ids[1:] - station_ids[:-1]) == 1).all()
    assert station_ids[0] == 0

    station_data[spec] = counts

station_data.head()

  station_data[spec] = counts
100%|██████████| 100/100 [00:07<00:00, 13.86it/s]


Unnamed: 0,geometry,avg_temp,avg_min_month,avg_max_month,longitude,latitude,Accipiter nisus,Acrocephalus schoenobaenus,Acrocephalus scirpaceus,Aegithalos caudatus,...,Tachybaptus ruficollis,Tadorna tadorna,Tringa totanus,Troglodytes troglodytes,Turdus iliacus,Turdus merula,Turdus philomelos,Turdus pilaris,Turdus viscivorus,Vanellus vanellus
0,POINT (-179.5 -89.5),-48.475474,-59.104248,-27.165876,-179.5,-89.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,POINT (-179.5 -88.5),-49.178864,-59.526585,-29.381388,-179.5,-88.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,POINT (-179.5 -87.5),-47.056091,-57.250343,-28.598877,-179.5,-87.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,POINT (-179.5 -86.5),-43.717779,-53.913155,-26.282127,-179.5,-86.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,POINT (-179.5 -85.5),-39.687731,-50.039585,-22.917107,-179.5,-85.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Integrate Country Data

In [11]:
codes = pd.read_csv('gen_data/grid_country_codes.csv')
codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24311 entries, 0 to 24310
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   latitude      24311 non-null  float64
 1   longitude     24311 non-null  float64
 2   country_code  15287 non-null  object 
dtypes: float64(2), object(1)
memory usage: 569.9+ KB


In [12]:
assert len(codes.latitude) == len(station_data.latitude)
assert (codes.latitude == station_data.latitude).all()
assert (codes.longitude == station_data.longitude).all()
station_data['country'] = codes.country_code

# Write out

In [30]:
cov_col = ['geometry', 'avg_temp', 'avg_min_month', 'avg_max_month', 'longitude', 'latitude', 'country']
bird_col = list(set(station_data.columns) - set(cov_col))
station_data["total_birds"] = station_data[bird_col].values.sum(axis=1)

station_data = station_data[cov_col + ['total_birds'] + bird_col]

In [31]:
station_data.to_csv('gen_data/station_data.csv')