In [None]:
import sys, os
cwd=os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, cwd)
import numpy as np
import pandas as pd
import geopandas as gpd
import re
import matplotlib.pyplot as plt
import folium
import utils.processing as pr
import math
from statsmodels.stats.proportion import proportion_confint 
import config.paths as path
# Never truncate the column list when a frame is displayed.
pd.set_option('display.max_columns', None)

# Model identifier taken from config.paths; it selects the data sub-directory.
modelname = path.model_path

# The raw foci database is read from, and any raw output written to, the same folder.
path_to_read_file = f'../../data/{modelname}/raw-foci-data/'
path_to_write_file = f'../../data/{modelname}/raw-foci-data/'
file_name_microfoci = 'TBEV-main-database.csv'

# Load the main database and keep only the records flagged 'include'.
df_foci = (
    pd.read_csv(path_to_read_file + file_name_microfoci)
    .loc[lambda raw: raw['include_exclude'] == 'include']
)
df_foci.head(2)

In [None]:
# Translate the raw database headers into the column names used by the
# remaining cells. Identity entries are kept so the mapping lists every
# column the notebook relies on.
col_name_mapping = {
    'data_tier (1/2/3)': 'data_tier',
    'Unnamed: 3': 'obs_type',
    'country': 'country',
    'state': 'state',
    'admin_locality': 'district',
    'gps_n': 'gps_north',
    'gps_e': 'gps_east',
    'denominator_#_tested': 'denominator_total',
    'numerator_#_infected_or_positive': 'numerator_pos',
    'MIR': 'mir',
}

# rename() hands back a new frame, so df_foci itself is left untouched.
foci = df_foci.rename(columns=col_name_mapping)

In [None]:
# Seed and scale of the random jitter applied to overlaid points. A fixed seed
# makes the jittered coordinates identical on every Restart & Run All.
JITTER_SEED = 42
JITTER_SD_DEG = .001  # standard deviation of the jitter, in decimal degrees

#grab all rows with utm parameters
# Rows whose gps_north text contains 'V' are treated as UTM coordinates;
# na=False makes missing gps_north values count as "no match".
utm_index_list = foci[foci['gps_north'].str.contains('V', na=False)].index.to_list()

# Prefer the imputed coordinates and fall back to the reported ones.
foci['latitude_raw'] = foci['gps_n_imputed'].fillna(foci['gps_north'])
foci['longitude_raw'] = foci['gps_e_imputed'].fillna(foci['gps_east'])

#convert gps to decimal lat/long using gps_convert defined in utils/processing
foci['latitude_raw'] = foci['latitude_raw'].apply(pr.gps_convert)
foci['longitude_raw'] = foci['longitude_raw'].apply(pr.gps_convert)

#convert utm parameters for subset of data
# NOTE(review): '32' / 'V' are presumably the UTM zone number and latitude band
# expected by convert_utm_to_latlon -- confirm against utils/processing.
utm_df = foci.loc[utm_index_list]
utm_converted = pr.convert_utm_to_latlon(utm_df, '32', 'V')
# The converted rows are written back by position: row i of utm_converted
# belongs to utm_index_list[i].
for coord_col in ('latitude_raw', 'longitude_raw'):
    foci.loc[utm_index_list, coord_col] = utm_converted[coord_col].to_numpy()

#round coords to nearest 5th decimal place - ~1m accuracy
foci['latitude_raw'] = foci['latitude_raw'].astype('float').round(5)
foci['longitude_raw'] = foci['longitude_raw'].astype('float').round(5)

#jitter points, first keep history of raw data points
foci['lat_orig'] = foci['latitude_raw']
foci['lon_orig'] = foci['longitude_raw']

#get list of coords indices that are overlaid at same point
# duplicated() flags every repeat of a (lat, lon) pair after its first
# occurrence, so the first point of each stack stays where it is.
is_overlaid = foci[['latitude_raw', 'longitude_raw']].duplicated()
index_to_jitter = foci[is_overlaid].index.to_list()

#create jittered columns and pass lat/long coords to them
# Noise is drawn for every row in one vectorised call but only applied where
# is_overlaid is True; all other rows keep their raw coordinate.
rng = np.random.default_rng(JITTER_SEED)
for raw_col, jitter_col in (('latitude_raw', 'lat_jitter'), ('longitude_raw', 'lon_jitter')):
    noise = JITTER_SD_DEG * rng.normal(0, 1, size=len(foci))
    foci[jitter_col] = np.where(is_overlaid, (foci[raw_col] + noise).round(5), foci[raw_col])

jitter_geom = gpd.points_from_xy(foci['lon_jitter'], foci['lat_jitter'])
spotfire_gdf = gpd.GeoDataFrame(foci, geometry=jitter_geom)

foci['country_code'] = foci['country'].map(path.ctry_map)

#fix 'No focus' to 'No Focus'
foci['obs_type'] = foci['obs_type'].replace('No focus', 'No Focus')

In [None]:
from shapely.geometry import Point, Polygon
from shapely.ops import nearest_points

### take nearest country shapefile and use as new coordinate for processed databases.
### do this for each country
# A focus that already lies inside one of its country's NUTS-3 polygons
# (distance == 0) keeps its coordinate; any other focus is moved to the closest
# point on the nearest polygon of that country.
# NOTE(review): sjoin_nearest runs in EPSG:4326, so 'distance' is in degrees.
# It is only compared with 0 here, but "nearest" is not metric -- confirm this
# is acceptable at the latitudes covered.
NUTS_SHAPEFILE = "../../data/raw-data/shapefiles/NUTS_RG_20M_2021_4326.shp/NUTS_RG_20M_2021_4326.shp"
# NUTS regions dropped before matching and plotting
# (presumably the French overseas departments -- confirm).
NUTS_EXCLUDED_IDS = ['FRY10','FRY20','FRY30','FRY40','FRY50']

df_list = []
# unique() walks the codes in order of first appearance, so the row order of
# the concatenated result is the same on every run (iterating a set of strings
# is not). Rows without a country code cannot be matched and are skipped.
for countrycode in foci['country_code'].dropna().unique():
    #read in nuts3 shapefile
    gdf_nuts = gpd.read_file(NUTS_SHAPEFILE,
                        where=f"CNTR_CODE='{countrycode}' AND LEVL_CODE=3")
    gdf_nuts = gdf_nuts.to_crs(crs='epsg:4326')
    # .copy() so the column added below lands on an independent frame.
    gdf_nuts = gdf_nuts[~gdf_nuts['NUTS_ID'].isin(NUTS_EXCLUDED_IDS)].copy()
    # Keep the polygon under a second name: the join retains only the left
    # geometry, and the polygon is needed afterwards for nearest_points().
    gdf_nuts['nutsgeom'] = gdf_nuts.geometry

    dummydf = foci[foci['country_code']==countrycode].copy()

    foci_gdf = gpd.GeoDataFrame(dummydf, geometry=gpd.points_from_xy(dummydf.longitude_raw, dummydf.latitude_raw), crs="EPSG:4326")

    foci_nearest_join  = gpd.sjoin_nearest(foci_gdf,gdf_nuts,how='left',distance_col='distance')
    # nearest_points returns (point on polygon, point on focus); take the former.
    foci_nearest_join["nearest_point"] = foci_nearest_join.apply(lambda x: nearest_points(x["nutsgeom"],x["geometry"])[0] if x['distance'] >0 else x['geometry'], axis=1)
    df_list.append(foci_nearest_join)
foci_nearest_join = pd.concat(df_list)

## plot results
# Pink: NUTS-3 regions; navy: foci outside every polygon; red: snapped positions.
gdf_nuts = gpd.read_file(NUTS_SHAPEFILE,
                        where="LEVL_CODE=3")
gdf_nuts = gdf_nuts.to_crs(crs='epsg:4326')
gdf_nuts = gdf_nuts[~gdf_nuts['NUTS_ID'].isin(NUTS_EXCLUDED_IDS)]
fig, ax = plt.subplots(figsize=(15, 15))
gdf_nuts.plot(ax=ax,markersize=1,color='pink')
foci_nearest_join[foci_nearest_join['distance']>0].plot(ax=ax,markersize=1,color='navy')
foci_nearest_join[foci_nearest_join['distance']>0]['nearest_point'].plot(ax=ax,markersize=1,color='red')

# The snapped point becomes the coordinate used by every later cell.
foci_nearest_join['latitude'] = foci_nearest_join.nearest_point.y
foci_nearest_join['longitude'] = foci_nearest_join.nearest_point.x

foci=foci_nearest_join.copy()

foci.head()

In [None]:

conf_val = .95


def _proportion_ci(n_pos, n_total, alpha):
    """Return the (lower, upper) confidence bounds for n_pos / n_total.

    Parameters
    ----------
    n_pos : number of positive (infected) samples.
    n_total : number of samples tested.
    alpha : significance level, i.e. 1 - confidence level.

    Returns
    -------
    tuple of float
        (nan, nan) when n_total is 0 or missing. With zero positives the
        interval uses method='beta'; otherwise it uses statsmodels' default
        method.
    """
    # Explicit boolean logic: applying '~' to the bool returned by math.isnan
    # is bitwise inversion and is deprecated since Python 3.12.
    if n_total == 0 or math.isnan(n_total):
        return (float('nan'), float('nan'))
    if n_pos == 0:
        return proportion_confint(n_pos, n_total, alpha=alpha, method='beta')
    return proportion_confint(n_pos, n_total, alpha=alpha)


# Compute both bounds in one pass so each interval is evaluated only once.
ci_bounds = [
    _proportion_ci(n_pos, n_total, 1-conf_val)
    for n_pos, n_total in zip(foci['numerator_pos'], foci['denominator_total'])
]
foci['CI_lower'] = [bounds[0] for bounds in ci_bounds]
foci['CI_upper'] = [bounds[1] for bounds in ci_bounds]

#add in country code abbr
country_code_dict = path.ctry_map

foci['country_code'] = foci['country'].map(country_code_dict)

# Binary presence label: only 'Focus' counts as presence; observation types
# missing from this mapping become NaN.
presence_dict = {
    'Focus' : 1,
    'No Focus' : 0,
    'Absent' : 0
}
foci['presence'] = foci['obs_type'].map(presence_dict)

# Collapse the host field to two classes: anything that is not exactly
# 'Tick' / 'tick' (including missing values) is labelled 'Animal'.
foci['tick_animal'] = np.where(foci['tick_animal'].isin(['Tick', 'tick']), 'Tick', 'Animal')

# Lower-case every column name, including those brought in by the NUTS join.
foci.columns = [column.lower() for column in foci.columns]


In [None]:
### final filtering of points for collection year >= 2000 and focus collection types, drop duplicates
### collection year accounted for already in the include/exclude category

# Allow-list of host common names, written in lower case.
common_name_include = [
    'castor bean tick', 'tick',
    'bank vole', 'mice', 'yellow-necked mouse',
    'rodent', 'rodents', 'small rodents',
    'squirrel', 'multiple rodent species',
    'ornate dog tick', 'voles',
]

# Title-cased copy of the raw 'common name' column under an underscore name.
title_cased_names = foci['common name'].str.title()
foci['common_name'] = title_cased_names


In [None]:
# Write the processed table as it stands at this point (before any
# de-duplication) to the model's processed-database folder.
local_save_path = f'../../data/{modelname}/processed-master-database/'
no_dedupe_csv = local_save_path + 'master-database-processed-no-dedupe.csv'
foci.to_csv(no_dedupe_csv)

In [None]:
# Two modelling tables are derived here (for the maxent and sklearn models):
# every row whose host is on the allow-list, and the same rows reduced to one
# record per coordinate.
host_is_allowed = foci['common_name'].str.lower().isin(common_name_include)
gdf_proc_filtered = foci.loc[host_is_allowed]

# For modelling, keep the first record at each (latitude, longitude).
gdf_proc_all = gdf_proc_filtered.drop_duplicates(subset=['latitude', 'longitude'], keep='first')

In [None]:
# Persist both modelling tables next to the un-deduplicated export.
local_save_path = f'../../data/{modelname}/processed-master-database/'
for table, csv_name in (
    (gdf_proc_filtered, 'master-database-processed-filtered.csv'),
    (gdf_proc_all, 'master-database-processed-filtered-deduped.csv'),
):
    table.to_csv(local_save_path + csv_name)

In [None]:
# MaxEnt input: presence-only points with 'presence' as the index and the
# snapped latitude/longitude as the only columns.
maxent_columns = ['presence', 'latitude', 'longitude']

# De-duplicated presence points.
is_presence = gdf_proc_all['presence'] == 1
maxent_proc = gdf_proc_all.loc[is_presence, maxent_columns].set_index('presence')

maxent_proc.info()
maxent_proc.to_csv(local_save_path + 'master-database-processed-maxent-filtered-deduped.csv')

### Save filtered foci pts too
# Same export, but without the coordinate de-duplication.
is_presence = gdf_proc_filtered['presence'] == 1
maxent_proc = gdf_proc_filtered.loc[is_presence, maxent_columns].set_index('presence')

maxent_proc.info()
maxent_proc.to_csv(local_save_path + 'master-database-processed-maxent-filtered.csv')


## Apply kmeans grouping using previously developed shapefiles

## Kmeans clustering

In [None]:
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
import folium
from sklearn.preprocessing import StandardScaler
import geopandas as gpd
import numpy as np

fp = "../../"

# Fixed seed so cluster labels -- and therefore the colours and the saved
# outputs -- are identical on every run.
KMEANS_SEED = 42

proc = pd.read_csv(local_save_path + 'master-database-processed-filtered.csv')
# Display colour for each of the three cluster labels.
cmap = {0:"orange", 1: "red", 2:"blue"}

# n_init is pinned explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=KMEANS_SEED)
# Cluster presence points only; .copy() so the columns added below are written
# to an independent frame rather than to a slice of the CSV frame.
proc = proc[proc['presence']==1].copy()
# Flag is 1 for points whose NUTS country code is FI, SE or NO, 0 otherwise.
proc['mainland'] = np.where(proc['cntr_code'].isin(['FI', 'SE', 'NO']),1,0)
# Standardise so latitude, longitude and the flag contribute on a common scale.
scaler = StandardScaler()
X = scaler.fit_transform(proc[['latitude', 'longitude', 'mainland']])
kmeans.fit(X)
proc['region'] = kmeans.labels_
proc = gpd.GeoDataFrame(
    proc, geometry=gpd.points_from_xy(proc.longitude, proc.latitude), crs="EPSG:4326")
proc['color'] = proc['region'].apply(lambda x: cmap[x])

# Slim copy holding only the attributes written to the shapefile.
gdf = proc[['latitude', 'longitude','region','color','obs_type','focus_type','reference','row_observation']]
gdf = gpd.GeoDataFrame(
    gdf, geometry=gpd.points_from_xy(gdf.longitude, gdf.latitude), crs="EPSG:4326")

# Initial map centre as [latitude, longitude].
location = [64, 18]

width = 700
height = 650

f = folium.Figure(width=width, height=height)
m = folium.Map(location=location, zoom_start=4, tiles="CartoDB Positron",
            width=width, height=height,
            control_scale=True,
            min_zoom=2,
            max_zoom=15,
            zoomDelta=0.5,
            zoomSnap=0.5,
            wheelDebounceTime=20,
            wheelPxPerZoomLevel=20
            ).add_to(f)

# One GeoJson layer per point, drawn as a filled circle in its cluster colour.
for _, r in proc.iterrows():
    sim_geo = gpd.GeoSeries(r['geometry'])
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j,
                            zoom_on_click=False,
                            smooth_factor=1,
                            marker=folium.Circle(
                                radius=4300,
                                fill_color=r['color'],
                                fill_opacity=1,
                                color=r['color'],
                                weight=1,
                                opacity = 1
                            ))
    geo_j.add_to(m)
        


In [None]:
# Number of presence points assigned to each K-means region.
proc.groupby('region')['presence'].count()

In [None]:
### save kmeans-grouped outputs
# All four artefacts go to one directory built from `fp`, so changing the
# project root in one place moves every output together.
cluster_dir = fp + f'data/{modelname}/geographic-clustering/'

# Full clustered table, and the number of points per cluster colour.
proc.to_csv(cluster_dir + 'cluster_df.csv')
proc.groupby('color')['presence'].count().to_csv(cluster_dir + 'cluster_counts.csv')

# Shapefile of the slim per-point attributes.
gdf.to_file(cluster_dir + 'all_regions.shp')

# Interactive folium map of the clusters.
m.save(cluster_dir + 'regional_cluster_map.html')