## Newer

In [None]:
import sys, os
cwd=os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, cwd)
import rasterio
import folium
import branca
from folium.plugins import HeatMap
import re
import branca.colormap as cmp
import pandas as pd
import geopandas as gpd
import shapefile as shp
from shapely.geometry import Point
from shapely.geometry.polygon import Point, Polygon
import matplotlib.pyplot as plt
import rioxarray as rxr
import xarray as xr
import numpy as np
from scipy.spatial.distance import cdist
import utils.processing as pr
import utils.s3_utils as s3
import utils.plot as pl
from sklearn.preprocessing import PolynomialFeatures
from folium import plugins
import config.paths as path
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, RobustScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
import random
import utils.modeling as ml
from pathlib import Path

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# --- Run configuration -------------------------------------------------------
region_code = 0
modelname = path.model_path

# Where the training data is read from and where results are written to.
trainingdata_path = f'../../data/{modelname}/training/final/clustered/region_{region_code}/'
save_path = f'../../model/log-models/final-log-model-3/region-{region_code}/'

drop_features = 'y'          # 'y' -> drop the region's configured columns from x
test_train = 'y'             # 'y' -> run the NUTS3 hold-out cross-validation
test_split = .2              # share of NUTS3 districts held out per run
eval_method = 'f1_weighted'  # not referenced by the code visible in this notebook

# --- Import data -------------------------------------------------------------
# Each CSV carries a column named 'Unnamed: 0', which is discarded after loading.
x = pd.read_csv(f'{trainingdata_path}training_x/x.csv').drop(columns=['Unnamed: 0'])
y = pd.read_csv(f'{trainingdata_path}training_y/y.csv').drop(columns=['Unnamed: 0'])
dummies_train = (
    pd.read_csv(f'{trainingdata_path}training_x/dummies.csv')
    .drop(columns=['Unnamed: 0'])
)


# Remove Pearson-correlated features, while keeping the highest-coefficient
# feature from the original model. Results of a feature-selected run are written
# to their own 'feature-selection/' sub-directory.
if drop_features == 'y':
    drop_cols = path.logreg_col_drop_dict[region_code]
    x = x.drop(drop_cols, axis=1)
    save_path = save_path + 'feature-selection/'
else:
    # The result summaries written further down interpolate drop_cols, so it
    # must exist even when no columns are dropped (previously a NameError).
    drop_cols = []

# Create the output directory (and any missing parents) for either branch.
Path(save_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Cross-validation with whole NUTS3 districts held out.
#
# The district ids are split into int(1 / test_split) disjoint test groups. For
# each group a scaler and a model are fitted on the remaining districts and the
# held-out rows are scored; the per-run predictions are then pooled for one
# overall ROC-AUC, which is written to 'roc-auc-test.txt'.

# initialize variables
nuts3_list = list(y.nuts_id.unique())  # districts not yet used as a test group
nuts3_total_districts = len(nuts3_list)
random.seed(11)  # fixed seed so the district groups are reproducible
drop_descriptor_cols = ['nuts_id', 'nuts_name', 'longitude', 'latitude']
y_pred_list = []

if test_train == 'y':
    ### This section only runs if there is a saved clf model with best_params configured (produced in gridsearch section below)
    ### This will error out if you do not have saved clf model params. Please set test_train config to 'y' once params are set.
    n_runs = int(1 / test_split)
    districts_per_run = int(nuts3_total_districts * test_split)

    for i in range(n_runs):
        # Randomly select test_split % of all nuts_ids and remove them from the
        # pool for the next run. The final run takes whatever is left so the
        # remainder districts from the integer split are still tested.
        if i == n_runs - 1:
            test_nuts3_districts = nuts3_list
        else:
            test_nuts3_districts = random.sample(nuts3_list, districts_per_run)
        held_out = set(test_nuts3_districts)  # O(1) membership for the filter below
        nuts3_list = [district for district in nuts3_list if district not in held_out]
        print(f'***NUTS3 IDs sampled for Run {i}: {test_nuts3_districts}***')
        print(nuts3_list)

        y_train = y[~y['nuts_id'].isin(test_nuts3_districts)]
        x_train = x[~x['nuts_id'].isin(test_nuts3_districts)]

        y_test = y[y['nuts_id'].isin(test_nuts3_districts)]
        x_test = x[x['nuts_id'].isin(test_nuts3_districts)]

        y_train = y_train.drop(drop_descriptor_cols, axis=1)
        x_train = x_train.drop(drop_descriptor_cols, axis=1)

        # y_test keeps longitude/latitude; only the district descriptors go.
        # NOTE(review): presumably the coordinates are what the duplicate
        # aggregation after the loop groups on - confirm in utils.processing.
        y_test = y_test.drop(['nuts_id', 'nuts_name'], axis=1)
        x_test = x_test.drop(drop_descriptor_cols, axis=1)

        ## Scale x_train values (scaler is fitted on the training rows only)
        lin_sc = StandardScaler()
        lin_sc.fit(x_train)
        x_scaled = lin_sc.transform(x_train)

        # model
        lasso, importances = ml.logreg_model(x_scaled, y_train, x=x_test)

        ### get roc-auc score by predicting with the fitted model; the test rows
        ### are transformed with the scaler fitted on the training rows
        x_scaled = lin_sc.transform(x_test)
        y_test['pred'] = lasso.predict_proba(x_scaled)[:, 1]
        auroc_score = roc_auc_score(y_test['presence'], y_test['pred'])
        y_pred_list.append(y_test)
        print(f'AUROC for Test Run {i}:{auroc_score}')
        print(f'*** END OF RUN {i} ***')
        print('')

    print('***Cross-Validation Complete***\n')
    y_pred = pd.concat(y_pred_list)
    y_pred = pr.remove_or_combine_duplicates(y_pred, strategy='aggregate', aggfunc='mean')

    auroc_score = roc_auc_score(y_pred['presence'], y_pred['pred'])
    print(f'***AUROC for Cross-Validation:{auroc_score}***')

    print('***Dropping descriptor columns from x and y dfs***')
    x = x.drop(drop_descriptor_cols, axis=1)
    y = y.drop(drop_descriptor_cols, axis=1)

    # drop_cols is only bound by the feature-selection branch above; report an
    # empty list otherwise instead of failing with a NameError.
    dropped_cols = drop_cols if drop_features == 'y' else []
    param_text = f'''
                     *** Region {region_code} Logistic Regression Results Summary
                     train-test split: {test_split}
                     ROC-AUC score: {auroc_score}
                     features dropped: {drop_features}
                     dropped cols: {dropped_cols}
                     '''
    with open(save_path+'roc-auc-test.txt', "w") as text_file:
        text_file.write(param_text)

else:
    # Cross-validation skipped: the descriptor columns still have to be removed
    # because the cells below fit the scaler and model on x and y directly.
    # Using else (rather than == 'n') covers every non-'y' setting.
    x = x.drop(drop_descriptor_cols, axis=1)
    y = y.drop(drop_descriptor_cols, axis=1)

## Test Predictions on full Cov Data

In [None]:
# Standardise the full predictor matrix. lin_sc is kept because the prediction
# cell further down transforms the unseen predictor data with this same scaler.
lin_sc = StandardScaler().fit(x)
x_scaled = lin_sc.transform(x)

In [None]:
### in-sample testing
# Fit on the complete scaled training set and score those same rows. This is an
# in-sample figure, so it is labelled as such rather than as cross-validation.
lasso, importances = ml.logreg_model(x_scaled, y, x)
y_pred = (lasso.predict_proba(x_scaled)[:, 1]).round(3)

auroc_score = roc_auc_score(y['presence'], y_pred)
print(f'***AUROC for In-Sample Test:{auroc_score}***')

# drop_cols is only bound by the feature-selection branch; report an empty list
# otherwise instead of failing with a NameError.
dropped_cols = drop_cols if drop_features == 'y' else []
param_text = f'''
*** Region {region_code} Logistic Regression Results Summary
train-test split: {test_split}
ROC-AUC score: {auroc_score}
features dropped: {drop_features}
dropped cols: {dropped_cols}
                    '''
with open(save_path+'roc-auc-insample.txt', "w") as text_file:
    text_file.write(param_text)

In [None]:
# Preview the first and last ten rows of the importances table returned by the
# model fit, then write the whole table to the output directory.
for preview in (importances.head(10), importances.tail(10)):
    display(preview)

save_name = "feature_importances.csv"
importances.to_csv(f"{save_path}{save_name}")


### Test Prediction

In [None]:
# Test Predictions
# Load the region's predictor table and keep a random 1/sample_sizer of its rows.
sample_sizer = 5
fn = f'../../data/{modelname}/processed-predictor-parquets/clustered/{region_code}-predictors.parquet'
df = pd.read_parquet(fn, engine='pyarrow')
samp_size = df.shape[0] / sample_sizer
print(f'sample_size of {region_code} : {samp_size}')

# random_state pins the sample so repeated runs score the same rows.
test_list = [df.sample(int(np.round(samp_size)), random_state=42)]
test = pd.concat(test_list)

# Release the full table; only the sample is needed from here on.
del df, test_list

# Lower-case every column name after removing the two unused columns.
test_df = test.drop(columns=['geometry', 'landcover'])
test_df.columns = [str.lower(col) for col in test_df.columns]

# One-hot encode the 'cat' column, then add an all-zero column for every dummy
# that exists in dummies_train but did not occur in this sample.
dummies_test = pd.get_dummies(test_df['cat'])
absent_in_test = set(dummies_train.columns) - set(dummies_test.columns)
for missing_env in absent_in_test:
    dummies_test[missing_env] = 0

print('finished splitting dummy variables')

# Swap 'cat' for its dummy columns and renumber the rows from zero.
test_df2 = (
    pd.concat([test_df.drop('cat', axis=1), dummies_test], axis=1)
    .reset_index()
    .drop('index', axis=1)
)

# Select the training columns in training order. One training column name has an
# upper-case 'C' while the test names were lower-cased above, so that name is
# lower-cased for the lookup and restored afterwards.
train_name = 'tg-grp-mean-days-above-5degC-monthly-ratio'
lowered_name = 'tg-grp-mean-days-above-5degc-monthly-ratio'
wanted_cols = x.rename(columns={train_name: lowered_name}).columns
nolatlon = test_df2[wanted_cols]
x_test = nolatlon.rename(columns={lowered_name: train_name})

###
### To prevent leakage, scale values according to training scaler from above
print('Scaling values')
test_scaled = lin_sc.transform(x_test)

print('Running prediction')
# Positive-class probability (3 decimals) alongside each row's coordinates.
test_output = pd.DataFrame({
    'pred': (lasso.predict_proba(test_scaled)[:, 1]).round(3),
    'latitude': test_df2['lat_env'],
    'longitude': test_df2['lon_env'],
})

print('Outputting prediction data')
print('removing duplicate columns')
test_output = pr.remove_or_combine_duplicates(test_output, strategy='aggregate', aggfunc='mean')


In [None]:
# Persist the prediction table next to the other outputs of this run.
test_output.to_csv(f'{save_path}test-predictions.csv')

## Plotting Predictions

In [None]:

# Histogram of the 'pred' column (100 bins), saved as a PNG in the output directory.
histo = test_output['pred'].hist(bins=100)
fig = histo.figure
fig.savefig(f'{save_path}distribution_histogram.png')

In [None]:
# Plot predictions on all of EU - can cut out later with shapefile.
country_codes = (
    'DK NO SE FI AT CH CZ DE EE FR LT LV NL PL SK IT UK HR BE SI LU'
).split()
m, ma = pl.plot_log_maps(
    test_output,
    country_codes,
    region=region_code,
    dot_sample=50000,
)

In [None]:
# Save the two map objects returned by pl.plot_log_maps as HTML files.
for save_name, map_obj in (
    ("logreg-dotmap.html", m),
    ("logreg-choropleth.html", ma),
):
    map_obj.save(save_path + save_name)
