# Cross Validation

Analysis of the predicted arsenic levels for the county on Gaston Water Map. Compares data to the actual levels of arsenic in the county 2018-2022 since the model was trained on data 2011-2017


In [40]:
import geopandas as gpd
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shapely as shp  


In [59]:
# read in samples, predicted arsenic and public water systems polygon data

#pred = gpd.read_file("../data/gis/polygon/predicted-arsenic-clipped.geojson")

pred = gpd.read_file("../data/gis/polygon/predicted-arsenic.geojson")

ar = gpd.read_file("../data/gis/point/ar_samples_w_covariates.geojson")


# check length of data
print(len(ar))

# check crs for both dataframes

print(ar.crs)
print(pred.crs)


1683
EPSG:4269
EPSG:2264


In [60]:
pred = pred.to_crs(ar.crs)

In [62]:
pred['pred'] = pred['severity'].replace(['good','warn','danger'],['< 0.005','0.005 - 0.01','> 0.01'])
pred = pred[["geometry", "pred"]]

# join the pred column to ar dataframe using spatial join

ar = gpd.sjoin(ar, pred, how="left", predicate="within")

In [63]:
# create dummy variable for pred column

df= pd.get_dummies(ar, columns=["pred"])


In [68]:
def concordance(df):
    li = []

    for i in df.index:
        if df['ar'][i] < 0.005 and df['pred_< 0.005'][i] == 1:
            li.append(1)
        elif 0.005 <= df['ar'][i] <= 0.01 and df['pred_0.005 - 0.01'][i] == 1:
            li.append(1)
        elif df['ar'][i] > 0.01 and df['pred_> 0.01'][i] == 1:
            li.append(1)
        else:
            li.append(0)
    return li

In [None]:
ar['correct'] = concordance(df)

In [65]:
ar['correct'].value_counts()

correct
1    1442
0     241
Name: count, dtype: int64

In [66]:
# crosstab the correct column with the pred column

pd.crosstab(ar['correct'], ar['pred'])

pred,0.005 - 0.01,< 0.005,> 0.01
correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,209,8,22
1,22,1401,19


In [70]:
# run the concordace again on a subset for years < 2018

ar_2018 = ar[ar['year_tested'] < 2018]

df_2018 = pd.get_dummies(ar_2018, columns=["pred"])

ar_2018['correct'] = concordance(df_2018)

pd.crosstab(ar_2018['correct'], ar_2018['pred'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


pred,0.005 - 0.01,< 0.005,> 0.01
correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,111,2,13
1,15,865,16


In [71]:


# run the concordace again on a subset for years > 2018

ar_2018 = ar[ar['year_tested'] > 2018]

df_2018 = pd.get_dummies(ar_2018, columns=["pred"])

ar_2018['correct'] = concordance(df_2018)

pd.crosstab(ar_2018['correct'], ar_2018['pred'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


pred,0.005 - 0.01,< 0.005,> 0.01
correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,88,6,8
1,7,472,3


In [74]:
#Did it at least predict the samples above MCL as being at least above 0.005? aka either 1 for the pred_0.005 - 0.01 or 1 for the pred_> 0.01 

#On the same note, were the samples below MCL predicted to be below 0.005? aka either 1 for the pred_0.005 - 0.01 or 1 for the pred_< 0.005



# crosstab group_mcl column with the pred column

pd.crosstab(ar['group_five'], ar['pred'])

pred,0.005 - 0.01,< 0.005,> 0.01
group_five,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,204,1401,18
1,27,8,23
