In [3]:
import geopandas as gpd
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import fiona as fio
import shapely as shp  


import geojson as gs


In [4]:
# Load the sample points from GeoJSON into a GeoDataFrame (gdf_ar).
# NOTE(review): the file name suggests arsenic samples already joined with geology attributes - confirm.

gdf_ar = gpd.read_file('data/gis/point/ar_samples_w_geol.geojson')


## Calculate Autocovariate 

The minimum Euclidean distance (bandwidth) at which no private well had zero neighbors was 1976 meters; this value ($d_{ij}$) was used in the analysis.

# Feature Engineering

Convert elevated arsenic, bedrock type, geologic belt, and well depth into dummy variables for modeling. Cross-tabulate belt and rock type to gauge whether an interaction term is needed, using newly engineered crosstab variables.

In [6]:
# Check category levels reflect original study
# (number of samples in each geologic belt, taken from the `belt2` column)
gdf_ar['belt2'].value_counts()

Kings Mountain Belt    319
Charlotte Belt         278
Inner Piedmont         129
Name: belt2, dtype: int64

In [7]:
# Number of samples in each rock type (`type` column)
gdf_ar['type'].value_counts()

Intrusive Rocks      400
Metamorphic Rocks    326
Name: type, dtype: int64

In [8]:
# Sample counts for every belt (rows) x rock type (columns) combination

pd.crosstab(index=gdf_ar['belt2'], columns=gdf_ar['type'])

type,Intrusive Rocks,Metamorphic Rocks
belt2,Unnamed: 1_level_1,Unnamed: 2_level_1
Charlotte Belt,231,47
Inner Piedmont,31,98
Kings Mountain Belt,138,181


In [9]:
# Sample counts per formation code (columns) within each belt / rock type pair (rows)

pd.crosstab(
    index=[gdf_ar['belt2'], gdf_ar['type']],
    columns=gdf_ar['geocode'],
)

Unnamed: 0_level_0,geocode,CZab,CZbg,CZbl,CZfv,CZg,CZms,DOg,Mc,OCg,PPmg,PzZq,Zbt
belt2,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Charlotte Belt,Intrusive Rocks,0,0,0,0,27,0,36,0,0,0,168,0
Charlotte Belt,Metamorphic Rocks,0,0,0,47,0,0,0,0,0,0,0,0
Inner Piedmont,Intrusive Rocks,0,0,0,0,0,0,0,28,3,0,0,0
Inner Piedmont,Metamorphic Rocks,5,3,0,0,0,90,0,0,0,0,0,0
Kings Mountain Belt,Intrusive Rocks,0,0,0,0,0,0,0,2,0,128,8,0
Kings Mountain Belt,Metamorphic Rocks,0,0,59,0,0,0,0,0,0,0,0,122



Simplify belt and rock type values

In [11]:
# Shorten the belt and rock type labels to two-letter codes.
# belt2: "Charlotte Belt" -> CB, "Inner Piedmont" -> IP, "Kings Mountain Belt" -> KM
# type:  "Intrusive Rocks" -> IR, "Metamorphic Rocks" -> MR

gdf_ar['belt2'] = gdf_ar['belt2'].replace({
    'Charlotte Belt': 'CB',
    'Inner Piedmont': 'IP',
    'Kings Mountain Belt': 'KM',
})
gdf_ar['type'] = gdf_ar['type'].replace({
    'Intrusive Rocks': 'IR',
    'Metamorphic Rocks': 'MR',
})



Crossing belt with rock type would result in 6 columns, whereas the formation code (`geocode`) provides more granularity with 12 columns.

In [12]:
# Build the combined belt / rock type label, e.g. "CB_IR"

gdf_ar['belt_type'] = gdf_ar['belt2'].str.cat(gdf_ar['type'], sep='_')

# Show how many samples fall in each combined category
gdf_ar['belt_type'].value_counts()

CB_IR    231
KM_MR    181
KM_IR    138
IP_MR     98
CB_MR     47
IP_IR     31
Name: belt_type, dtype: int64

---

Delete everything below once proper data with full well depth and pH is obtained

In [15]:
# Placeholder imputation until complete well depth and pH data are available.

# if depth is missing, replace with 200 for now
# (assign the result back instead of calling fillna(inplace=True) on the selected
# column: under pandas Copy-on-Write the in-place call only modifies a temporary
# object and leaves gdf_ar unchanged)
gdf_ar['depth'] = gdf_ar['depth'].fillna(200)

# if pH is missing or equal to 0, replace with 7 for now
gdf_ar['ph'] = gdf_ar['ph'].fillna(7).replace(0, 7)

---

In [None]:
# Create categorical variable for well depth, per the study, where depth categories are <150, 150-300, and 300+
# Bins are right-closed, so 150 falls in '<150' and 300 in '150-300' (same as before).
# include_lowest=True keeps a depth of exactly 0 in '<150', and the open-ended top bin
# keeps depths above 1000 in '300+'; with the previous bins both cases fell outside
# every interval and were silently turned into NaN.

gdf_ar['depth_cat'] = pd.cut(
    gdf_ar['depth'],
    bins=[0, 150, 300, np.inf],
    labels=['<150', '150-300', '300+'],
    include_lowest=True,
)

In [None]:
# One-hot encode the categorical predictors: belt_type, geocode, depth_cat, and group.
# NOTE: get_dummies replaces the listed columns with their indicator columns, so this
# cell cannot be re-run without first re-running the cells that create those columns.

gdf_ar = pd.get_dummies(
    data=gdf_ar,
    columns=['belt_type', 'geocode', 'depth_cat', 'group'],
)

