In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel("../data/csv/sampled_wells.xlsx", converters={'Collection Date': str,
                                                        'City': lambda x: str(x.strip())})

In [3]:
# number of records

len(df)

1714

In [4]:
# selects columns

df = df[["Address",
         "City",
         "State",
         "ZipCode",
         "Well Permit #",
         "STARLIMS ID",
         "Collection Date",
         "Arsenic",
         "pH"]]

In [5]:
# renames columns

df.columns = ["add","city","state","zip","id","altid","date","ar","ph"]

In [6]:
# Checks for missing values

df.isnull().sum()

add        0
city       1
state      0
zip        0
id       614
altid      0
date       0
ar        13
ph        11
dtype: int64

In [7]:
# checks for typos in city names

set(df['city'])

{'ALEXIS',
 'BELMONT',
 'BESSEMER CITY',
 'Bassemer City',
 'Belmont',
 'Bessemer City',
 'CHERRYVILLE',
 'CRAMERTON',
 'CROUSE',
 'Cherryville',
 'Crouse',
 'DALLAS',
 'Dallas',
 'GASTONIA',
 'Gastonia',
 'IRON STATION',
 'KINGS MOUNTAIN',
 'KINGS MOUTAIN',
 'King Mtn 1Kings Mountain',
 'Kings Mountain',
 'LINCOLNTON',
 'LOWEL',
 'LOWELL',
 'Lincolnton',
 'MC ADENVILLE',
 'MOUNT HOLLY',
 'MT HOLLY',
 'Mount Holly',
 'Mt. Holly',
 'PINEVILLE',
 'STALEY',
 'STANLEY',
 'Stanley',
 nan}

In [8]:
# corrects typos for city

li = []

for each in df['city']:

    if each in ['GASTONIA', 'Gastonia']:
        li.append('GASTONIA')

    elif each in ['BELMONT', 'Belmont']:
        li.append('BELMONT')

    elif each in ['DALLAS', 'Dallas']:
        li.append('DALLAS')
    
    elif each in ['MOUNT HOLLY', 'Mt. Holly','MT HOLLY', 'Mount Holly']:
        li.append('MT HOLLY')

    elif each in ['STANLEY','Stanley', 'STALEY']:
        li.append('STANLEY')
    
    elif each in ['BESSEMER CITY', 'Bessemer City','Bassemer City']:
        li.append('BESSEMER CITY')
    
    elif each in ['KINGS MOUNTAIN', 'Kings Mountain','KINGS MOUTAIN','King Mtn 1Kings Mountain']:
        li.append('KINGS MTN')
    
    elif each in['CHERRYVILLE', 'Cherryville']:
        li.append('CHERRYVILLE')
    
    elif each in['LINCOLNTON','Lincolnton']:
        li.append('LINCOLNTON')
    
    elif each in['IRON STATION']:
        li.append('IRON STATION')

    elif each in['ALEXIS']:
        li.append('ALEXIS')

    elif each in['LOWEL','LOWELL']:
        li.append('LOWELL')

    elif each in['CROUSE', 'Crouse']:
        li.append('CROUSE')

    elif each in['CRAMERTON']: 
        li.append('CRAMERTON')

    elif each in ['MC ADENVILLE']:
        li.append('MCADENVILLE')

    elif each == 'PINEVILLE':
        li.append('PINEVILLE')
    else:
        li.append("")

df['city'] = li

df['city'].value_counts()

city
GASTONIA         360
BELMONT          285
DALLAS           254
MT HOLLY         183
STANLEY          168
BESSEMER CITY    162
KINGS MTN         99
CHERRYVILLE       90
LINCOLNTON        64
CROUSE            23
ALEXIS            16
LOWELL             3
CRAMERTON          3
PINEVILLE          1
MCADENVILLE        1
                   1
IRON STATION       1
Name: count, dtype: int64

In [9]:
# Converts long date to short date

df['date'] = df['date'].str[:10]
df['date'] = [x.strip() for x in df['date']]

df['date'] = df['date'].replace(regex=['2/1/2021'], value="2021-02-01")

In [10]:
df['date']

0       2011-01-03
1       2011-01-04
2       2011-01-04
3       2011-01-04
4       2011-01-20
           ...    
1709    2021-04-05
1710    2021-02-08
1711    2021-02-08
1712    2021-02-15
1713    2020-07-29
Name: date, Length: 1714, dtype: object

In [11]:
# converts date to datetime

df['date_tested'] = pd.to_datetime(df['date'])

In [12]:
# creates year column

df['year_tested'] = df['date_tested'].dt.year

In [13]:
# converts non-detect arsenic values to 0

li = []
for each in df['ar']:
    if each in('<0.001','< 0.005', '<0.005', '<0.01'):
        li.append(float(0))
    else:
        li.append(float(each))

df['ar'] = li

In [14]:
# Creates a new column to group arsenic values into 0 and 1, 0 for <0.001 and 1 for >=0.001

li = []
for each in df['ar']:
    if each < 0.001:
        li.append('0')
    else: li.append('1')

df['group'] = li

df['group'].value_counts()

group
0    1581
1     133
Name: count, dtype: int64

In [15]:

# Creates a new column to group arsenic values into 0 and 1, 0 for <0.005 and 1 for >=0.005

li = []
for each in df['ar']:
    if each < 0.005:
        li.append('0')
    else: li.append('1')

df['group_five'] = li


df['group_five'].value_counts()

group_five
0    1641
1      73
Name: count, dtype: int64

In [16]:

# Creates a new column to group arsenic values into 0 and 1, 0 for <0.005 and 1 for >=0.01  (MCL)

li = []
for each in df['ar']:
    if each < 0.01:
        li.append('0')
    else: li.append('1')

df['group_mcl'] = li


df['group_mcl'].value_counts()

group_mcl
0    1669
1      45
Name: count, dtype: int64

In [17]:
len(df)

1714

In [17]:
# if missing arsenic or ph values, drop the row

df = df.dropna(subset=['ar','ph'])

In [18]:
# removes leading and trailing spaces from city names

df['city'] = [s.strip() for s in df['city']]

# drops rows with missing city values

df = df[df['city'] != '']

In [19]:
# saves the cleaned data to a new file in data folder

df.to_csv("../data/csv/sampled_wells_cleaned.csv", index=False)