In [1]:
import pandas as pd
import numpy as np

In [2]:
# load the permitted-wells sheet; Permit ID, zip and Date Permit Issued are read as strings

string_columns = ["Permit ID", "zip", "Date Permit Issued"]
df = pd.read_excel(
    "../data/csv/permitted_wells.xlsx",
    sheet_name="Very_Editted",
    converters={column: str for column in string_columns},
)


In [3]:
# list each distinct City value with its row count so misspelled names stand out

city_counts = df["City"].value_counts()
city_counts

City
GASTONIA         1900
DALLAS           1284
BESSEMER CITY    1161
MT HOLLY          911
STANLEY           880
BELMONT           766
KINGS MTN         642
CHERRYVILLE       450
LINCOLNTON        357
CROUSE            182
ALEXIS             74
CRAMERTON          24
IRON STATION        8
LOWELL              8
MCADENVILLE         3
Name: count, dtype: int64

In [4]:
# keep only the columns used in the rest of the notebook

keep_columns = [
    "Permit ID",
    "Addr_ZIP",
    "City",
    "zip",
    "XCOORD",
    "YCOORD",
    "Depth of Well (Feet)",
    "Date Permit Issued",
]
df = df[keep_columns]

In [5]:
# rename columns to short names
# (assignment is positional: the order must match the column selection in the previous cell)

short_names = ["id", "add_zip", "city", "zip", "X", "Y", "depth", "perm_date"]
df.columns = short_names

In [6]:
# number of missing values in each column

df.isna().sum()

id             0
add_zip      110
city         117
zip          111
X            628
Y            628
depth        399
perm_date    168
dtype: int64

---

**TODO: re-evaluate whether these records should be dropped once better data is obtained.**

In [7]:
# drop rows where EITHER add_zip or X is missing
# (how="any" is dropna's default: a row is removed if any listed column is null)

required_location_cols = ["add_zip", "X"]
df = df.dropna(subset=required_location_cols, how="any")
df.shape[0]

8139

In [8]:
# drop rows that have no permit id, then show the remaining row count

df = df.dropna(axis="index", subset=["id"])
df.shape[0]

8139

In [9]:
# drop rows where city or zip is missing, then show the remaining row count

required_place_cols = ["city", "zip"]
df = df.dropna(axis="index", subset=required_place_cols)
df.shape[0]

8133

---

In [10]:
# remove zip code from address column for matching purposes later
# NOTE(review): this drops the last 6 characters of add_zip, i.e. it assumes every
# value ends with a separator plus a 5-digit zip — confirm against the data.
# The vectorized .str slice leaves missing values as NaN instead of raising, and
# assign() returns a new frame with the index-aligned "add" column appended.

df = df.assign(add=df["add_zip"].str[:-6])

In [11]:
# parse the permit date and derive a year_built column from it
# (only the first 10 characters of the date string are kept before parsing)

date_text = df['perm_date'].str.slice(stop=10)
permit_dates = pd.to_datetime(date_text)
df['perm_date'] = permit_dates
df['year_built'] = permit_dates.dt.year

In [13]:
# write the cleaned frame to CSV in the data folder (row index is not written)

cleaned_csv_path = "../data/alt/csv/permitted_wells_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

---

**Check missingness by year range (wells with `year_built` before 2018 vs. 2018 or later)**

In [25]:
# total number of wells (rows) in the cleaned frame, including rows with no year_built

len(df.index)

8133

In [26]:
# number of wells with year_built before 2018
# (rows whose year_built is missing do not satisfy the comparison and are not counted)

before_2018 = df['year_built'].lt(2018)
len(df.loc[before_2018])

7997

In [27]:
# per-column null counts for wells with year_built before 2018

before_2018 = df['year_built'].lt(2018)
df.loc[before_2018].isna().sum()

id              0
add_zip         0
city            0
zip             0
X               0
Y               0
depth         284
perm_date       0
add             0
year_built      0
dtype: int64

In [28]:
# number of wells with year_built in 2018 or later
# (rows whose year_built is missing do not satisfy the comparison and are not counted)

from_2018 = df['year_built'].gt(2017)
len(df.loc[from_2018])

25

In [29]:
# per-column null counts for wells with year_built in 2018 or later

from_2018 = df['year_built'].gt(2017)
df.loc[from_2018].isna().sum()

id            0
add_zip       0
city          0
zip           0
X             0
Y             0
depth         2
perm_date     0
add           0
year_built    0
dtype: int64