# Data cleaning and pre-processing

In [None]:
# Install the third-party packages imported in the next cell.
# These are notebook shell escapes (`!`), so they run in the kernel's shell.
!pip install numpy
!pip install pandas
!pip install pyproj

In [None]:
import warnings

import numpy as np
import pandas as pd
from pyproj import Proj, Transformer, transform

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas and
# pyproj further down; consider narrowing the filter once those are resolved.
warnings.filterwarnings("ignore")

# Show up to 30 columns when a DataFrame is displayed.
# The option is spelled out in full: the bare "max_columns" pattern is matched
# against all registered option names and pandas raises OptionError when more
# than one matches (e.g. display.max_columns and styler.render.max_columns).
pd.set_option("display.max_columns", 30)


# State list to check against dataset
# Full state names, in alphabetical order. Rows of the zipcode table whose
# `state_name` is not in this list are filtered out further down.
states = [
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
]

# Fetching US Zipcode data
# Columns retained from the raw zipcode file; `zip` ends up as the index.
columns_to_keep = ['zip', 'lat', 'lng', 'city', 'state_id', 'state_name', 'county_name']

# Read the file, keep only rows whose state_name appears in `states`,
# narrow to the columns above, and index the result by zip code.
us_zipcode = (
    pd.read_csv("Datasets/uszips.csv")
    .loc[lambda raw: raw["state_name"].isin(states), columns_to_keep]
    .set_index('zip')
)

# Fetching First street data
# Load the zip-level risk table and rename its `zipcode` column to `zip`
# so it shares a key name with us_zipcode.
aws_zip = pd.read_csv("Datasets/Zip_level_risk.csv").rename(columns={'zipcode': 'zip'})

# Inner join on `zip` (an index level on the left, a column on the right):
# only zip codes present in both tables are kept. The trailing selection
# fixes the column set and order of the combined frame.
main_dataframe = us_zipcode.merge(aws_zip, how='inner', on='zip')[
    ['county_name', 'zip', 'lat', 'lng', 'city', 'state_name', 'avg_risk_score_all']
]

# Defining projections and calculating Mercator coordinates
#
# Converts the lng/lat columns of main_dataframe (EPSG:4326) into
# EPSG:3857 x/y and stores them as the MercatorX / MercatorY columns.
#
# Despite the names, `outProj` (EPSG:4326) is the SOURCE projection and
# `inProj` (EPSG:3857) is the TARGET; the names are kept as they were.
# NOTE(review): the `init=` form is deprecated in pyproj 2+, but it is left
# untouched here because it pins these objects to lon/lat axis order, which
# any other code using `inProj` / `outProj` may rely on.
inProj = Proj(init='epsg:3857')
outProj = Proj(init='epsg:4326')

# Build the transformer once and convert every row in a single vectorized
# call. The previous per-row `transform(...)` call is deprecated and set up a
# new transformation for each row, which is very slow on a full zip table.
# always_xy=True makes the (lon, lat) -> (x, y) argument order explicit.
to_mercator = Transformer.from_proj(outProj, inProj, always_xy=True)
merc_x, merc_y = to_mercator.transform(
    main_dataframe["lng"].to_numpy(dtype=float),
    main_dataframe["lat"].to_numpy(dtype=float),
)

# `lons` / `lats` hold the projected x / y values (not degrees); they stay
# plain Python lists, as before.
lons, lats = merc_x.tolist(), merc_y.tolist()

main_dataframe["MercatorX"] = lons
main_dataframe["MercatorY"] = lats

# Fetching NOAA dataset and categorising into levels (optional to use)
noaa = pd.read_csv("Datasets/NOAA.csv")

# Distinct values of the Scenario column, in order of first appearance.
scenarios = [*noaa["Scenario"].unique()]

# Bucket the scenario labels by the substring they contain. The buckets are
# not forced to be disjoint: a label containing two of the markers lands in both.
scenarios_high = list(filter(lambda label: 'HIGH' in label, scenarios))
scenarios_med = list(filter(lambda label: 'MED' in label, scenarios))
scenarios_low = list(filter(lambda label: 'LOW' in label, scenarios))

# One frame per level, each renumbered from 0.
noaa_high = noaa.loc[noaa["Scenario"].isin(scenarios_high)].reset_index(drop=True)
noaa_med = noaa.loc[noaa["Scenario"].isin(scenarios_med)].reset_index(drop=True)
noaa_low = noaa.loc[noaa["Scenario"].isin(scenarios_low)].reset_index(drop=True)

# Saving datasets (disabled; uncomment the lines below to write the processed frames to CSV)

# main_dataframe.to_csv("Datasets/SLR_Zipcode.csv")
# noaa_high.to_csv("Datasets/NOAA_High.csv")
# noaa_med.to_csv("Datasets/NOAA_Med.csv")
# noaa_low.to_csv("Datasets/NOAA_Low.csv")