In [1]:
# dependencies
import datetime
from pathlib import Path

import pandas as pd

# location of the raw precipitation csv
precipFile = "./precip_data.csv"

# load the csv into a data frame and display it
precipData = pd.read_csv(filepath_or_buffer=precipFile)
precipData

Unnamed: 0,Location ID,Location,Date,Value,Rank,Anomaly (1901-2000 base period),1901-2000 Mean
0,CA-001,Alameda County,189501,8.43,119,4.46,3.97
1,CA-001,Alameda County,189502,2.09,54,-1.25,3.34
2,CA-001,Alameda County,189503,1.97,51,-0.90,2.87
3,CA-001,Alameda County,189504,1.75,90,0.41,1.34
4,CA-001,Alameda County,189505,1.20,110,0.62,0.58
...,...,...,...,...,...,...,...
88271,CA-115,Yuba County,202106,0.02,24,-0.43,0.45
88272,CA-115,Yuba County,202107,0.01,92,0.00,0.01
88273,CA-115,Yuba County,202108,0.00,1,-0.07,0.07
88274,CA-115,Yuba County,202109,0.23,70,-0.28,0.51


In [2]:
# see all columns: list the column labels of the loaded data frame
precipData.columns

Index(['Location ID', 'Location', 'Date', 'Value', 'Rank',
       'Anomaly (1901-2000 base period)', '1901-2000 Mean'],
      dtype='object')

In [3]:
# keep only the date, location and value columns, then give the latter two
# their working names ("County" and "Precip")
precipData = precipData[["Date", "Location", "Value"]].rename(
    columns={"Location": "County", "Value": "Precip"}
)

In [4]:
# inspect the data type of each column
precipData.dtypes

Date        int64
County     object
Precip    float64
dtype: object

In [5]:
# remove " County" from the county column to be consistent with other datasets.
# The .str accessor is applied directly, without an astype(str) cast, so that
# missing values stay null instead of turning into the literal string "nan"
# (which a later dropna() would not be able to remove).
# regex=False makes the literal (non-regex) match explicit.
precipData["County"] = precipData["County"].str.replace(" County", "", regex=False)

In [6]:
# earliest year to keep (records during or after this year are retained)
START_YEAR = 2013

# convert date column: parse the YYYYMM values (e.g. 189501) into timestamps
parsedDates = pd.to_datetime(precipData["Date"].astype(str), format="%Y%m")

# filter on the parsed year directly (no temporary "Year" column needed) and
# take an explicit copy, so the assignment below writes to an independent frame
# rather than to a slice of the original (avoids SettingWithCopyWarning)
isRecent = parsedDates.dt.year >= START_YEAR
precipData = precipData.loc[isRecent, ["Date", "County", "Precip"]].copy()

# edit the date column to match the format of the other datasets (YYYY-MM);
# the vectorized .dt.strftime replaces a row-by-row apply with a lambda
precipData["Date"] = parsedDates[isRecent].dt.strftime("%Y-%m")
precipData

Unnamed: 0,Date,County,Precip
1416,2013-01,Alameda,0.81
1417,2013-02,Alameda,0.47
1418,2013-03,Alameda,0.63
1419,2013-04,Alameda,0.69
1420,2013-05,Alameda,0.07
...,...,...,...
88271,2021-06,Yuba,0.02
88272,2021-07,Yuba,0.01
88273,2021-08,Yuba,0.00
88274,2021-09,Yuba,0.23


In [8]:
# discard rows that contain missing values, then renumber the rows from 0
precipData = precipData.dropna().reset_index(drop=True)
precipData

Unnamed: 0,Date,County,Precip
0,2013-01,Alameda,0.81
1,2013-02,Alameda,0.47
2,2013-03,Alameda,0.63
3,2013-04,Alameda,0.69
4,2013-05,Alameda,0.07
...,...,...,...
6143,2021-06,Yuba,0.02
6144,2021-07,Yuba,0.01
6145,2021-08,Yuba,0.00
6146,2021-09,Yuba,0.23


In [9]:
# export as csv; to_csv does not create missing parent folders, so make sure
# the output directory exists before writing
cleanFile = Path("./clean/precip_data_clean.csv")
cleanFile.parent.mkdir(parents=True, exist_ok=True)
precipData.to_csv(cleanFile, index=False)