In [17]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
import glob

In [2]:
# set working directory to where data files are
# REPLACE the following line with the location where you downloaded the workshop files
FILES = "/home/jonathan/research/CCSS/python-summer-series"

os.chdir(os.path.join(FILES, "data", "weather", "data"))

In [9]:
# import and concatenate all files in folder, keeping track of which fips each row came from
files = [pd.read_stata(filename).assign(file=filename) for filename in glob.glob("*.dta")]
datasets = pd.concat(files)
display(datasets)

Unnamed: 0,gridNumber,dateNum,tMin,tMax,prec,file
0,194286,2019-01-01,-25.559046,-8.512098,0.895917,fips56043.dta
1,194286,2019-01-02,-22.562197,-8.329514,0.159664,fips56043.dta
2,194286,2019-01-03,-13.759304,0.303681,0.159687,fips56043.dta
3,194286,2019-01-04,-7.499957,6.291142,0.159710,fips56043.dta
4,194286,2019-01-05,-6.361038,5.767238,0.159733,fips56043.dta
...,...,...,...,...,...,...
141980,216854,2019-12-27,-9.501097,3.487881,0.059455,fips56045.dta
141981,216854,2019-12-28,-6.479210,-1.094735,0.191162,fips56045.dta
141982,216854,2019-12-29,-7.079095,-2.725103,0.152250,fips56045.dta
141983,216854,2019-12-30,-11.224467,-0.607967,0.059458,fips56045.dta


In [10]:
# create new variables, convert temperature to fahrenheit
datasets["tmin_f"] = (9/5 * datasets["tMin"]) + 32
datasets["tmax_f"] = (9/5 * datasets["tMax"]) + 32
# compute average fahrenheit
datasets["tavg_f"] = (datasets["tmin_f"] + datasets["tmax_f"]) / 2
display(datasets)

Unnamed: 0,gridNumber,dateNum,tMin,tMax,prec,file,tmin_f,tmax_f,tavg_f
0,194286,2019-01-01,-25.559046,-8.512098,0.895917,fips56043.dta,-14.006283,16.678223,1.335970
1,194286,2019-01-02,-22.562197,-8.329514,0.159664,fips56043.dta,-8.611954,17.006876,4.197461
2,194286,2019-01-03,-13.759304,0.303681,0.159687,fips56043.dta,7.233253,32.546627,19.889940
3,194286,2019-01-04,-7.499957,6.291142,0.159710,fips56043.dta,18.500078,43.324055,30.912067
4,194286,2019-01-05,-6.361038,5.767238,0.159733,fips56043.dta,20.550133,42.381027,31.465580
...,...,...,...,...,...,...,...,...,...
141980,216854,2019-12-27,-9.501097,3.487881,0.059455,fips56045.dta,14.898026,38.278183,26.588104
141981,216854,2019-12-28,-6.479210,-1.094735,0.191162,fips56045.dta,20.337421,30.029478,25.183449
141982,216854,2019-12-29,-7.079095,-2.725103,0.152250,fips56045.dta,19.257629,27.094814,23.176222
141983,216854,2019-12-30,-11.224467,-0.607967,0.059458,fips56045.dta,11.795959,30.905661,21.350811


In [21]:
# collapse dataset: 1 date for each fips code
datasets_fc = datasets.groupby(["dateNum", "file"]).aggregate({"tmin_f": "mean", "tmax_f": "mean", "tavg_f": "mean"}).reset_index()
display(datasets_fc)

Unnamed: 0,dateNum,file,tmin_f,tmax_f,tavg_f
0,2019-01-01,fips56041.dta,-13.341657,14.449871,0.554107
1,2019-01-01,fips56043.dta,-12.845829,9.505347,-1.670241
2,2019-01-01,fips56045.dta,-12.746417,11.538298,-0.604060
3,2019-01-02,fips56041.dta,-5.961773,17.638052,5.838140
4,2019-01-02,fips56043.dta,-7.638713,15.000445,3.680866
...,...,...,...,...,...
1090,2019-12-30,fips56043.dta,2.335490,25.264975,13.800232
1091,2019-12-30,fips56045.dta,13.420627,29.852488,21.636557
1092,2019-12-31,fips56041.dta,-5.711573,20.640247,7.464337
1093,2019-12-31,fips56043.dta,3.315377,28.247398,15.781387


In [22]:
# print summary table
display(datasets_fc.groupby("file").aggregate({"tmin_f": "min", "tmax_f": "max", "tavg_f": "mean"}))

Unnamed: 0_level_0,tmin_f,tmax_f,tavg_f
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fips56041.dta,-13.341657,86.840057,38.971722
fips56043.dta,-15.298845,94.049461,42.121906
fips56045.dta,-17.705271,91.403145,43.040779


In [23]:
# perform ANOVA test: is average significantly different for one fips compared to the others?
# we will use statsmodels: https://www.statsmodels.org/stable/anova.html
lm = smf.ols("tavg_f ~ file", data=datasets_fc).fit()
display(sm.stats.anova_lm(lm))

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
file,2.0,3324.57142,1662.28571,4.340604,0.013254
Residual,1092.0,418194.292986,382.961807,,
