In [1]:
# Import dependencies 
import pandas as pd

# Clean climdiv_state_year.csv

In [2]:
# Load the raw state-level yearly temperature file into a DataFrame
df1 = pd.read_csv(filepath_or_buffer='Resources/data_raw/climdiv_state_year.csv')

# Report (rows, columns), then preview the first five records
print(df1.shape)
df1.head(n=5)

(6000, 4)


Unnamed: 0,fips,year,temp,tempc
0,1,1895,61.641667,16.467593
1,1,1896,64.266667,17.925926
2,1,1897,64.191667,17.884259
3,1,1898,62.983333,17.212963
4,1,1899,63.1,17.277778


In [3]:
# Check data types: show the dtype pandas inferred for every column of the
# state-level frame when it parsed the CSV
df1.dtypes

fips       int64
year       int64
temp     float64
tempc    float64
dtype: object

In [4]:
# Count the missing entries in each column (column-wise sum of the null mask)
df1.isnull().sum(axis=0)

fips     0
year     0
temp     0
tempc    0
dtype: int64

In [5]:
# Lookup table keyed by the integer code found in the fips column:
#   code -> [full state/territory name, two-letter abbreviation]
# based on: https://github.com/washingtonpost/data-2C-beyond-the-limit-usa/blob/main/data/raw/state.txt
state_dict = {
    1: ['Alabama', 'AL'],
    2: ['Alaska', 'AK'],
    4: ['Arizona', 'AZ'],
    5: ['Arkansas', 'AR'],
    6: ['California', 'CA'],
    8: ['Colorado', 'CO'],
    9: ['Connecticut', 'CT'],
    10: ['Delaware', 'DE'],
    11: ['District of Columbia', 'DC'],
    12: ['Florida', 'FL'],
    13: ['Georgia', 'GA'],
    15: ['Hawaii', 'HI'],
    16: ['Idaho', 'ID'],
    17: ['Illinois', 'IL'],
    18: ['Indiana', 'IN'],
    19: ['Iowa', 'IA'],
    20: ['Kansas', 'KS'],
    21: ['Kentucky', 'KY'],
    22: ['Louisiana', 'LA'],
    23: ['Maine', 'ME'],
    24: ['Maryland', 'MD'],
    25: ['Massachusetts', 'MA'],
    26: ['Michigan', 'MI'],
    27: ['Minnesota', 'MN'],
    28: ['Mississippi', 'MS'],
    29: ['Missouri', 'MO'],
    30: ['Montana', 'MT'],
    31: ['Nebraska', 'NE'],
    32: ['Nevada', 'NV'],
    33: ['New Hampshire', 'NH'],
    34: ['New Jersey', 'NJ'],
    35: ['New Mexico', 'NM'],
    36: ['New York', 'NY'],
    37: ['North Carolina', 'NC'],
    38: ['North Dakota', 'ND'],
    39: ['Ohio', 'OH'],
    40: ['Oklahoma', 'OK'],
    41: ['Oregon', 'OR'],
    42: ['Pennsylvania', 'PA'],
    44: ['Rhode Island', 'RI'],
    45: ['South Carolina', 'SC'],
    46: ['South Dakota', 'SD'],
    47: ['Tennessee', 'TN'],
    48: ['Texas', 'TX'],
    49: ['Utah', 'UT'],
    50: ['Vermont', 'VT'],
    51: ['Virginia', 'VA'],
    53: ['Washington', 'WA'],
    54: ['West Virginia', 'WV'],
    55: ['Wisconsin', 'WI'],
    56: ['Wyoming', 'WY'],
    60: ['American Samoa', 'AS'],
    66: ['Guam', 'GU'],
    69: ['Northern Mariana Islands', 'MP'],
    72: ['Puerto Rico', 'PR'],
    74: ['U.S. Minor Outlying Islands', 'UM'],
    78: ['U.S. Virgin Islands', 'VI'],
}

In [6]:
# Replace fips column with a column for state names and their abbreviations.
# Fail fast with a readable message if the data contains a code that state_dict
# does not cover; Series.map would otherwise turn such codes into NaN silently.
unknown_fips = sorted(set(df1['fips'].tolist()) - set(state_dict))
if unknown_fips:
    raise KeyError(f'fips codes missing from state_dict: {unknown_fips}')

# Translate the codes with dictionary-based maps instead of a row-wise lambda.
# The new columns are appended after the existing ones, in this order.
df1['state_name'] = df1['fips'].map({code: names[0] for code, names in state_dict.items()})
df1['state_abb'] = df1['fips'].map({code: names[1] for code, names in state_dict.items()})

# Drop original fips column (rebinding df1 rather than mutating with inplace=True)
df1 = df1.drop(columns='fips')
df1.head()

Unnamed: 0,year,temp,tempc,state_name,state_abb
0,1895,61.641667,16.467593,Alabama,AL
1,1896,64.266667,17.925926,Alabama,AL
2,1897,64.191667,17.884259,Alabama,AL
3,1898,62.983333,17.212963,Alabama,AL
4,1899,63.1,17.277778,Alabama,AL


In [7]:
# Rename Columns by name rather than by position: assigning a full list to
# df1.columns would silently attach the wrong labels if the column order
# produced upstream ever changed. Only 'temp' needs a new label.
df1 = df1.rename(columns={'temp': 'tempf'})

# Re-arrange Columns (raises KeyError if any expected column is absent)
df1 = df1[['state_name','state_abb', 'year', 'tempf', 'tempc']].copy()
print(df1.shape)
df1.head()

(6000, 5)


Unnamed: 0,state_name,state_abb,year,tempf,tempc
0,Alabama,AL,1895,61.641667,16.467593
1,Alabama,AL,1896,64.266667,17.925926
2,Alabama,AL,1897,64.191667,17.884259
3,Alabama,AL,1898,62.983333,17.212963
4,Alabama,AL,1899,63.1,17.277778


# Clean climdiv_national_year.csv

In [8]:
# Load the raw national yearly temperature file into a DataFrame
df2 = pd.read_csv(filepath_or_buffer='Resources/data_raw/climdiv_national_year.csv')

# Report (rows, columns), then preview the first five records
print(df2.shape)
df2.head(n=5)

(125, 3)


Unnamed: 0,year,temp,tempc
0,1895,50.3375,10.1875
1,1896,51.993333,11.107407
2,1897,51.556667,10.864815
3,1898,51.431667,10.79537
4,1899,51.009167,10.560648


In [9]:
# Check data types: show the dtype pandas inferred for every column of the
# national frame when it parsed the CSV
df2.dtypes

year       int64
temp     float64
tempc    float64
dtype: object

In [10]:
# Count the missing entries in each column (column-wise sum of the null mask)
df2.isnull().sum(axis=0)

year     0
temp     0
tempc    0
dtype: int64

In [11]:
# Rename Columns by name rather than by position, so a change in the CSV's
# column order cannot silently attach the wrong label to a column.
# Only 'temp' needs a new label; 'year' and 'tempc' keep their names.
df2 = df2.rename(columns={'temp': 'tempf'})
print(df2.shape)
df2.head()

(125, 3)


Unnamed: 0,year,tempf,tempc
0,1895,50.3375,10.1875
1,1896,51.993333,11.107407
2,1897,51.556667,10.864815
3,1898,51.431667,10.79537
4,1899,51.009167,10.560648


# Clean model_state.csv

In [12]:
# Load the raw per-state model results file into a DataFrame
df3 = pd.read_csv(filepath_or_buffer='Resources/data_raw/model_state.csv')

# Report (rows, columns), then preview the first five records
print(df3.shape)
df3.head(n=5)

(48, 10)


Unnamed: 0,fips,Fall,Spring,Summer,Winter,max_warming_season,Annual,STUSAB,STATE_NAME,STATENS
0,1,-0.195668,-0.105862,-0.325009,0.458526,Winter,-0.035048,AL,Alabama,1779775
1,4,1.203951,1.38448,1.274455,1.388388,Winter,1.31988,AZ,Arizona,1779777
2,5,-0.04254,0.266399,0.058596,0.532247,Winter,0.214074,AR,Arkansas,68085
3,6,1.570921,1.449242,1.478335,1.41243,Fall,1.480561,CA,California,1779778
4,8,1.055309,1.43691,1.367845,1.838758,Winter,1.438589,CO,Colorado,1779779


In [13]:
# Check data types: show the dtype pandas inferred for every column of the
# model frame when it parsed the CSV
df3.dtypes

fips                    int64
Fall                  float64
Spring                float64
Summer                float64
Winter                float64
max_warming_season     object
Annual                float64
STUSAB                 object
STATE_NAME             object
STATENS                 int64
dtype: object

In [14]:
# Count the missing entries in each column (column-wise sum of the null mask)
df3.isnull().sum(axis=0)

fips                  0
Fall                  0
Spring                0
Summer                0
Winter                0
max_warming_season    0
Annual                0
STUSAB                0
STATE_NAME            0
STATENS               0
dtype: int64

In [15]:
# Source column -> output label for every column that is kept.
# The key order also fixes the column order of the resulting frame.
model_state_columns = {
    'STATE_NAME': 'state_name',
    'STUSAB': 'state_abb',
    'Annual': 'annual',
    'Fall': 'fall',
    'Spring': 'spring',
    'Summer': 'summer',
    'Winter': 'winter',
    'max_warming_season': 'max_warming_season',
}

# Select the kept columns, then relabel them in one step
df3 = df3[list(model_state_columns)].rename(columns=model_state_columns)
print(df3.shape)
df3.head()

(48, 8)


Unnamed: 0,state_name,state_abb,annual,fall,spring,summer,winter,max_warming_season
0,Alabama,AL,-0.035048,-0.195668,-0.105862,-0.325009,0.458526,Winter
1,Arizona,AZ,1.31988,1.203951,1.38448,1.274455,1.388388,Winter
2,Arkansas,AR,0.214074,-0.04254,0.266399,0.058596,0.532247,Winter
3,California,CA,1.480561,1.570921,1.449242,1.478335,1.41243,Fall
4,Colorado,CO,1.438589,1.055309,1.43691,1.367845,1.838758,Winter


# Clean use_pop_gdp_1.csv

In [16]:
# Load the raw population file into a DataFrame
# Resident population including Armed Forces, thousands
df4 = pd.read_csv(filepath_or_buffer='Resources/data_raw/use_pop_gdp_1.csv')

# Report (rows, columns), then preview the first five records
print(df4.shape)
df4.head(n=5)

(52, 60)


Unnamed: 0,State,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,AK,229,238,246,256,263,271,271,278,285,...,699,714,722,730,737,736,737,741,740,735
1,AL,3274,3316,3323,3358,3395,3443,3464,3458,3446,...,4758,4785,4799,4816,4830,4842,4852,4864,4874,4888
2,AR,1789,1806,1853,1875,1897,1894,1899,1901,1902,...,2897,2922,2940,2952,2959,2967,2978,2990,3001,3010
3,AZ,1321,1407,1471,1521,1556,1584,1614,1646,1682,...,6343,6407,6473,6555,6633,6730,6830,6941,7044,7158
4,CA,15870,16497,17072,17668,18151,18585,18858,19176,19394,...,36961,37320,37642,37949,38261,38597,38918,39167,39358,39462


In [17]:
# Check data types: show the dtype pandas inferred for every column of the
# wide population frame (one column per year) when it parsed the CSV
df4.dtypes

State    object
1960      int64
1961      int64
1962      int64
1963      int64
1964      int64
1965      int64
1966      int64
1967      int64
1968      int64
1969      int64
1970      int64
1971      int64
1972      int64
1973      int64
1974      int64
1975      int64
1976      int64
1977      int64
1978      int64
1979      int64
1980      int64
1981      int64
1982      int64
1983      int64
1984      int64
1985      int64
1986      int64
1987      int64
1988      int64
1989      int64
1990      int64
1991      int64
1992      int64
1993      int64
1994      int64
1995      int64
1996      int64
1997      int64
1998      int64
1999      int64
2000      int64
2001      int64
2002      int64
2003      int64
2004      int64
2005      int64
2006      int64
2007      int64
2008      int64
2009      int64
2010      int64
2011      int64
2012      int64
2013      int64
2014      int64
2015      int64
2016      int64
2017      int64
2018      int64
dtype: object

In [18]:
# Count the missing entries in each column (column-wise sum of the null mask)
df4.isnull().sum(axis=0)

State    0
1960     0
1961     0
1962     0
1963     0
1964     0
1965     0
1966     0
1967     0
1968     0
1969     0
1970     0
1971     0
1972     0
1973     0
1974     0
1975     0
1976     0
1977     0
1978     0
1979     0
1980     0
1981     0
1982     0
1983     0
1984     0
1985     0
1986     0
1987     0
1988     0
1989     0
1990     0
1991     0
1992     0
1993     0
1994     0
1995     0
1996     0
1997     0
1998     0
1999     0
2000     0
2001     0
2002     0
2003     0
2004     0
2005     0
2006     0
2007     0
2008     0
2009     0
2010     0
2011     0
2012     0
2013     0
2014     0
2015     0
2016     0
2017     0
2018     0
dtype: int64

In [19]:
# Reshape the dataframe from wide (one column per year) to long (one row per
# state and year) using the pandas melt method. value_vars is left at its
# default, which melts every column that is not listed in id_vars.
df4 = pd.melt(df4, id_vars=['State'],
        var_name='year',
        value_name='population_thousands')

# Rename the state column by name so the label does not depend on column position
df4 = df4.rename(columns={'State': 'state_abb'})

# The year values come from the CSV header labels, so melt yields them as strings.
# Cast to int64 so the column has the same dtype as the integer year columns of
# the temperature frames and can be compared or joined with them.
df4 = df4.astype({'year': 'int64'})

print(df4.shape)
df4

(3068, 3)


Unnamed: 0,state_abb,year,population_thousands
0,AK,1960,229
1,AL,1960,3274
2,AR,1960,1789
3,AZ,1960,1321
4,CA,1960,15870
...,...,...,...
3063,WA,2018,7524
3064,WI,2018,5807
3065,WV,2018,1804
3066,WY,2018,578


# Clean use_pop_gdp_2.csv

In [20]:
# Real gross domestic product (GDP), million chained (2012) dollars