## Setup

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 100)

In [41]:
acc = pd.read_csv('accomack-full.csv')
fauq = pd.read_csv('fauquier-full.csv')
green = pd.read_csv('greensville-full.csv')
han = pd.read_csv('hanover-full.csv')
rock = pd.read_csv('rockingham-full.csv')

In [4]:
path = '/Users/hananur/SchoolWork/Capstone/'
counties = ['Accomack', 'Fauquier', 'Greensville', 'Hanover', 'Rockingham', 'Wise']
final_df = pd.DataFrame()
for county in counties:
    df = pd.read_csv(path+county+'.csv')
    df = df[['Week', 'DSCI']]
    df.Week = pd.to_datetime(df.Week)
    df = pd.DataFrame(df.groupby(df.Week.dt.year)['DSCI'])
    df[1] = df[1].apply(lambda x: x.mean())
    df.columns = ['Year', county]
    df = df.set_index('Year')
    if len(final_df) > 0:
        final_df = final_df.join(df, on=['Year'], how='left')
    else:
        final_df = df.copy()
final_df = pd.melt(final_df.reset_index(), id_vars = ['Year'], value_vars = counties)
final_df = final_df.rename({'value': 'DSCI', 'variable':'County'}, axis=1)
final_df

Unnamed: 0,Year,County,DSCI
0,2000,Accomack,5.769231
1,2001,Accomack,52.923077
2,2002,Accomack,224.962264
3,2003,Accomack,0.000000
4,2004,Accomack,0.000000
...,...,...,...
145,2020,Wise,5.538462
146,2021,Wise,10.057692
147,2022,Wise,38.807692
148,2023,Wise,80.153846


In [5]:
final_df.to_csv('drought-index.csv')

In [6]:
pd.read_csv('drought-index.csv')

Unnamed: 0.1,Unnamed: 0,Year,County,DSCI
0,0,2000,Accomack,5.769231
1,1,2001,Accomack,52.923077
2,2,2002,Accomack,224.962264
3,3,2003,Accomack,0.000000
4,4,2004,Accomack,0.000000
...,...,...,...,...
145,145,2020,Wise,5.538462
146,146,2021,Wise,10.057692
147,147,2022,Wise,38.807692
148,148,2023,Wise,80.153846


## Merge all data

In [4]:
data_path = 'individual-datasets/'
income = pd.read_csv(data_path+'Income_per_capita_data.csv')
drought = pd.read_csv(data_path+'drought-index.csv')
co2 = pd.read_csv(data_path+'VA_CO2Emissions.csv')
crop = pd.read_csv(data_path+'crop_data.csv')
temp = pd.read_excel(data_path+'temp_precip.xlsx')
income = income[['Year', 'Income', 'County']]
income = income.set_index('County').rename({'Faquier':'Fauquier'}, axis=0).reset_index()
drought = drought[['Year', 'County', 'DSCI']]
drought = drought[(drought['Year'] > 2007) & (drought['Year'] < 2023)]
co2 = co2[['Year', 'Coal', 'Petroleum Products', 'Natural Gas', 'Total', 'Sector']]
temp = temp.rename({'County ': 'County'}, axis=1)
#co2 = co2[(co2['Year'] > 2007) & (co2['Year'] < 2023)]

#### Goal: Merge tables with matching primary keys, and update with co2 table on matching year

- income - pk (year and county) 2008-2022
- drought - pk (year and county) 2008-2022
- temp - pk (year and county) 2008-2022
- crop - pk (year, county, and crop) 2008-2022
- co2 - pk year 1970-2021

In [5]:
final_df = pd.DataFrame()
tables = [income, drought, temp]

for table in tables:
    if len(final_df) == 0:
        final_df = table.copy()
    else:
        final_df = pd.merge(final_df, table, on=['Year', 'County'], how='left')
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   County  90 non-null     object 
 1   Year    90 non-null     int64  
 2   Income  90 non-null     int64  
 3   DSCI    90 non-null     float64
 4   Precip  90 non-null     float64
 5   Temp    90 non-null     float64
dtypes: float64(3), int64(2), object(1)
memory usage: 4.3+ KB


In [8]:
final_df.to_csv('income-dsci-temp-data.csv', index=False)

In [177]:
final_df = pd.merge(final_df, crop, 
                    left_on=['Year', 'County'],
                    right_on=['Year', 'County'], 
                    how='inner')
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3286 entries, 0 to 3285
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   County      3286 non-null   object 
 1   Year        3286 non-null   int64  
 2   Income      3286 non-null   int64  
 3   DSCI        3286 non-null   float64
 4   Precip      3286 non-null   float64
 5   Temp        3286 non-null   float64
 6   Count       3286 non-null   int64  
 7   Percentage  3286 non-null   float64
 8   Crop        3269 non-null   object 
dtypes: float64(4), int64(3), object(2)
memory usage: 231.2+ KB


In [178]:
final_df = pd.merge(final_df, co2, 
                    on=['Year'],
                    how='left')
final_df

Unnamed: 0,County,Year,Income,DSCI,Precip,Temp,Count,Percentage,Crop,Coal,Petroleum Products,Natural Gas,Total,Sector
0,Accomack,2008,32179,97.962264,44.92,58.5,140068,0.019369,Corn,0.000000,2.589419,4.384754,6.974173,Residential
1,Accomack,2008,32179,97.962264,44.92,58.5,140068,0.019369,Corn,0.187220,1.071147,3.685261,4.943629,Commercial
2,Accomack,2008,32179,97.962264,44.92,58.5,140068,0.019369,Corn,7.667449,5.134328,3.550514,16.352290,Industrial
3,Accomack,2008,32179,97.962264,44.92,58.5,140068,0.019369,Corn,0.000000,51.240860,0.473097,51.713958,Transportation
4,Accomack,2008,32179,97.962264,44.92,58.5,140068,0.019369,Corn,31.632810,0.901568,4.244781,36.779160,Electic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18406,Wise,2022,41103,38.807692,52.26,53.1,104,0.000039,Woody_Wetlands,,,,,
18407,Wise,2022,41103,38.807692,52.26,53.1,42,0.000016,Herbaceous_Wetlands,,,,,
18408,Wise,2022,41103,38.807692,52.26,53.1,5,0.000002,Triticale,,,,,
18409,Wise,2022,41103,38.807692,52.26,53.1,17,0.000006,Dbl_Crop_WinWht/Corn,,,,,


In [179]:
final_df.to_csv('county-data.csv', index=False)

In [184]:
df = pd.read_csv('county-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18411 entries, 0 to 18410
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   County              18411 non-null  object 
 1   Year                18411 non-null  int64  
 2   Income              18411 non-null  int64  
 3   DSCI                18411 non-null  float64
 4   Precip              18411 non-null  float64
 5   Temp                18411 non-null  float64
 6   Count               18411 non-null  int64  
 7   Percentage          18411 non-null  float64
 8   Crop                18334 non-null  object 
 9   Coal                18150 non-null  float64
 10  Petroleum Products  18150 non-null  float64
 11  Natural Gas         18150 non-null  float64
 12  Total               18150 non-null  float64
 13  Sector              18150 non-null  object 
dtypes: float64(8), int64(3), object(3)
memory usage: 2.0+ MB


In [185]:
df.isna().sum()

County                  0
Year                    0
Income                  0
DSCI                    0
Precip                  0
Temp                    0
Count                   0
Percentage              0
Crop                   77
Coal                  261
Petroleum Products    261
Natural Gas           261
Total                 261
Sector                261
dtype: int64

### Note:
Some NAN values due to the crop file having missing values. CO2 data only goes to 2021, so no data for 2022 (resulting missing data for 261 rows).