In [23]:
#Import necessary libraries 
import os
import numpy as np 
import glob
import pandas as pd

In [14]:
# Let us load the weatehr data 
weather_dir = '../data/raw/weather'
all_weather_files = glob.glob(os.path.join(weather_dir, "*.csv"))

In [21]:
print(f"We have {len(all_weather_files)} weather data")

We have 792 weather data


In [25]:
# Let us create a list that would contain each of our dataframe

list_of_dfs = []
for f in all_weather_files:
    df = pd.read_csv(f)
    # Which year and county the file belongs to. We get this from the filename. 
    # From '2016_19001.csv' we extract 2016 and 19001
    filename = os.path.basename(f)
    year, fips_code = os.path.splitext(filename)[0].split('_')
    
    df['year'] = int(year)
    df['fips_code'] = int(fips_code)
    list_of_dfs.append(df)

df_weather_raw = pd.concat(list_of_dfs, ignore_index = True)

In [26]:
# Define summary statistics for temperature at 2m and precepitation
aggregations = {
    'T2M' : [
        'mean', # Average temperature for the season
        'max', # Hottest day's average temperature
        'min',  #Coldest day's average temerature
        'std' #Standard deviation of temperature
    ], 
    
    'PRECTOTCORR': [
        'sum', # Total precipation for entire season
        'max' # Single daya with most rain
    ]
}


In [27]:
# Group the weather data by wach year and each county and apply list of aggregations

df_weather_features = df_weather_raw.groupby(['year', 'fips_code']).agg(aggregations).reset_index()

In [29]:
df_weather_features.columns = ['_'.join(col).strip() for col in df_weather_features.columns.values]

In [31]:
df_weather_features.rename(columns={'year_' : 'year', 'fips_code_':'fips_code'}, inplace = True)

In [32]:
# Let us create extreme heat days and 
df_extreme_heat = df_weather_raw[df_weather_raw['T2M'] > 32]
extreme_heat_days = df_extreme_heat.groupby(['year', 'fips_code']).size().reset_index(name='extreme_heat_days')

In [33]:
df_weather_features = pd.merge(df_weather_features, extreme_heat_days, on = ['year', 'fips_code'], how = 'left')

In [34]:
df_weather_features['extreme_heat_days'].fillna(0, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_weather_features['extreme_heat_days'].fillna(0, inplace = True)


In [36]:
df_weather_features.head()

Unnamed: 0,year,fips_code,T2M_mean,T2M_max,T2M_min,T2M_std,PRECTOTCORR_sum,PRECTOTCORR_max,extreme_heat_days
0,2016,19001,19.122568,28.0,2.02,5.972346,684.91,42.52,0.0
1,2016,19003,19.456448,28.55,3.4,5.804227,758.36,39.75,0.0
2,2016,19005,17.386503,26.93,-1.76,6.388759,904.01,65.2,0.0
3,2016,19007,19.985355,28.86,2.33,5.958276,554.43,24.78,0.0
4,2016,19009,18.928087,27.92,2.32,5.904433,719.42,46.14,0.0


In [39]:
# We can now combine the weather features into the main dataset 

df_main = pd.read_csv('../data/processed/iowa_yield_with_coords.csv')

In [40]:
df_final = pd.merge(
    df_main, 
    df_weather_features, 
    left_on = ['year', 'full_fips_code'],
    right_on = ['year', 'fips_code'], 
    how = 'inner'
)

In [42]:
df_final.drop(columns = ['fips_code'], inplace = True)

In [43]:
df_final.head()

Unnamed: 0,year,state_alpha,county_name,county_code,commodity_desc,statisticcat_desc,Value,unit_desc,state_fips_str,county_fips_str,full_fips_code,latitude,longitude,T2M_mean,T2M_max,T2M_min,T2M_std,PRECTOTCORR_sum,PRECTOTCORR_max,extreme_heat_days
0,2023,IA,BUENA VISTA,21,CORN,YIELD,212.0,BU / ACRE,19,21,19021,42.741522,-95.141432,20.072623,31.87,0.63,6.325842,406.65,32.53,0.0
1,2022,IA,BUENA VISTA,21,CORN,YIELD,196.9,BU / ACRE,19,21,19021,42.741522,-95.141432,19.43224,31.12,-0.06,7.626875,387.74,22.53,0.0
2,2021,IA,BUENA VISTA,21,CORN,YIELD,203.9,BU / ACRE,19,21,19021,42.741522,-95.141432,19.849344,30.1,1.07,6.75755,450.16,25.16,0.0
3,2020,IA,BUENA VISTA,21,CORN,YIELD,183.9,BU / ACRE,19,21,19021,42.741522,-95.141432,18.81929,30.5,-3.21,7.60711,326.25,32.19,0.0
4,2019,IA,BUENA VISTA,21,CORN,YIELD,190.8,BU / ACRE,19,21,19021,42.741522,-95.141432,17.98541,28.31,0.03,6.170368,647.44,28.45,0.0


In [46]:
final_output_path = '../data/processed/final_model_data.csv'

In [47]:
df_final.to_csv(final_output_path, index = False)