# Hydroclimatic hazard - Combined_data
- Author: Eunkyoung Choi (kyoung.choi@colostate.edu)
- Version: April, 2022

In [2]:
#### Import packages
import numpy as np
import pandas as pd
import scipy.io
import geopandas as gpd
import matplotlib.pyplot as plt
from scipy import stats

# 1) Options

In [3]:
## options
'''
Please select which one to model:
crop_name = Maize, Soybeans, Spring Wheat, Sorghum
crop_yld_name = maize_yld, soy_yld, spr_wheat_yld, sorghum_yld
'''

# Crop selection: display label, yield column read from the yield table,
# and the token spliced into the input/output file names.
crop_name, crop_yld_name, crop_file_name = 'maize', 'maize_yld', 'maize'

# Study period; both bounds are applied inclusively when filtering by year.
first_yr, last_yr = 1981, 2020

In [6]:

def geoid_format(data, start_year=None, end_year=None):
    """Normalize county identifiers and dates, then keep the study period.

    The frame passed in is modified in place: ``GEOID`` becomes a string
    left-padded with zeros to five characters, ``time`` is converted to
    datetime, and ``year`` and ``state`` (first two characters of ``GEOID``)
    columns are added. The shape of the full frame is printed before the
    year filter is applied.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``GEOID`` and ``time`` columns.
    start_year, end_year : int, optional
        Inclusive bounds on ``year``. When omitted they fall back to the
        notebook-level ``first_yr`` / ``last_yr`` settings.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the rows whose year lies within the bounds, so
        later column assignments do not write into a filtered slice.
    """
    # Resolve the bounds lazily so the notebook settings are only required
    # when the caller does not pass explicit years.
    if start_year is None:
        start_year = first_yr
    if end_year is None:
        end_year = last_yr

    data['GEOID'] = data['GEOID'].astype(str).str.zfill(5)
    data['time'] = pd.to_datetime(data['time'])
    data['year'] = data['time'].dt.year
    data['state'] = data['GEOID'].str[:2]
    print(data.shape)

    in_period = (data['year'] >= start_year) & (data['year'] <= end_year)
    return data.loc[in_period].copy()

# 2) Soil moisture from NCA-LDAS
- 1979-2020 (values for 1 Jan 1979 are NaN)

In [11]:
## import data
# Read the soil-moisture table and normalize GEOID/time in one step.
sm_ldas = geoid_format(pd.read_parquet('0_soil_moisture_nca_ldas.gzip'))

## exclude leap day (Feb 29):
sm_ldas = sm_ldas.loc[(sm_ldas['time'].dt.month != 2) | (sm_ldas['time'].dt.day != 29)]

## exclude no cropland:
# Any county with at least one record where crop_count < 1 is removed entirely.
remove_geoid = sm_ldas.loc[sm_ldas['crop_count'] < 1, 'GEOID'].unique()
sm_ldas = sm_ldas.loc[~sm_ldas['GEOID'].isin(remove_geoid)]
sm_ldas.head()

array([365, 366], dtype=int64)

In [11]:
# Soil-moisture layers to store as float32.
column_lst = [
    'ldas_sm_0_10cm',
    'ldas_sm_10_40cm',
    'ldas_sm_40_100cm',
]

for column in column_lst:
    # Only float64 columns are converted; anything else is left as is.
    if sm_ldas[column].dtype != 'float64':
        continue
    sm_ldas[column] = sm_ldas[column].astype(np.float32)

# 3) Weather data from PRISM and AgERA5

In [29]:
# AgERA5 table, normalized and restricted to the study period by geoid_format.
weather = geoid_format(
    pd.read_parquet('agera5_cropland_weighted_avg_water_demand_27Mar2022.gzip')
)
weather.head()

In [62]:
# PRISM daily climate table; GEOID and time are normalized so they can be
# used as join keys against the AgERA5 table.
ds = pd.read_parquet(
    'prism_cropland_weighted_avg_daily_climate_usa_counties_1981_2020_27Mar2022.gzip'
)
ds['time'] = pd.to_datetime(ds['time'])
ds['GEOID'] = ds['GEOID'].astype(str).str.zfill(5)
ds.head()

Unnamed: 0,GEOID,county_name,time,county_count,prism_ppt,prism_vpdmean,prism_tmax,prism_tmin
0,1001,Autauga,1981-01-01,215691.0,0.0,5.553852,14.667161,-2.519873
1,1001,Autauga,1981-01-02,215691.0,0.0,6.471299,16.364265,-0.844994
2,1001,Autauga,1981-01-03,215691.0,0.0,5.27809,12.137314,-4.178577
3,1001,Autauga,1981-01-04,215691.0,0.0,7.511469,16.633713,-2.830301
4,1001,Autauga,1981-01-05,215691.0,0.0,5.058056,9.431636,-4.508458


In [64]:
# Outer-join PRISM (left) with AgERA5 (right) on county and day; the
# indicator column records which side each row came from.
weather = ds.drop(columns=['county_count']).merge(
    weather.drop(columns=['county_name']),
    on=['GEOID', 'time'],
    how='outer',
    indicator=True,
)
weather['_merge'].unique()

['both', 'left_only']
Categories (3, object): ['left_only', 'right_only', 'both']

In [4]:
# Report how many PRISM rows found no AgERA5 match. A bare expression in the
# middle of a cell is never displayed, so the count is printed explicitly.
print((weather['_merge'] == 'left_only').sum())

# Rename the indicator so the `_merge` name is free for the next merge that
# is run with indicator=True.
weather = weather.rename(columns={'_merge': 'prismag_ldas_merge'})

In [71]:
# Drop Feb 29 records and show the frame shape before and after.
print(weather.shape)
weather = weather.loc[(weather['time'].dt.month != 2) | (weather['time'].dt.day != 29)]
print(weather.shape)

(44852700, 17)
(44822000, 17)


In [12]:
# Weather variables to store as float32.
column_lst_w = [
    'prism_ppt',
    'prism_vpdmean',
    'prism_tmax',
    'prism_tmin',
    'agera5_avg_vpd_kpa',
    'agera5_short_eto',
    'agera5_tall_eto',
]

for column in column_lst_w:
    # Only float64 columns are converted; anything else is left as is.
    if weather[column].dtype != 'float64':
        continue
    weather[column] = weather[column].astype(np.float32)

In [15]:
# ---------------------------- Merge SM and Weather ----------------------------
# Outer-join soil moisture (left) with weather (right) on county and day.
# `year` is taken from the soil-moisture side only.
ldas_temp = sm_ldas[
    ['GEOID', 'time', 'ldas_sm_0_10cm', 'ldas_sm_10_40cm', 'ldas_sm_40_100cm', 'year']
].merge(
    weather.drop(columns=['county_name', 'year']),
    on=['GEOID', 'time'],
    how='outer',
    indicator=True,
)
print(ldas_temp['_merge'].unique())
print(sm_ldas.shape, weather.shape)

['both', 'left_only']
Categories (3, object): ['left_only', 'right_only', 'both']
(44690600, 9) (44646800, 15)


In [17]:
# Keep only county-days present in both the soil-moisture and weather tables.
# The indicator column is dropped here because the following yield merge is
# run with indicator=True, and pandas refuses to create `_merge` when a
# column of that name already exists.
ldas_temp = ldas_temp.loc[ldas_temp['_merge'] == 'both'].drop(columns=['_merge'])

# The source frames are no longer needed; release them to free memory.
del sm_ldas
del weather

# 4) Yield data

In [8]:
# Yield table for the selected crop, restricted to the study period
# (both bounds inclusive).
yield_data = pd.read_csv(
    f'usda_{crop_file_name}_1981_2020_with_irrigation_indicator_3June2022.csv',
    index_col=0,
)
yield_data['GEOID'] = yield_data['GEOID'].astype(str).str.zfill(5)
yield_data = yield_data[
    (yield_data['year'] <= last_yr) & (yield_data['year'] >= first_yr)
]

((3185, 2), (3185, 2), (0,))

In [12]:
# ---------------------------------- Merging ----------------------------------
# Attach yearly yield attributes to the daily records by county and year,
# then keep only rows found on both sides and discard the indicator column.
ldas_final = ldas_temp.merge(
    yield_data[
        ['GEOID', 'year', crop_yld_name, 'lon', 'lat', 'yld_count_until2020', 'irri_counties']
    ],
    on=['GEOID', 'year'],
    how='outer',
    indicator=True,
)
ldas_final = ldas_final[ldas_final['_merge'] == 'both'].drop(columns=['_merge'])
del ldas_temp

['left_only', 'both']
Categories (3, object): ['left_only', 'right_only', 'both']

In [7]:
## historical crop progress data
progress = pd.read_csv(
    f'crop_calendar{crop_file_name}_yearly_50%_planted_density_correspoinding_week_number_2014_2020_hybrid_method_22Mar2022.csv',
    index_col=0,
)
progress = progress.rename(columns={'State': 'State_ab', 'State_ID': 'state'})
# Two-character, zero-padded state code.
progress['state'] = progress['state'].astype(str).str.zfill(2)

# Median week (nearest observed value) per state and growing phase, as an integer.
crop_progress = (
    progress
    .groupby(['state', 'growing_phase'])['week_50']
    .quantile(0.5, interpolation='nearest')
    .reset_index()
)
crop_progress['week_50'] = crop_progress['week_50'].round(0).astype(int)

Unnamed: 0,state,growing_phase,week_50
0,4,EMERGED,5
1,4,HARVESTED,24
2,4,HEADED,14
3,4,PLANTED,3
4,30,EMERGED,22


In [9]:
# States in which two growing phases share the same median week; only the
# repeated (later) occurrence of each state/week pair is flagged.
duplicate_state = crop_progress.loc[
    crop_progress.duplicated(subset=['state', 'week_50']), 'state'
]
duplicate_state.values

array([], dtype=object)

In [10]:
# For states flagged above, discard the EMERGED row; every other row is kept.
crop_progress = crop_progress.loc[
    ~crop_progress['state'].isin(duplicate_state)
    | (crop_progress['growing_phase'] != 'EMERGED')
]

In [19]:
## week number input
ldas_final['week_50'] = ldas_final['time'].dt.week
ldas_final.head()

  ldas_final['week_50'] = ldas_final['time'].dt.week


Unnamed: 0,GEOID,time,ldas_sm_0_10cm,ldas_sm_10_40cm,ldas_sm_40_100cm,year,prism_ppt,prism_vpdmean,prism_tmax,prism_tmin,...,state,ldas_avg_vpd_kpa,ldas_short_eto,ldas_tall_eto,durum_yld,lon,lat,yld_count_until2020,irri_counties,week_50
24820,4003,1981-01-01,0.133886,0.205611,0.214761,1981,0.0,10.017172,19.535763,0.26539,...,4,1.22263,2.245915,3.581294,92.0,-109.75178,31.879091,7.0,Yes,1
24821,4003,1981-01-02,0.132737,0.205492,0.214664,1981,0.0,11.039088,20.917643,1.954751,...,4,1.017399,2.571626,4.223361,92.0,-109.75178,31.879091,7.0,Yes,1
24822,4003,1981-01-03,0.131824,0.205382,0.214568,1981,0.0,9.363811,19.42802,2.005535,...,4,1.082156,1.489441,2.443376,92.0,-109.75178,31.879091,7.0,Yes,1
24823,4003,1981-01-04,0.131686,0.205267,0.214468,1981,0.0,9.923656,20.431044,0.632699,...,4,1.099327,2.276939,3.710981,92.0,-109.75178,31.879091,7.0,Yes,1
24824,4003,1981-01-05,0.132842,0.20518,0.21438,1981,0.0,9.268793,18.536736,2.893811,...,4,1.087232,1.951347,3.062267,92.0,-109.75178,31.879091,7.0,Yes,2


In [20]:
## combining the dataframe
ldas_df =  pd.merge(ldas_final, crop_progress.rename(columns={'growing_phase':'growing'}), on=['state','week_50'], how='outer', indicator=True)
print(ldas_df.shape)

(1162525, 25)


In [21]:
print(ldas_df['_merge'].unique())
ldas_df = ldas_df.drop(columns='_merge')

# Carry each phase label forward to the following rows. `.ffill()` replaces
# the deprecated `fillna(method="ffill")` and gives the same result.
# NOTE(review): this depends on the row order left by the preceding merge
# (presumably sorted by state, then week) - confirm before reordering rows.
ldas_df['growing_phase'] = ldas_df['growing'].ffill()

## with forward filling, all data beyond harvest phase becomes harvested which we do not need.
# Rows that received HARVESTED only through the fill (no label of their own)
# are reset to missing; the rows that were labelled HARVESTED are kept.
ldas_df.loc[
    (ldas_df['growing_phase'] == 'HARVESTED') & ldas_df['growing'].isnull(),
    'growing_phase',
] = np.nan

['left_only', 'both']
Categories (3, object): ['left_only', 'right_only', 'both']


Unnamed: 0,GEOID,time,ldas_sm_0_10cm,ldas_sm_10_40cm,ldas_sm_40_100cm,year,prism_ppt,prism_vpdmean,prism_tmax,prism_tmin,...,ldas_short_eto,ldas_tall_eto,durum_yld,lon,lat,yld_count_until2020,irri_counties,week_50,growing,growing_phase
0,4003,1981-01-01,0.133886,0.205611,0.214761,1981,0.0,10.017172,19.535763,0.26539,...,2.245915,3.581294,92.0,-109.75178,31.879091,7.0,Yes,1,,
1,4003,1981-01-02,0.132737,0.205492,0.214664,1981,0.0,11.039088,20.917643,1.954751,...,2.571626,4.223361,92.0,-109.75178,31.879091,7.0,Yes,1,,
2,4003,1981-01-03,0.131824,0.205382,0.214568,1981,0.0,9.363811,19.42802,2.005535,...,1.489441,2.443376,92.0,-109.75178,31.879091,7.0,Yes,1,,
3,4003,1981-01-04,0.131686,0.205267,0.214468,1981,0.0,9.923656,20.431044,0.632699,...,2.276939,3.710981,92.0,-109.75178,31.879091,7.0,Yes,1,,
4,4007,1981-01-01,0.188958,0.205624,0.208524,1981,0.00245,10.827749,19.813074,2.007902,...,1.815027,2.860188,66.0,-110.814567,33.802631,1.0,No,1,,


In [22]:
## Sanity check: count the records labelled HARVESTED per county and year and
## list the distinct counts. Only the starting week of harvest is labelled, so
## a single value of 7 (one week of daily records) is the expected outcome.
(
    ldas_df[ldas_df['growing_phase'] == 'HARVESTED']
    .groupby(['GEOID', 'year'])['time']
    .count()
    .unique()
)

array([7], dtype=int64)

In [23]:
# Counties that carry an irrigation flag on at least one row.
irri_c = ldas_df.loc[ldas_df['irri_counties'] == 'Yes', 'GEOID'].unique()
rain_c = ldas_df.loc[ldas_df['irri_counties'] == 'No', 'GEOID'].unique()

# Spread the flag to every row of those counties. 'No' is applied last, so it
# wins for a county that appears in both lists.
for flag, geoids in (('Yes', irri_c), ('No', rain_c)):
    ldas_df.loc[ldas_df['GEOID'].isin(geoids), 'irri_counties'] = flag

In [5]:
### saving file
# Write the combined frame as a gzip-compressed parquet file named after the
# selected crop.
ldas_df.to_parquet(
    f'final_{crop_file_name}_combined_cropland_weighted_avg_dataframe_5June2022.gzip',
    compression='gzip',
)