# Testing basic linear model performance, baseline

In [1]:
import sklearn
from sklearn.linear_model import LinearRegression
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as skm
np.random.seed(51)

# Per-county LM

In [2]:
def split_train_test_val(df, year_col='year', train_end=2010, val_end=2014):
    """
    Split a dataframe chronologically into train, validation and test sets.

    Inputs:
        df (pd.DataFrame): Data with a column holding the observation year.
        year_col (str): Name of the year column.
        train_end (int): Last year (inclusive) assigned to the training set.
        val_end (int): Last year (inclusive) assigned to the validation set.

    Outputs:
        (train_df, val_df, test_df): rows with year <= train_end, rows with
        train_end < year <= val_end, and rows with year > val_end.

    Raises:
        ValueError: if train_end is greater than val_end.
    """
    if train_end > val_end:
        raise ValueError('train_end must not be greater than val_end')
    years = df[year_col]
    train_df = df.loc[years <= train_end]
    val_df = df.loc[(years > train_end) & (years <= val_end)]
    test_df = df.loc[years > val_end]
    return train_df, val_df, test_df

In [89]:
def train_county_lm(
    inun_csv,
    pred_cols=['acres', 'precip', 'temp', 'vpd', 'last_precip','last_temp','last_vpd', 'year'],
    drop_zeros=True,
    plot_results=True):
    """
    Fit a baseline multiple linear regression of inundation for one county.

    All playas of the county are pooled into a single model, which is fit on
    the training years and scored on the validation years (see
    split_train_test_val).

    Inputs:
        inun_csv (str): Path to the inundation csv. The file name must end in
            '_<state>_<county>.csv' and contain 'inun_frac_'; the matching
            weather csv ('inun_frac_' replaced by 'weather_') must be in the
            same dir. The inundation csv needs the columns id, year, month and
            inundation; the weather csv needs id, year, month, temp, precip
            and vpd.
        pred_cols (list): List of strings with column names to use as
            predictors. Besides the csv columns, the derived columns
            'last_temp', 'last_precip', 'last_vpd' (previous weather row of
            the same playa), 'year' and 'month_num' (months since the first
            year in the inundation csv) are available.
        drop_zeros (bool): True/False drop playas whose inundation is zero in
            every record.
        plot_results (bool): True/False print the validation metrics and show
            a scatter plot of predicted vs true validation inundation.

    Outputs:
        output_dict (dict): state and county ids, number of playas, r2, mse,
            rmse, and mae on the validation set. If no usable inundation
            records remain, or the training or validation split is empty, the
            dict is returned with num_playas and all metrics left at 0.
    """
    # Work on a copy so the shared default list can never be mutated
    pred_cols = list(pred_cols)

    # Set up output (state and county id are parsed from the file name)
    county_id = inun_csv.split('_')[-1][:-4]
    state = inun_csv.split('_')[-2]
    output_dict = {
        'state':state,
        'county_fips':county_id,
        'num_playas':0,
        'r2':0,
        'mse':0,
        'rmse':0,
        'mae':0}

    # Prep inundation data
    inun_df = pd.read_csv(inun_csv)
    inun_df = inun_df.assign(
        month_num = (inun_df['year']-inun_df['year'].min())*12 + inun_df['month'])
    inun_df = inun_df.set_index(['id','year','month'])
    inun_df = inun_df.loc[inun_df['inundation'].notna()]
    if drop_zeros:
        max_inun = inun_df.groupby(level='id')['inundation'].max()
        zero_ids = max_inun[max_inun==0].index
        keep = ~inun_df.index.get_level_values('id').isin(zero_ids)
        inun_df = inun_df.loc[keep]
    # Nothing left to model (checked regardless of drop_zeros)
    if inun_df.empty:
        return output_dict

    # Prep weather data
    weather_csv = inun_csv.replace('inun_frac_','weather_')
    weather_df = pd.read_csv(weather_csv)
    # Sort so that shift(1) really picks the chronologically previous row of
    # each playa, independent of the row order in the csv
    weather_df = weather_df.set_index(['id','year','month']).sort_index()
    weather_last = weather_df.groupby(level='id').shift(1)
    weather_df = weather_df.assign(
        last_temp=weather_last['temp'],
        last_precip=weather_last['precip'],
        last_vpd=weather_last['vpd'])
    joined_df = weather_df.join(inun_df, how='inner')
    joined_df = joined_df.assign(year=joined_df.index.get_level_values('year'))
    num_playas = joined_df.index.get_level_values('id').nunique()

    # Rows with a missing predictor (e.g. the first weather row of a playa has
    # no previous-row values) cannot be used by LinearRegression
    model_df = joined_df.dropna(subset=pred_cols)

    # Test split is not used yet
    train_df, val_df, _ = split_train_test_val(model_df)
    if train_df.empty or val_df.empty:
        return output_dict

    # Get predictors and target var for train
    X = train_df[pred_cols]
    y = train_df['inundation']

    # For val
    X_val = val_df[pred_cols]
    y_val = val_df['inundation']

    reg = LinearRegression().fit(X, y)
    pred_val = reg.predict(X_val)

    output_dict['num_playas'] = num_playas
    # Same value as reg.score(X_val, y_val), without predicting a second time
    output_dict['r2'] = skm.r2_score(y_val, pred_val)
    output_dict['mse'] = skm.mean_squared_error(y_val, pred_val)
    output_dict['rmse'] = np.sqrt(output_dict['mse'])
    output_dict['mae'] = skm.mean_absolute_error(y_val, pred_val)

    if plot_results:
        print('{} Total Playas in County'.format(num_playas))
        print('Val R^2 = {}'.format(output_dict['r2']))
        print('Val RMSE = {}'.format(output_dict['rmse']))
        print('Val MAE = {}'.format(output_dict['mae']))

        plt.scatter(y_val, pred_val)
        plt.xlabel('True Inun')
        plt.ylabel('Predicted Inun')
        plt.show()
    return output_dict

In [4]:
# glob returns files in arbitrary order; sort so the seeded random choice of a
# county below picks the same file on every run
inun_csv_list = sorted(glob.glob('../data/state_county_csvs/counties/inun_frac*'))

In [33]:
# Pick one county inundation csv at random for a quick single-county check
rand_csv = np.random.choice(inun_csv_list)

In [37]:
# With month time (month_num is passed explicitly; the default pred_cols use 'year')
train_county_lm(rand_csv,
                pred_cols=['acres', 'precip', 'temp', 'vpd', 'last_precip','last_temp','last_vpd', 'month_num'],
                plot_results=False)

{'state': 'CO',
 'county_fips': '8017',
 'num_playas': 101,
 'r2': -0.010279220565362346,
 'mse': 0.0062276441537851435,
 'rmse': 0.07891542405502959,
 'mae': 0.038351634232976085}

In [38]:
# With year time: current and previous-row weather, acres, and 'year' as the time predictor
train_county_lm(rand_csv,
                pred_cols=['acres', 'precip', 'temp', 'vpd', 'last_precip','last_temp','last_vpd', 'year'],
                plot_results=False)

{'state': 'CO',
 'county_fips': '8017',
 'num_playas': 101,
 'r2': -0.011184550190264764,
 'mse': 0.00623322485922875,
 'rmse': 0.07895077491214858,
 'mae': 0.038484414612420796}

In [39]:
# No time: same predictors, without any time column
train_county_lm(rand_csv,
                pred_cols=['acres', 'precip', 'temp', 'vpd', 'last_precip','last_temp','last_vpd'],
                plot_results=False)

{'state': 'CO',
 'county_fips': '8017',
 'num_playas': 101,
 'r2': 0.000930167847015495,
 'mse': 0.006158546343206809,
 'rmse': 0.07847640628371567,
 'mae': 0.0366782563339788}

## Now run for all counties

In [91]:
# Fit one model per county with the default predictors and save the metrics.
# DataFrame.append was removed in pandas 2.0 (and copied the frame on every
# iteration), so collect the per-county dicts and build the frame once.
county_rows = [
    train_county_lm(cur_csv, plot_results=False, drop_zeros=True)
    for cur_csv in inun_csv_list
]
county_results = pd.DataFrame(county_rows)
county_results.to_csv('data/county_modeling_results_allvars_year_nonzeros.csv',index=False)

In [94]:
# Overall accuracy metrics
def get_overall_metric(df, metric_col, weighted=True):
    """
    Average one metric column over a results table.

    Inputs:
        df (pd.DataFrame): Results with an 'r2' column, the metric column and,
            for the weighted average, a 'num_playas' column.
        metric_col (str): Name of the column to average.
        weighted (bool): True weights each row by its 'num_playas' (rows with
            no playas are ignored); False takes the plain mean.

    Outputs:
        The (weighted) average of metric_col over the rows with r2 > -1.
    """
    # Rows with r2 <= -1 are treated as broken fits and left out
    plausible = df[df['r2'] > -1]
    if not weighted:
        return plausible[metric_col].mean()

    populated = plausible[plausible['num_playas'] > 0]
    weights = populated['num_playas']
    weighted_total = (populated[metric_col] * weights).sum()
    return weighted_total / weights.sum()

In [95]:
# using all variables, including last-month
# Playa-count-weighted averages of the saved per-county validation metrics
county_results = pd.read_csv('data/county_modeling_results_allvars_nonzeros.csv')
for metric in ['r2','mse','rmse','mae']:
    print(metric,get_overall_metric(county_results, metric, weighted=True))

r2 0.020192521563968693
mse 0.009907532552778878
rmse 0.08963667964908374
mae 0.0380779154773957


In [97]:
# using all variables plus year, including last-month
# NOTE(review): this reads the '_wzeros' results file, presumably a run with
# drop_zeros=False - confirm, since the run above saves a '_nonzeros' file
county_results = pd.read_csv('data/county_modeling_results_allvars_year_wzeros.csv')
for metric in ['r2','mse','rmse','mae']:
    print(metric,get_overall_metric(county_results, metric, weighted=True))

r2 -0.0038429169871053563
mse 0.009906443041646486
rmse 0.08974716144594544
mae 0.04319029581517186


In [None]:
# using only current-month weather vars
# Playa-count-weighted averages of the saved per-county validation metrics
county_results = pd.read_csv('data/county_modeling_results_currentvars_nonzeros.csv')
for metric in ['r2','mse','rmse','mae']:
    print(metric,get_overall_metric(county_results, metric, weighted=True))

# Per-playa LM

In [40]:
def train_playa_lm(
    inun_csv,
    pred_cols=['acres', 'precip', 'temp', 'vpd', 'last_precip','last_temp','last_vpd', 'year'],
    drop_zeros=True):
    """
    Fit a baseline multiple linear regression of inundation for every playa
    of one county, one model per playa.

    Each model is fit on the playa's training years and scored on its
    validation years (see split_train_test_val).

    Inputs:
        inun_csv (str): Path to the inundation csv. The file name must end in
            '_<state>_<county>.csv' and contain 'inun_frac_'; the matching
            weather csv ('inun_frac_' replaced by 'weather_') must be in the
            same dir. The inundation csv needs the columns id, year, month and
            inundation; the weather csv needs id, year, month, temp, precip
            and vpd.
        pred_cols (list): List of strings with column names to use as
            predictors. Besides the csv columns, the derived columns
            'last_temp', 'last_precip', 'last_vpd' (previous weather row of
            the same playa), 'year' and 'month_num' (months since the first
            year in the inundation csv) are available.
        drop_zeros (bool): True/False drop playas whose inundation is zero in
            every record.

    Outputs:
        out_df (pd.DataFrame): One row per playa with state, county_fips,
            playa_id, max_inun, r2, mse, rmse and mae on the validation set.
            The metrics are NaN for a playa with no usable training or
            validation rows. Returns None if no usable inundation records
            remain in the county.
    """
    # Work on a copy so the shared default list can never be mutated
    pred_cols = list(pred_cols)

    # State and county id are parsed from the file name
    county_id = inun_csv.split('_')[-1][:-4]
    state = inun_csv.split('_')[-2]

    # Prep inundation data
    inun_df = pd.read_csv(inun_csv)
    inun_df = inun_df.assign(
        month_num = (inun_df['year']-inun_df['year'].min())*12 + inun_df['month'])
    inun_df = inun_df.set_index(['id','year','month'])
    inun_df = inun_df.loc[inun_df['inundation'].notna()]
    if drop_zeros:
        max_inun = inun_df.groupby(level='id')['inundation'].max()
        zero_ids = max_inun[max_inun==0].index
        keep = ~inun_df.index.get_level_values('id').isin(zero_ids)
        inun_df = inun_df.loc[keep]
    # Nothing left to model (checked regardless of drop_zeros)
    if inun_df.empty:
        return None

    # Prep weather data
    weather_csv = inun_csv.replace('inun_frac_','weather_')
    weather_df = pd.read_csv(weather_csv)
    # Sort so that shift(1) really picks the chronologically previous row of
    # each playa, independent of the row order in the csv
    weather_df = weather_df.set_index(['id','year','month']).sort_index()
    weather_last = weather_df.groupby(level='id').shift(1)
    weather_df = weather_df.assign(
        last_temp=weather_last['temp'],
        last_precip=weather_last['precip'],
        last_vpd=weather_last['vpd'])
    joined_df = weather_df.join(inun_df, how='inner')
    joined_df = joined_df.assign(year=joined_df.index.get_level_values('year'))

    # One pass over the groups instead of re-filtering the whole frame for
    # every playa; sort=False keeps the playas in order of first appearance
    rows = []
    for playa_id, playa_df in joined_df.groupby(level='id', sort=False):
        output_dict = {
            'state':state,
            'county_fips':county_id,
            'playa_id':playa_id,
            'max_inun':playa_df['inundation'].max(),
            'r2':np.nan,
            'mse':np.nan,
            'rmse':np.nan,
            'mae':np.nan}

        # Rows with a missing predictor (e.g. the first weather row of a playa
        # has no previous-row values) cannot be used by LinearRegression
        model_df = playa_df.dropna(subset=pred_cols)

        # Test split is not used yet
        train_df, val_df, _ = split_train_test_val(model_df)

        # A playa without both training and validation rows keeps NaN metrics
        # instead of aborting the whole county
        if not (train_df.empty or val_df.empty):
            X_val = val_df[pred_cols]
            y_val = val_df['inundation']

            reg = LinearRegression().fit(train_df[pred_cols], train_df['inundation'])
            pred_val = reg.predict(X_val)

            # Same value as reg.score(X_val, y_val), without a second predict
            output_dict['r2'] = skm.r2_score(y_val, pred_val)
            output_dict['mse'] = skm.mean_squared_error(y_val, pred_val)
            output_dict['rmse'] = np.sqrt(output_dict['mse'])
            output_dict['mae'] = skm.mean_absolute_error(y_val, pred_val)

        rows.append(output_dict)

    # DataFrame.append was removed in pandas 2.0; build the frame once
    return pd.DataFrame(rows)

In [41]:
# glob returns files in arbitrary order; sort so the seeded random choice of a
# county below picks the same file on every run
inun_csv_list = sorted(glob.glob('../data/state_county_csvs/counties/inun_frac*'))

In [77]:
# Pick one county inundation csv at random for a quick per-playa check
rand_csv = np.random.choice(inun_csv_list)

In [78]:
# With month time
# Plain (unfiltered) means over the playas of the chosen county
results = train_playa_lm(rand_csv,
                pred_cols=['acres', 'precip', 'temp', 'vpd', 'last_precip','last_temp','last_vpd', 'month_num'])
print(results['r2'].mean())
print(results['rmse'].mean())

-340.90605457105323
0.052998353726936745


In [79]:
# With Year time
# Plain (unfiltered) means over the playas of the chosen county
results = train_playa_lm(rand_csv,
                pred_cols=['acres', 'precip', 'temp', 'vpd', 'last_precip','last_temp','last_vpd', 'year'])
print(results['r2'].mean())
print(results['rmse'].mean())

-340.956905513224
0.053008031243646725


In [80]:
# With no time
# Plain (unfiltered) means over the playas of the chosen county
results = train_playa_lm(rand_csv,
                pred_cols=['acres', 'precip', 'temp', 'vpd', 'last_precip','last_temp','last_vpd'])
print(results['r2'].mean())
print(results['rmse'].mean())

-191.6794250812918
0.04653733299150869


## Now run for all

In [92]:
# Fit per-playa models for every county with the default predictors and save
# the metrics. Pass pred_cols=['acres', 'precip', 'temp', 'vpd'] to
# train_playa_lm to use current-month variables only.
# DataFrame.append was removed in pandas 2.0, so collect the per-county frames
# and concatenate once; counties for which train_playa_lm returns None
# (no usable inundation records) are skipped.
playa_frames = []
for cur_csv in inun_csv_list:
    cur_results = train_playa_lm(cur_csv, drop_zeros=True)
    if cur_results is not None:
        playa_frames.append(cur_results)
# pd.concat raises on an empty list, so fall back to an empty frame
playa_results = pd.concat(playa_frames, ignore_index=True) if playa_frames else pd.DataFrame()
playa_results.to_csv('data/playa_modeling_results_allvars_year_nonzeros.csv',index=False)

In [None]:
# using only current-month weather vars
# Unweighted means of the saved per-playa validation metrics
playa_results = pd.read_csv('data/playa_modeling_results_currentvars_nonzeros.csv')
for metric in ['r2','mse','rmse','mae']:
    print(metric,get_overall_metric(playa_results, metric, weighted=False))

In [98]:
# using all variables, including last-month
# Unweighted means of the saved per-playa validation metrics
playa_results = pd.read_csv('data/playa_modeling_results_allvars_nonzeros.csv')
for metric in ['r2','mse','rmse','mae']:
    print(metric,get_overall_metric(playa_results, metric, weighted=False))

r2 0.04664722984547984
mse 0.009303183034128045
rmse 0.06250675166775621
mae 0.035203153535689376


In [99]:
# using all variables plus year, including last-month
# Unweighted means of the saved per-playa validation metrics
playa_results = pd.read_csv('data/playa_modeling_results_allvars_year_nonzeros.csv')
for metric in ['r2','mse','rmse','mae']:
    print(metric,get_overall_metric(playa_results, metric, weighted=False))

r2 0.032219003216962014
mse 0.009540462396381746
rmse 0.06411811679363551
mae 0.04045556649408707
