# Testing basic linear model performance, baseline

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as skm
# Seed NumPy's global RNG so the np.random.choice county pick later in this
# notebook draws the same position each run
np.random.seed(51)

# Per-county LM

In [None]:
def split_train_test_val(df, year_col='year', train_end=2010, val_end=2014):
    """
    Splits a dataframe chronologically into train, validation and test sets

    Inputs:
        df (pd.DataFrame): Data to split. Must contain the column `year_col`
        year_col (str): Name of the column holding each row's year
        train_end (int): Last year (inclusive) assigned to the training set
        val_end (int): Last year (inclusive) assigned to the validation set

    Outputs:
        (train_df, val_df, test_df): rows with
            year <= train_end,
            train_end < year <= val_end,
            year > val_end
        Rows whose year is NaN fail every comparison and land in no split.

    Raises:
        ValueError: if train_end > val_end (train and test would overlap)
    """
    if train_end > val_end:
        # Otherwise years in (val_end, train_end] would be in both train and test
        raise ValueError(
            'train_end ({}) must not be greater than val_end ({})'.format(train_end, val_end)
        )

    years = df[year_col]
    train_df = df.loc[years <= train_end]
    val_df = df.loc[(years > train_end) & (years <= val_end)]
    test_df = df.loc[years > val_end]
    return train_df, val_df, test_df

In [None]:
def train_county_lm(inun_csv):
    """
    Fits a basic county multilinear reg model

    Inputs:
        inun_csv (str): Path to inundation csv. The matching weather csv (same
            path with 'inun_frac_' replaced by 'weather_') must be in the same dir

    Outputs:
        LinearRegression() model: returns the model, fit on the training split
        Will also print out the number of playas, the validation R^2, MSE and
        RMSE, and plot val predicted vs true inundation
    """
    index_cols = ['id', 'year', 'month']
    predictor_cols = ['area', 'precip', 'temp', 'vpd']

    weather_csv = inun_csv.replace('inun_frac_','weather_')
    inun_df = pd.read_csv(inun_csv)
    weather_df = pd.read_csv(weather_csv)

    # DataFrame.join defaults to a left join, so every weather row is kept and
    # rows with no matching inundation record get NaN (dropped just below)
    joined_df = weather_df.set_index(index_cols).join(inun_df.set_index(index_cols))
    # Copy the year index level into a regular column for split_train_test_val
    joined_df = joined_df.assign(year=joined_df.index.get_level_values('year'))
    joined_df = joined_df.dropna(subset=['inundation'])

    # The test split is intentionally left untouched at this baseline stage
    train_df, val_df, _ = split_train_test_val(joined_df)

    # Get predictors and target var for train
    X = train_df[predictor_cols]
    y = train_df['inundation']

    # For val
    X_val = val_df[predictor_cols]
    y_val = val_df['inundation']

    reg = LinearRegression().fit(X, y)
    pred_val = reg.predict(X_val)
    # sklearn metrics take (y_true, y_pred); compute the MSE once and reuse it
    val_mse = skm.mean_squared_error(y_val, pred_val)

    num_playas = joined_df.index.get_level_values('id').unique().shape[0]
    print('{} Total Playas in County'.format(num_playas))
    print('Val R^2 = {}'.format(reg.score(X_val, y_val)))
    print('Val MSE = {}'.format(val_mse))
    print('Val RMSE = {}'.format(np.sqrt(val_mse)))

    plt.scatter(y_val, pred_val)
    plt.xlabel('True Inun')
    plt.ylabel('Predicted Inun')
    plt.show()
    return reg

In [None]:
# glob returns paths in arbitrary (filesystem-dependent) order; sort them so a
# seeded random pick from this list selects the same file on every machine
inun_csv_list = sorted(glob.glob('../data/state_county_csvs/counties/inun_frac*'))

In [None]:
# Pick one county's inundation csv at random (using NumPy's global RNG) and
# fit/evaluate the baseline linear model on it; the fitted model is the cell's
# return value
rand_csv = np.random.choice(inun_csv_list)
train_county_lm(rand_csv)

# Per-playa LM