In [1]:
import pandas as pd
import numpy as np
import time


from sklearn import linear_model
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from keras import models
from keras import layers
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Dense
from keras import losses
from keras import backend as K
from keras.optimizers import Adam
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# transfer month into season
def month_season(sample):
    # input: df, output: df
    Season = np.zeros([sample.shape[0]])
    Season[(sample['month'] <= 5) & (sample['month'] >= 3)] = 1
    Season[(sample['month'] <= 8) & (sample['month'] >= 6)] = 2
    Season[(sample['month'] <= 11) & (sample['month'] >= 9)] = 3
    Season[sample['month'] <= 2] = 4
    Season[sample['month'] == 12] = 4
    sample['Season'] = Season
    return (sample)

# using one hot coding to transfer categorical variables as binary variables
def onehotcode(sample):
    # input: df, output: df
    binary_season = pd.get_dummies(sample['Season'])
    binary_season.columns = (['spring', 'summer', 'fall', 'winter'])
    binary_year = pd.get_dummies(sample['year'])
    binary_dayofweek = pd.get_dummies(sample['dayofweek'])
    binary_dayofweek.columns = (['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
    frames = [binary_season, binary_year, binary_dayofweek]
    final = pd.concat(frames, axis=1)
    final = final.reset_index(drop=True)
    return final  # original index

# normalize and standardize numeric features
def stand(sample):
    # input: df, output: df
    numeric = np.array(sample.loc[:, ['latitude', 'longitude',
                                      'DUEXTTAU_7', 'DUEXTTAU_50', 'elevation']])
    # standarization
    stand = (numeric - np.mean(numeric, axis=0)) / np.std(numeric, axis=0)
    stand = pd.DataFrame(stand)
    stand.columns = ['latitude', 'longitude', 'DUEXTTAU_7', 'DUEXTTAU_50', 'elevation']
    return stand  # updated index

# data preprocessing
def data_preprocessing(data):
    # input: df, output: array
    sample = month_season(data)
    cate_var = onehotcode(sample)  # original index
    cate_var.index = range(len(cate_var))  # index update
    nume_var = stand(sample)
    nume_var.index = range(len(nume_var))  # index update
    final = pd.concat([nume_var, cate_var], axis=1)
    return final

def data_pre1(data):
    # input: df, output: df
    sample = month_season(data)
    cate_var = onehotcode(sample)
    cate_var.index = range(len(cate_var))
    nume_var = sample.loc[:, ['latitude', 'longitude',
                              'DUEXTTAU_7', 'DUEXTTAU_50', 'elevation']]
    nume_var.index = range(len(nume_var))
    final = pd.concat([nume_var, cate_var], axis=1)
    return final


def data_pre2(data):
    # input: data_pre1(data):df, output: df
    dropped = data.drop(columns=['latitude', 'longitude', 'DUEXTTAU_7', 'DUEXTTAU_50', 'elevation'])
    dropped.index = range(len(dropped))  # index update

    numeric = np.array(data.loc[:, ['latitude', 'longitude',
                                    'DUEXTTAU_7', 'DUEXTTAU_50', 'elevation']])
    stand = (numeric - np.mean(numeric, axis=0)) / np.std(numeric, axis=0)
    standf = pd.DataFrame(stand)  # index update
    standf.columns = ['latitude', 'longitude', 'DUEXTTAU_7', 'DUEXTTAU_50', 'elevation']
    final = pd.concat([standf, dropped], axis=1)
    return final

In [5]:
#randomly sampling
def ramdomsampling(data):
    # input: df
    # output: np.array

    # train and test split
    trainset, testset = train_test_split(data, test_size=0.15, random_state=2020)

    # original y for both train and test
    y_train_orig = trainset["DUEXTTAU_7"].values
    y_test_orig = testset["DUEXTTAU_7"].values

    # data preprocessing
    train = data_preprocessing(trainset)
    test = data_preprocessing(testset)

    # feature and label split
    X_train_rs, y_train_rs = train.drop(columns=["DUEXTTAU_7"]).values, train["DUEXTTAU_7"].values
    X_test_rs, y_test_rs = test.drop(columns=["DUEXTTAU_7"]).values, test["DUEXTTAU_7"].values

    return X_train_rs, y_train_rs, X_test_rs, y_test_rs, y_train_orig, y_test_orig


# Leave one day out
def leaveoneday(data):
    # input: df(original dataset)
    # output: np.array(transfered dataset)
    data = data_pre1(data)

    # train and test split
    trainset = data[data['Thursday'] != 1]  # Thursday
    testset = data[data['Thursday'] == 1]

    # original y for both train and test
    y_train_orig = trainset["DUEXTTAU_7"].values
    y_test_orig = testset["DUEXTTAU_7"].values

    # data preprocessing
    train = data_pre2(trainset)
    test = data_pre2(testset)

    # feature and label split
    X_train_loo, y_train_loo = train.drop(columns=["DUEXTTAU_7"]).values, train["DUEXTTAU_7"].values
    X_test_loo, y_test_loo = test.drop(columns=["DUEXTTAU_7"]).values, test["DUEXTTAU_7"].values

    return X_train_loo, y_train_loo, X_test_loo, y_test_loo, y_train_orig, y_test_orig



#randomly sampling
def ramdomsampling2(data):
    # input: df
    # output: np.array

    # train and test split
    trainset, testset = train_test_split(data, test_size=0.15, random_state=2020)

    # original y for both train and test
    y_train_orig = trainset["DUEXTTAU_7"].values
    y_test_orig = testset["DUEXTTAU_7"].values

    # data preprocessing
    train = data_pre2(trainset)
    test = data_pre2(testset)

    # feature and label split
    X_train_rs, y_train_rs = train.drop(columns=["DUEXTTAU_7"]).values, train["DUEXTTAU_7"].values
    X_test_rs, y_test_rs = test.drop(columns=["DUEXTTAU_7"]).values, test["DUEXTTAU_7"].values

    return X_train_rs, y_train_rs, X_test_rs, y_test_rs, y_train_orig, y_test_orig


# Leave one day out
def leaveoneday2(data):
    # input: df(original dataset)
    # output: np.array(transfered dataset)
    # train and test split
    trainset = data[data['Thursday'] != 1]  # Thursday
    testset = data[data['Thursday'] == 1]

    # original y for both train and test
    y_train_orig = trainset["DUEXTTAU_7"].values
    y_test_orig = testset["DUEXTTAU_7"].values

    # data preprocessing
    train = data_pre2(trainset)
    test = data_pre2(testset)

    # feature and label split
    X_train_loo, y_train_loo = train.drop(columns=["DUEXTTAU_7"]).values, train["DUEXTTAU_7"].values
    X_test_loo, y_test_loo = test.drop(columns=["DUEXTTAU_7"]).values, test["DUEXTTAU_7"].values

    return X_train_loo, y_train_loo, X_test_loo, y_test_loo, y_train_orig, y_test_orig

In [69]:
def train_predict(model, X_train, y_train, X_test, y_test, y_train_orig, y_test_orig):
    results = {}
    model.fit(X_train, y_train)

    y_train_hat = model.predict(X_train)
    y_train_pred = y_train_hat * np.std(y_train_orig) + np.mean(y_train_orig)
    results['Train R2'] = r2_score(y_train_orig, y_train_pred)
    results['Train MAE'] = sum(abs(y_train_orig - y_train_pred)) / y_train_orig.shape[0]
    results['Train MSE'] = mean_squared_error(y_train_orig, y_train_pred)
    results['Train RMSE'] = np.sqrt(results['Train MSE'])

    y_test_hat = model.predict(X_test)
    y_test_pred = y_test_hat * np.std(y_test_orig) + np.mean(y_test_orig)
    results['Test R2'] = r2_score(y_test_orig, y_test_pred)
    results['Test MAE'] = sum(abs(y_test_orig - y_test_pred)) / y_test_orig.shape[0]
    results['Test MSE'] = mean_squared_error(y_test_orig, y_test_pred)
    results['Test RMSE'] = np.sqrt(results['Test MSE'])

    return results, y_test_orig, y_test_pred


def build_model(tr_x):
    model = Sequential()
    #first layer with sample data input
    model.add(Dense(units=64, input_shape=(tr_x.shape[1],)))
    model.add(Dropout(0.2))
    #second layer
    model.add(Dense(units=32, activation='relu'))
    model.add(Dropout(0.2))
    #third layer
    model.add(layers.Dense(16, activation='relu'))
    model.add(Dropout(0.2))
    #forth layer
    model.add(layers.Dense(8, activation='relu'))
    #fifth layer
    model.add(layers.Dense(4, activation='relu'))
    #final layer and output the result
    model.add(Dense(units=1))
    #set the model loss function
    model.compile(optimizer= 'adam', loss="mse", metrics=['mae'])
    return model


def train_whole(X_train, y_train, X_test, y_test, y_train_orig, y_test_orig):
    results = {}

    model = build_model(X_train)

    rlst = EarlyStopping(monitor='val_loss', min_delta=.0001,
                         patience=5, verbose=True, mode='min')
    mod = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=500, epochs=30,
                    callbacks=[rlst], verbose=1)

    y_train_hat = model.predict(X_train)
    y_train_pred = y_train_hat * np.std(y_train_orig) + np.mean(y_train_orig)
    results['Train R2'] = r2_score(y_train_orig, y_train_pred)
#     results['Train MAE'] = sum(abs(y_train_orig - y_train_pred)) / y_train_orig.shape[0]
    results['Train MSE'] = mean_squared_error(y_train_orig, y_train_pred)
    results['Train RMSE'] = np.sqrt(results['Train MSE'])

    y_test_hat = model.predict(X_test)
    y_test_pred = y_test_hat * np.std(y_test_orig) + np.mean(y_test_orig)
    results['Test R2'] = r2_score(y_test_orig, y_test_pred)
#     results['Test MAE'] = sum(abs(y_test_orig - y_test_pred)) / y_test_orig.shape[0]
    results['Test MSE'] = mean_squared_error(y_test_orig, y_test_pred)
    results['Test RMSE'] = np.sqrt(results['Test MSE'])

    return results, y_test_orig, y_test_pred

# 3W dataset

In [7]:
usedata = pd.read_csv("//Users//dulichen//PycharmProjects//cisi567//venv//lib//PA_copy//thesis//3w_combined.csv")

In [8]:
#split data
X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs=ramdomsampling(usedata)
X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo=leaveoneday(usedata)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [7]:
#model building
Ridge = linear_model.Ridge()
#rf = RandomForestRegressor(n_jobs = -1, n_estimators=200, max_depth = 3, random_state = 2019)
XGB = XGBRegressor(n_jobs = -1, learning_rate = 1, n_estimators=200, max_depth = 3, random_state = 2019)

In [10]:
##Ridge model
Ridge_rs=train_predict(Ridge,X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("Ridge_rs results: ")
print(Ridge_rs[0])
Ridge_loo=train_predict(Ridge,X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("Ridge_loo results: ")
print(Ridge_loo[0])

Ridge_rs results: 
{'Train R2': 0.33955309650374577, 'Train MAE': 0.10155931792599611, 'Train MSE': 0.022936688896570412, 'Train RMSE': 0.15144863451537097, 'Test R2': 0.34035708546167265, 'Test MAE': 0.10152188784933038, 'Test MSE': 0.02291425717957272, 'Test RMSE': 0.15137455922172893}
Ridge_loo results: 
{'Train R2': 0.3297490091082562, 'Train MAE': 0.1025982670562608, 'Train MSE': 0.023170419537013286, 'Train RMSE': 0.15221832851865535, 'Test R2': 0.3975777168901755, 'Test MAE': 0.09665569576983345, 'Test MSE': 0.021501717530546954, 'Test RMSE': 0.1466346395997445}


In [12]:
##XGB
XGB_rs=train_predict(XGB,X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("XGB_rs results: ")
print(XGB_rs[0])
XGB_loo=train_predict(XGB,X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("XGB_loo results: ")
print(XGB_loo[0])

In [9]:
##Neural network
nn_rs=train_whole(X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("nn_rs results: ")
print(nn_rs[0])
nn_loo=train_whole(X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("nn_loo results: ")
print(nn_loo[0])

Train on 28075336 samples, validate on 4954472 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 00016: early stopping
nn_rs results: 
{'Train R2': 0.6611817906548375, 'Train MSE': 0.011766832154262849, 'Train RMSE': 0.10847503009569921, 'Test R2': 0.6615360365063779, 'Test MSE': 0.011757346489408462, 'Test RMSE': 0.10843129847700092}
Train on 28311264 samples, validate on 4718544 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 00008: early stopping
nn_loo results: 
{'Train R2': 0.6669270647657505, 'Train MSE': 0.011514253243451759, 'Train RMSE': 0.10730448845901908, 'Test R2': 0.5089229817374781, 'Test MSE': 0.017527570988769416, 'Test RMSE': 0.13239173308318544}


# Whole year data

In [10]:
usedata = pd.read_csv("//Users//dulichen//PycharmProjects//cisi567//venv//lib//PA_copy//thesis//Combined_MERRA2+G5NR+Elevation_2005-2006.csv")

In [11]:
#split data
X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs=ramdomsampling(usedata)
X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo=leaveoneday(usedata)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [12]:
#model building
Ridge = linear_model.Ridge()
# rf = RandomForestRegressor(n_jobs = -1, n_estimators=200, max_depth = 3, random_state = 2019)
XGB = XGBRegressor(n_jobs = -1, learning_rate = 1, n_estimators=200, max_depth = 3, random_state = 2019)

In [13]:
##Ridge model
Ridge_rs=train_predict(Ridge,X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("Ridge_rs results: ")
print(Ridge_rs[0])
Ridge_loo=train_predict(Ridge,X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("Ridge_loo results: ")
print(Ridge_loo[0])

Ridge_rs results: 
{'Train R2': 0.3353387247174612, 'Train MAE': 0.11553303335359358, 'Train MSE': 0.02849950897119243, 'Train RMSE': 0.16881797585326164, 'Test R2': 0.3355294333210507, 'Test MAE': 0.11551338515181221, 'Test MSE': 0.02848124947130029, 'Test RMSE': 0.16876388675098797}
Ridge_loo results: 
{'Train R2': 0.33277273502408966, 'Train MAE': 0.11621531697898635, 'Train MSE': 0.02898136272979015, 'Train RMSE': 0.17023913395512252, 'Test R2': 0.3521679875419622, 'Test MAE': 0.1116263993790547, 'Test MSE': 0.02558274502882191, 'Test RMSE': 0.15994606912588352}


In [None]:
##XGB
XGB_rs=train_predict(XGB,X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("XGB_rs results: ")
print(XGB_rs[0])
XGB_loo=train_predict(XGB,X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("XGB_loo results: ")
print(XGB_loo[0])

In [22]:
##Neural network
nn_rs=train_whole(X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("nn_rs results: ")
print(nn_rs[0])
nn_loo=train_whole(X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("nn_loo results: ")
print(nn_loo[0])

# 4 seasons

In [23]:
wholeyear=data_pre1(usedata)

In [30]:
wholeyear.shape #(143522380, 18)

(143522380, 18)

# Spring

In [28]:
spring=wholeyear[wholeyear['spring']==1]

In [94]:
spring.to_csv('spring.csv', index=False)

In [59]:
spring.shape  #(36175504, 18)

(36175504, 18)

In [32]:
#split data
X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs=ramdomsampling2(spring)
#(30749178, 17)   (5426326,)
X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo=leaveoneday2(spring)
#(31063748, 17)   (5111756,)

In [60]:
##Ridge model
Ridge_rs=train_predict(Ridge,X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("Ridge_rs results: ")
print(Ridge_rs[0])
Ridge_loo=train_predict(Ridge,X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("Ridge_loo results: ")
print(Ridge_loo[0])

Ridge_rs results: 
{'Train R2': 0.1962342556479587, 'Train MAE': 0.13269654675910064, 'Train MSE': 0.03265223913240167, 'Train RMSE': 0.18069930584371838, 'Test R2': 0.19638105769992764, 'Test MAE': 0.1327722268443726, 'Test MSE': 0.03269426976275191, 'Test RMSE': 0.18081556836387708}
Ridge_loo results: 
{'Train R2': 0.19116706781169945, 'Train MAE': 0.13372404190045115, 'Train MSE': 0.033078527368301816, 'Train RMSE': 0.18187503228398838, 'Test R2': 0.22662237892640735, 'Test MAE': 0.12775364944342804, 'Test MSE': 0.030152178321998423, 'Test RMSE': 0.17364382604054318}


In [70]:
##Neural network
nn_rs=train_whole(X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("nn_rs results: ")
print(nn_rs[0])
nn_loo=train_whole(X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("nn_loo results: ")
print(nn_loo[0])

Train on 30749178 samples, validate on 5426326 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 00024: early stopping


KeyboardInterrupt: 

# Summer

In [71]:
summer=wholeyear[wholeyear['summer']==1]
summer.shape

(36175504, 18)

In [95]:
summer.to_csv('summer.csv', index=False)

In [72]:
#split data
X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs=ramdomsampling2(summer)
#(30749178, 17)   (5426326,)
X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo=leaveoneday2(summer)
#(31063748, 17)   (5111756,)

In [73]:
##Ridge model
Ridge_rs=train_predict(Ridge,X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("Ridge_rs results: ")
print(Ridge_rs[0])
Ridge_loo=train_predict(Ridge,X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("Ridge_loo results: ")
print(Ridge_loo[0])

Ridge_rs results: 
{'Train R2': 0.37225852677538585, 'Train MAE': 0.14204153939591965, 'Train MSE': 0.04429733821391231, 'Train RMSE': 0.21046932843982827, 'Test R2': 0.3715953029040734, 'Test MAE': 0.14208230465255553, 'Test MSE': 0.044340008477714195, 'Test RMSE': 0.21057067335627294}
Ridge_loo results: 
{'Train R2': 0.3741068570410616, 'Train MAE': 0.1424819947397081, 'Train MSE': 0.04490873907618291, 'Train RMSE': 0.21191682112607982, 'Test R2': 0.36285197487938103, 'Test MAE': 0.13887156621649027, 'Test MSE': 0.04033504430526068, 'Test RMSE': 0.2008358640912043}


In [None]:
##Neural network
nn_rs=train_whole(X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("nn_rs results: ")
print(nn_rs[0])
nn_loo=train_whole(X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("nn_loo results: ")
print(nn_loo[0])

# Fall

In [74]:
fall=wholeyear[wholeyear['fall']==1]
fall.shape

(35782292, 18)

In [96]:
fall.to_csv('fall.csv', index=False)

In [77]:
#split data
X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs=ramdomsampling2(fall)
#(30414948, 17)   (5367344,)
X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo=leaveoneday2(fall)
#(30670536, 17)   (5111756,)

In [78]:
##Ridge model
Ridge_rs=train_predict(Ridge,X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("Ridge_rs results: ")
print(Ridge_rs[0])
Ridge_loo=train_predict(Ridge,X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("Ridge_loo results: ")
print(Ridge_loo[0])

Ridge_rs results: 
{'Train R2': 0.34210209979405526, 'Train MAE': 0.08439381375831491, 'Train MSE': 0.013669054759758617, 'Train RMSE': 0.11691473286014306, 'Test R2': 0.3424883692031443, 'Test MAE': 0.08435937122391421, 'Test MSE': 0.013662419545146437, 'Test RMSE': 0.11688635311766056}
Ridge_loo results: 
{'Train R2': 0.343936343722699, 'Train MAE': 0.08394325487362185, 'Train MSE': 0.013549417478227558, 'Train RMSE': 0.11640196509607369, 'Test R2': 0.3307592652699465, 'Test MAE': 0.08698660270461918, 'Test MSE': 0.014383918673393357, 'Test RMSE': 0.11993297575476629}


In [None]:
##Neural network
nn_rs=train_whole(X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("nn_rs results: ")
print(nn_rs[0])
nn_loo=train_whole(X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("nn_loo results: ")
print(nn_loo[0])

# Winter

In [84]:
winter=wholeyear[wholeyear['winter']==1]
winter.shape

(35389080, 18)

In [97]:
winter.to_csv('winter.csv', index=False)

In [92]:
#split data
X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs=ramdomsampling2(winter)
#(30414948, 17)   (5367344,)
X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo=leaveoneday2(winter)
#(30670536, 17)   (5111756,)

In [93]:
##Ridge model
Ridge_rs=train_predict(Ridge,X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("Ridge_rs results: ")
print(Ridge_rs[0])
Ridge_loo=train_predict(Ridge,X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("Ridge_loo results: ")
print(Ridge_loo[0])

Ridge_rs results: 
{'Train R2': 0.12331290051773314, 'Train MAE': 0.08687971447350698, 'Train MSE': 0.016837267915938207, 'Train RMSE': 0.12975849843435383, 'Test R2': 0.12313020490331572, 'Test MAE': 0.08690769132680914, 'Test MSE': 0.016845264893274925, 'Test RMSE': 0.12978930962631294}
Ridge_loo results: 
{'Train R2': 0.1175159773393355, 'Train MAE': 0.08827289846951392, 'Train MSE': 0.01749622382516927, 'Train RMSE': 0.13227329218390715, 'Test R2': 0.1631013183497959, 'Test MAE': 0.07947008884771077, 'Test MSE': 0.01296624916878019, 'Test RMSE': 0.11386943913438842}


In [None]:
##Neural network
nn_rs=train_whole(X_train_rs, y_train_rs,X_test_rs, y_test_rs,y_train_orig_rs,y_test_orig_rs)
print("nn_rs results: ")
print(nn_rs[0])
nn_loo=train_whole(X_train_loo, y_train_loo,X_test_loo, y_test_loo,y_train_orig_loo,y_test_orig_loo)
print("nn_loo results: ")
print(nn_loo[0])