In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
proj_data = pd.read_csv('/content/drive/MyDrive/CIS 519/Data/data_clean.csv')

In [None]:
#convert all uniuque subdivision to integer
for reg in proj_data.region.unique():
  proj_data[reg] = (proj_data.region == reg)*1
proj_data.drop('region', axis = 1, inplace = True)


In [None]:
from calendar import month_name
lower_ma = [m.lower() for m in month_name]

proj_data['month'] = proj_data['month'].str.lower().map(lambda m: lower_ma.index(m)).astype('float64')

In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, precision_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


Drop any features related to preciptation and do feature scaling

In [None]:
regressionFeatures = proj_data.columns.drop(['precipitation_sum_mm','precipitation_max_mm','precipitation_avg_mm','precipitation_min_mm', 'snow_depth_max_mm','snow_depth_avg_mm','snow_depth_min_mm','snow_depth_sum_mm'])
df_regressionFeatures = pd.DataFrame(data=MinMaxScaler().fit_transform(proj_data[regressionFeatures]), columns=regressionFeatures)

In [None]:
features = regressionFeatures
target = 'precipitation_sum_mm'

In [None]:
seed = 42
train, test = train_test_split(proj_data, test_size=0.2, random_state=seed)

In [None]:
X_train, X_test = train[features], test[features]
y_train, y_test = train[target], test[target]

**Now we can search our hyperparameter space, while testing different models.**

First Model: Random Forest Regressor

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor




# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Create the estimator
estimator = RandomForestRegressor()

# Print results with default params:
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Results with default parameters: \n')
print('From param_grid chosen; defaults are:{n_estimators = 100, max_depth = None, min_samples_split = 1}\n' )
print('MAE: ', mean_absolute_error(y_test, predicts))
print('\n \n \n')
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator, param_grid,scoring='neg_mean_absolute_error', cv=3,n_jobs=-1,verbose=2)

# Fit the data to perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score - MAE:", np.abs(best_score))



Results with default parameters: 

From param_grid chosen; defaults are:{n_estimators = 100, max_depth = None, min_samples_split = 1}

MAE:  2.25269600818833

 
 

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best Score - MAE: 1.8685794399391058


Run best params on test

In [None]:
estimator = RandomForestRegressor(max_depth=10,min_samples_split=2,n_estimators=100)
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Test MAE: ', mean_absolute_error(y_test, predicts))



Test MAE:  2.0913639004632607


Model 2: SVR

In [78]:

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR



# Define the parameter grid
param_grid = {'kernel' : ('linear', 'poly','rbf'),
              'C' : [1,5],
              'degree' : [3,4],
              'coef0' : [0.01]}

# Create the estimator
estimator = SVR()

# Print results with default params:
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Results with default parameters: \n')
print('From param_grid chosen; defaults are:{kernel = rbf, C = 1, degree = 3, coef0 = 0.0}\n' )

print('MAE: ', mean_absolute_error(y_test, predicts))
print('\n \n \n')
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator, param_grid,scoring='neg_mean_absolute_error', cv=3,n_jobs=-1,verbose=2)

# Fit the data to perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score - MAE:", np.abs(best_score))



Results with default parameters: 

From param_grid chosen; defaults are:{kernel = rbf, C = 1, degree = 3, coef0 = 0.0}

MAE:  81.96069677467041

 
 

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'C': 1, 'coef0': 0.01, 'degree': 3, 'kernel': 'linear'}
Best Score - MAE: 1.7106685561604191


test best params

In [None]:
estimator = SVR(C=1,coef0=0.01,degree=3,kernel='linear')
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Test MAE: ', mean_absolute_error(y_test, predicts))



Test MAE:  1.778213781710118


Linear Model - Elastic Net

In [None]:
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV




# Define the parameter grid
param_grid = {'l1_ratio':[1,0.5,0],
              'max_iter':[1000,500,2000],
              'alpha':[0.5,1,0.2]}

# Create the estimator
estimator = linear_model.ElasticNet()

# Print results with default params:
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Results with default parameters: \n')
print('From param_grid chosen; defaults are:{l1_ratio = 0.5, max_iter = 1000, alpha = 1}\n' )
print('MAE: ', mean_absolute_error(y_test, predicts))
print('\n \n \n')
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator, param_grid,scoring='neg_mean_absolute_error', cv=3,n_jobs=-1,verbose=2)

# Fit the data to perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score - MAE:", np.abs(best_score))


Results with default parameters: 

From param_grid chosen; defaults are:{l1_ratio = 0.5, max_iter = 1000, alpha = 1}

MAE:  3.205372919514734

 
 

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'alpha': 1, 'l1_ratio': 1, 'max_iter': 1000}
Best Score - MAE: 1.4803428102089298


test best params

In [None]:
estimator = linear_model.ElasticNet(alpha=1,l1_ratio=1,max_iter=1000)
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Test MAE: ', mean_absolute_error(y_test, predicts))



Test MAE:  1.5391108141716312


MLP

In [None]:
from sklearn.neural_network import MLPRegressor


# Define the parameter grid
param_grid = {'hidden_layer_sizes': [(100,),(50,),(200,)],
              'activation':('logistic','relu'),
              'learning_rate':['constant'],
              'max_iter':[200,500,800]}

# Create the estimator
estimator = MLPRegressor()

# Print results with default params:
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Results with default parameters: \n')
print('From param_grid chosen; defaults are:{hidden_layer_sizes = (100,), activation = relu, learning_rate = constant, max_iter = 200}\n' )
print('MAE: ', mean_absolute_error(y_test, predicts))
print('\n \n \n')
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator, param_grid,scoring='neg_mean_absolute_error', cv=3,n_jobs=-1,verbose=2)

# Fit the data to perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score - MAE:", np.abs(best_score))




Results with default parameters: 

From param_grid chosen; defaults are:{hidden_layer_sizes = (100,), activation = relu, learning_rate = constant, max_iter = 200}

MAE:  4.322760417411814

 
 

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 800}
Best Score - MAE: 3.929843309687623


test best params:

In [None]:
estimator = MLPRegressor(activation='relu',hidden_layer_sizes=(100,),learning_rate='constant',max_iter=800)
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Test MAE: ', mean_absolute_error(y_test, predicts))



Test MAE:  3.2105089156966624


In [79]:
from sklearn.linear_model import Ridge



# Define the parameter grid
param_grid = {'alpha': [0.5,1,3],
              'max_iter':[200,500,800]}

# Create the estimator
estimator = Ridge()

# Print results with default params:
estimator.fit(X_train,y_train)
predicts = estimator.predict(X_test)
print('Results with default parameters: \n')
print('From param_grid chosen; defaults are:{alpha=1.0, max_iter=none, learning_rate = constant, max_iter = 200}\n' )
print('MAE: ', mean_absolute_error(y_test, predicts))
print('\n \n \n')
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator, param_grid,scoring='neg_mean_absolute_error', cv=3,n_jobs=-1,verbose=2)

# Fit the data to perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score - MAE:", np.abs(best_score))


Results with default parameters: 

From param_grid chosen; defaults are:{alpha=1.0, max_iter=none, learning_rate = constant, max_iter = 200}

MAE:  1.5710899753731846

 
 

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best Parameters: {'alpha': 3, 'max_iter': 200}
Best Score - MAE: 1.5531182384699267


In [73]:
import statsmodels.api as sm


model = sm.OLS(y_train, X_train).fit()
predicts = model.predict(X_test)
print(mean_absolute_error(y_test, predicts))
#view model summary
print(model.summary())


1.5721520804712341
                             OLS Regression Results                             
Dep. Variable:     precipitation_sum_mm   R-squared:                       1.000
Model:                              OLS   Adj. R-squared:                  1.000
Method:                   Least Squares   F-statistic:                 1.883e+05
Date:                  Tue, 14 May 2024   Prob (F-statistic):               0.00
Time:                          22:22:42   Log-Likelihood:                -9657.2
No. Observations:                  3907   AIC:                         1.943e+04
Df Residuals:                      3847   BIC:                         1.981e+04
Df Model:                            59                                         
Covariance Type:              nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------