### Setup

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
# Uncomment to unmount first when a stale mount needs to be reset:
#drive.flush_and_unmount()
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Read the reduced training set, using the `id` column as the row index.
climate = pd.read_csv(
    "./drive/MyDrive/Colaboratory Notebooks/WiDS22_Climate_Change/Data/train_reduce.csv",
    index_col="id",
)
climate.shape

(48439, 73)

# Models
**Predicting a continuous value**

- Models that allow only numeric data, models that allow both numeric and categorical data, and models that allow only categorical data
- Run multiple types of models at once and get score
- Ensemble models
- Hyperparameter tuning
- CV selection for training and testing set
- Boosting
- Change: parameters, features, re-process
- Features as parameter tuning
- Feature selection algorithms
- Try a model with the principal components as features; set the variance to retain -> number of components

# Linear Regression

https://egghead.io/lessons/scikit-learn-use-linear-regression-to-estimate-continuous-values-with-python-and-scikit-learn

https://vitalflux.com/mean-square-error-r-squared-which-one-to-use/

https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Separate the target (site_eui) from the covariates, then hold out 20% of the rows for testing.
y = climate['site_eui']
X = climate.drop(columns=['site_eui'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12345,  # fixed seed so the split is reproducible
)

In [None]:
# Linear Model
# Ordinary least-squares fit on all covariates; `linear` is reused by the scoring cells that follow.
linear = LinearRegression()
linear.fit(X_train, y_train)
# TODO: Pipeline to standardize

LinearRegression()

In [None]:
# R^2 (coefficient of determination) on the train and test splits.
# For a regressor, .score() reports R^2, not classification accuracy.
print('Train data score: ', round(linear.score(X_train, y_train), 2))
print('Test data score: ', round(linear.score(X_test, y_test), 2))

Train data score:  0.51
Test data score:  0.53


In [None]:
# Held-out error metrics for the linear model.
predictions = linear.predict(X_test)
# Compute the mean squared error once and reuse it for the RMSE.
linear_mse = metrics.mean_squared_error(y_test, predictions)
# R squared, the coefficient of determination (same value as linear.score(X_test, y_test))
print("R^2: " + str(r2_score(y_test, predictions)))
# mean squared error
print("MSE: " + str(linear_mse))
# root mean squared error
print("RMSE: " + str(np.sqrt(linear_mse)))
# mean absolute error
print("MAE: " + str(metrics.mean_absolute_error(y_test, predictions)))

R^2: 0.5315159539951438
MSE: 1282.2892047294888
RMSE: 35.80906595723336
MAE: 18.852940574408404


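R^2: at most 1, where 1 means the model fits the data perfectly (MSE of 0). It usually falls between 0 and 1, but it can be negative on held-out data when the model predicts worse than always guessing the mean. Our value means about half of the variation in site EUI was explained by the covariates.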

MSE: a value close to 0 will represent a better regression model (MSE of 0 represents a perfect predictor)

RMSE: lower the better

In [None]:
# Save predictions as a DataFrame.
print(predictions)
# Reuse X_test's index (the `id` values) instead of a fresh 0..n-1 RangeIndex, so every
# predicted site_eui stays attached to the row it was predicted for and can be aligned
# with y_test or joined back onto the original data.
predictions_df = pd.DataFrame({'site_eui': predictions}, index=X_test.index)
predictions_df.sample(4)

[ 45.6261513   47.08265425  50.93392791 ...  79.33241869  92.28536781
 101.86347912]


Unnamed: 0,site_eui
6298,93.15
1625,80.18
2116,104.01
9058,78.41


In [None]:
predictions_df.shape # one row per test-split sample (20% of the 48439 rows); predict on X instead of X_test to cover every row

(9688, 1)

# Lasso & Gradient Boosting

In [None]:
# Lasso Model
# L1-regularised linear regression; alpha=0.1 is the penalty strength.
# NOTE(review): the features are passed unscaled and the L1 penalty is scale-sensitive —
# consider wrapping this in a StandardScaler pipeline; confirm how train_reduce.csv was preprocessed.
from sklearn import linear_model
reg_lasso = linear_model.Lasso(alpha=0.1)
reg_lasso.fit(X_train, y_train)

Lasso(alpha=0.1)

In [None]:
# R^2 (coefficient of determination) on the train and test splits.
# For a regressor, .score() reports R^2, not classification accuracy.
print('Train data score: ', round(reg_lasso.score(X_train, y_train), 2))
print('Test data score: ', round(reg_lasso.score(X_test, y_test), 2))

Train data score:  0.48
Test data score:  0.51


In [None]:
# Gradient Boosted Regressor Model
# Default hyperparameters; random_state is fixed so the fitted model is reproducible across runs.
from sklearn.ensemble import GradientBoostingRegressor
gb_reg = GradientBoostingRegressor(random_state = 12345)
gb_reg.fit(X_train, y_train)

GradientBoostingRegressor(random_state=12345)

In [None]:
# R^2 (coefficient of determination) on the train and test splits.
# For a regressor, .score() reports R^2, not classification accuracy.
print('Train data score: ', round(gb_reg.score(X_train, y_train), 2))
print('Test data score: ', round(gb_reg.score(X_test, y_test), 2))

Train data score:  0.66
Test data score:  0.66


# Random Forest Regressor & Support Vector Regressor Model

In [None]:
# Random Forest Regressor
# The forest draws random bootstrap samples and feature subsets, so without a seed the scores
# below change on every run. Fix random_state (same seed as the train/test split and the
# gradient-boosting model) so that Restart & Run All reproduces the reported numbers.
from sklearn.ensemble import RandomForestRegressor
ran_reg = RandomForestRegressor(random_state=12345)
ran_reg.fit(X_train, y_train)

RandomForestRegressor()

In [None]:
# R^2 (coefficient of determination) on the train and test splits.
# For a regressor, .score() reports R^2, not classification accuracy.
# A large train/test gap here indicates the forest is overfitting the training split.
print('Train data score: ', round(ran_reg.score(X_train, y_train), 2))
print('Test data score: ', round(ran_reg.score(X_test, y_test), 2))

Train data score:  0.96
Test data score:  0.72


In [None]:
# Held-out error metrics for the random forest.
y_pred = ran_reg.predict(X_test)
# Compute the mean squared error once and reuse it for the RMSE.
rf_mse = metrics.mean_squared_error(y_test, y_pred)
print("R^2: " + str(r2_score(y_test, y_pred)))
print("MSE: " + str(rf_mse))
print("RMSE: " + str(np.sqrt(rf_mse)))
print("MAE: " + str(metrics.mean_absolute_error(y_test, y_pred)))

R^2: 0.7179964245055391
MSE: 771.8729028990589
RMSE: 27.782600722377644
MAE: 14.452064627444642


In [None]:
# Support Vector Regressor (SVR) Model
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize every feature before the RBF-kernel SVR. Putting the scaler inside the
# pipeline means it is fitted on the training split only and re-applied at predict time.
regressor = make_pipeline(StandardScaler(), SVR(kernel = 'rbf'))
regressor.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()), ('svr', SVR())])

In [None]:
# R^2 (coefficient of determination) on the train and test splits.
# For a regression pipeline, .score() reports R^2, not classification accuracy.
print('Train data score: ', round(regressor.score(X_train, y_train), 2))
print('Test data score: ', round(regressor.score(X_test, y_test), 2))

Train data score:  0.39
Test data score:  0.41


In [None]:
# Held-out error metrics for the SVR pipeline.
y_pred_svr = regressor.predict(X_test)
# Compute the mean squared error once and reuse it for the RMSE.
svr_mse = metrics.mean_squared_error(y_test, y_pred_svr)
print("R^2: " + str(r2_score(y_test, y_pred_svr)))
print("MSE: " + str(svr_mse))
print("RMSE: " + str(np.sqrt(svr_mse)))
print("MAE: " + str(metrics.mean_absolute_error(y_test, y_pred_svr)))

R^2: 0.41110900375699877
MSE: 1611.8554595068686
RMSE: 40.14791974071469
MAE: 18.750404778651635


# Principal component features with sequential deep learning model

# Multi-Models

- Run multiple types of models at once: https://towardsdatascience.com/quickly-test-multiple-models-a98477476f0
- Trying to predict a continuous value
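- Can classification models like decision trees be used for predicting continuous values? (Not directly: classifiers predict discrete labels, so use their regressor counterparts, e.g. `DecisionTreeRegressor`.)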

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn import metrics
np.random.seed(12345)

In [None]:
# Rebuild the train/test split for this section (same data, test size and seed as the
# split used for the individual models above).
y = climate['site_eui']
X = climate.drop(columns=['site_eui'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12345,  # fixed seed so the split is reproducible
)

In [None]:
# Train models
# Only the linear regressor is active. The classifiers below are left disabled because
# site_eui is a continuous target and scikit-learn classifiers need discrete class labels.
linear = LinearRegression()
linear.fit(X_train, y_train)

# NB = MultinomialNB()
# NB.fit(X_train, y_train)

# KNN = KNeighborsClassifier(n_neighbors = 20)
# KNN.fit(X_train, y_train)

# logreg = LogisticRegression(solver = 'lbfgs', C=1e9, max_iter = 1000000000)
# logreg.fit(X_train, y_train)

# DT = DecisionTreeClassifier(random_state = 12345)
# DT.fit(X_train, y_train)

# # Instantiate model with 100 decision trees
# RF = RandomForestClassifier(random_state = 12345)
# RF.fit(X_train, y_train)

In [None]:
# Evaluate the linear model on the held-out split.
predictions = linear.predict(X_test)
# r squared, coefficient of determination
print("r squared: " + str(linear.score(X_test, y_test)))
# mean squared error
print("mean squared error: " + str(metrics.mean_squared_error(y_test, predictions)))

# Disabled: accuracy_score compares discrete labels, so it does not apply to the
# continuous site_eui target (and the classifiers it refers to are not trained above).
# print("accuracy of Linear: " + str(metrics.accuracy_score(y_test, linear.predict(X_test))))
# print("accuracy of NB: " + str(metrics.accuracy_score(y_test, NB.predict(X_test))))
# print("accuracy of KNN: " + str(metrics.accuracy_score(y_test, KNN.predict(X_test))))
# print("accuracy of logistic: " + str(metrics.accuracy_score(y_test, logreg.predict(X_test))))
# print("accuracy of DT: " + str(metrics.accuracy_score(y_test, DT.predict(X_test))))
# print("accuracy of RF: " + str(metrics.accuracy_score(y_test, RF.predict(X_test))))

In [None]:
def run_exps(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series,
             models=None, scoring=None, target_names=None) -> pd.DataFrame:
    '''
    Lightweight script to test many models and find winners.

    Each model is 5-fold cross-validated on the training split, then refitted on the
    whole training split and evaluated once on the test split (printed to stdout).

    The task is detected from ``y_train``: a continuous target (such as ``site_eui``)
    gets a default suite of regressors scored with regression metrics; any other
    target keeps the classifier suite and classification metrics.

    :param X_train: training split
    :param y_train: training target vector
    :param X_test: test split
    :param y_test: test target vector
    :param models: optional list of ``(name, estimator)`` pairs; defaults to a suite
        matching the detected task
    :param scoring: optional list of scikit-learn scorer names for cross-validation;
        defaults to metrics matching the detected task
    :param target_names: optional class display names for the classification report
        (ignored for regression); ``None`` uses the labels found in the data
    :return: DataFrame of per-fold cross-validation results (fit/score times and one
        ``test_<metric>`` column per scorer) with a ``model`` column naming the estimator
    :raises ValueError: if ``models`` is given but empty
    '''
    # Local imports keep this cell runnable regardless of which other cells have been executed.
    from sklearn.utils.multiclass import type_of_target

    # Classifiers and classification scorers reject a continuous target, so pick the task from the data.
    is_regression = type_of_target(y_train) in ('continuous', 'continuous-multioutput')

    if models is None:
        if is_regression:
            from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
            from sklearn.linear_model import Lasso, LinearRegression
            from sklearn.neighbors import KNeighborsRegressor
            models = [
                ('Linear', LinearRegression()),
                ('Lasso', Lasso(alpha=0.1)),
                ('KNN', KNeighborsRegressor()),
                ('RF', RandomForestRegressor(random_state=12345)),
                ('GB', GradientBoostingRegressor(random_state=12345)),
            ]
        else:
            models = [
                ('LogReg', LogisticRegression()),
                ('RF', RandomForestClassifier()),
                ('KNN', KNeighborsClassifier()),
                ('SVM', SVC()),
                ('GNB', GaussianNB()),
                ('XGB', XGBClassifier()),
            ]
    models = list(models)
    if not models:
        raise ValueError('run_exps needs at least one (name, model) pair')

    if scoring is None:
        if is_regression:
            scoring = ['r2', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
        else:
            scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']

    # One seeded splitter shared by all models, so every model is scored on identical folds.
    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)

    dfs = []  # one frame of CV results per model
    for name, model in models:
        # cross_validate works on clones; the explicit fit below trains the model used for the test report.
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        fitted = model.fit(X_train, y_train)
        y_pred = fitted.predict(X_test)

        print(name)
        if is_regression:
            from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
            test_mse = mean_squared_error(y_test, y_pred)
            print("R^2: " + str(r2_score(y_test, y_pred)))
            print("MSE: " + str(test_mse))
            print("RMSE: " + str(test_mse ** 0.5))
            print("MAE: " + str(mean_absolute_error(y_test, y_pred)))
        else:
            # target_names=None lets the report use the labels present in the data instead of
            # hard-coded names that only fit one specific binary dataset.
            print(classification_report(y_test, y_pred, target_names=target_names))

        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)

    return pd.concat(dfs, ignore_index=True)