In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler 

import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression

Data source: the Seoul Bike Sharing Demand dataset from the UCI Machine Learning Repository:
https://archive.ics.uci.edu/dataset/560/seoul+bike+sharing+demand


In [None]:
# Short, code-friendly column names; they are assigned to df.columns in the next cell.
dataset_cols = [
    "bike_count", "hour", "temp", "humidity", "wind", "visibility",
    "dew_pt_temp", "radiation", "rain", "snow", "functional",
]

# Read the raw CSV and discard the columns this analysis does not use.
df = (
    pd.read_csv("../data_files/SeoulBikeData.csv")
    .drop(columns = ["Date", "Holiday", "Seasons"])
)

In [None]:
# Replace the CSV headers with the short names defined above.
df.columns = dataset_cols
# Encode the "functional" flag as an integer: "Yes" -> 1, anything else -> 0.
df["functional"] = df["functional"].eq("Yes").astype(int)
# Keep only the noon rows; "hour" is then constant (always 12), so drop it.
df = df.loc[df["hour"] == 12].drop(columns = ["hour"])


In [None]:
df.head() # call the method so the first rows are displayed rather than the bound-method repr

In [None]:
# Scatter each feature against the noon bike count to see which ones look
# roughly linear. Features that do not look helpful are candidates to drop.
for col in df.columns[1:]: # column 0 is "bike_count"; start from temperature
    fig, ax = plt.subplots()
    ax.scatter(df[col], df["bike_count"])
    ax.set(title = col, xlabel = col, ylabel = "Bike count at noon")
    plt.show()

In [None]:
# Remove the features that are not used in the models below.
df = df.drop(columns = ["wind", "visibility", "functional"])

In [None]:
df.head(n = 5) # first five rows of the reduced frame

# Train / validate / test
Now we'll shuffle the data and split it into training, validation, and test sets (60% / 20% / 20%).

In [None]:
# Shuffle once with a fixed seed so the split is reproducible across
# Restart & Run All, then cut at the 60% and 80% marks (60/20/20 split).
# Plain iloc slices replace np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
shuffled = df.sample(frac = 1, random_state = 42)
train_end, val_end = int(0.6 * len(df)), int(0.8 * len(df))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [None]:
def get_xy(dataframe, y_label, x_labels = None):
    """Extract feature and target arrays from a DataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table. It is not modified; the returned arrays are copies.
    y_label : str
        Name of the target column.
    x_labels : str or sequence of str, optional
        Feature column(s). A single column may be given as a bare string.
        Defaults to every column except ``y_label``.

    Returns
    -------
    data : numpy.ndarray
        ``X`` and ``y`` stacked column-wise, target in the last column.
    X : numpy.ndarray
        Feature matrix with one column per selected feature.
    y : numpy.ndarray
        Target values as a single-column 2-D array.
    """
    if x_labels is None:
        x_labels = [c for c in dataframe.columns if c != y_label]
    elif isinstance(x_labels, str):
        # A bare string would otherwise be treated as a sequence of characters.
        x_labels = [x_labels]
    else:
        x_labels = list(x_labels)

    # Selecting with a list always yields a 2-D block, so a single feature
    # needs no special case. copy = True keeps the arrays independent of the frame.
    X = dataframe[x_labels].to_numpy(copy = True)
    y = dataframe[y_label].to_numpy(copy = True).reshape(-1, 1)
    data = np.hstack((X, y))

    return data, X, y

In [None]:
# Single-feature arrays: temperature is the only predictor of bike_count here.
temp_only = ["temp"]
_, X_train_temp, y_train_temp = get_xy(train, "bike_count", x_labels = temp_only)
_, X_val_temp, y_val_temp = get_xy(val, "bike_count", x_labels = temp_only)
_, X_test_temp, y_test_temp = get_xy(test, "bike_count", x_labels = temp_only)

In [None]:
# Simple linear regression of noon bike count on temperature alone.
temp_reg = LinearRegression()
temp_reg.fit(X = X_train_temp, y = y_train_temp)

In [None]:
temp_reg.score(X = X_test_temp, y = y_test_temp) # R^2 on the test split; original note: the result is poor

In [None]:
# Overlay the fitted regression line on the training points.
x = tf.linspace(-20, 40, 100) # evenly spaced temperatures across the plotted range
fit_line = temp_reg.predict(np.array(x).reshape(-1,1))

fig, ax = plt.subplots()
ax.scatter(X_train_temp, y_train_temp, label = "Data", color = "blue")
ax.plot(x, fit_line, label = "Fit", color = "red", linewidth = 3)
ax.legend()
ax.set(title = "Bikes vs. temp", xlabel = "Temp", ylabel = "Num of bikes")
plt.show()

# Multiple linear regression

In [None]:
# Multi-feature arrays: every column after the first ("bike_count") is a predictor.
all_features = df.columns[1:]
_, X_train_all, y_train_all = get_xy(train, "bike_count", x_labels = all_features)
_, X_val_all, y_val_all = get_xy(val, "bike_count", x_labels = all_features)
_, X_test_all, y_test_all = get_xy(test, "bike_count", x_labels = all_features)

In [None]:
# Multiple linear regression on all remaining features.
all_reg = LinearRegression()
all_reg.fit(X = X_train_all, y = y_train_all)

In [None]:
all_reg.score(X = X_test_all, y = y_test_all) # R^2 on the test split

# Regression with NN
Here we use TensorFlow again.

In [None]:
def plot_loss(history):
    """Plot training and validation loss per epoch.

    Parameters
    ----------
    history : object with a ``history`` mapping
        Must provide ``history.history['loss']`` and
        ``history.history['val_loss']`` (as returned by ``model.fit`` when
        ``validation_data`` is supplied).
    """
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch') # an epoch is one training cycle, so this is loss over training cycles
    plt.ylabel('MSE')
    plt.legend() # must be called; the bare attribute reference drew no legend
    plt.grid(True)

    plt.show()

In [None]:
# Normalization layer for the single temperature input; axis = None normalizes
# with one scalar mean/variance over all input elements.
temp_normalizer = tf.keras.layers.Normalization(axis = None, input_shape = (1,))
# Learn the mean/variance from the flattened training temperatures.
temp_normalizer.adapt(X_train_temp.ravel())

In [None]:
# Normalizer followed by one Dense unit with no activation: a linear model.
temp_nn_model = tf.keras.Sequential()
temp_nn_model.add(temp_normalizer)
temp_nn_model.add(tf.keras.layers.Dense(1))

In [None]:
# Train with Adam (learning rate 0.1) on mean squared error.
temp_nn_model.compile(
    loss = "mean_squared_error",
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate = 0.1),
)

In [None]:
# Fit silently for 1000 epochs; validation loss is recorded each epoch for plot_loss.
history = temp_nn_model.fit(
    x = X_train_temp.reshape(-1),
    y = y_train_temp,
    validation_data = (X_val_temp, y_val_temp),
    epochs = 1000,
    verbose = 0,
)

In [None]:
plot_loss(history = history) # loss curves for the single-node model

In [None]:
# This fit differs slightly from the LinearRegression one: the single node is
# trained by back propagation, whereas the earlier model simply computes the
# line of best fit.
x = tf.linspace(-20, 40, 100)
fit_line = temp_nn_model.predict(np.array(x).reshape(-1,1))

fig, ax = plt.subplots()
ax.scatter(X_train_temp, y_train_temp, label = "Data", color = "blue")
ax.plot(x, fit_line, label = "Fit", color = "red", linewidth = 3)
ax.legend()
ax.set(title = "Bikes vs. temp", xlabel = "Temp", ylabel = "Num of bikes")
plt.show()

# Neural Net
Here we'll add hidden layers with more nodes, so the model can learn a non-linear fit.

In [None]:
# Fresh normalizer for the temperature input (scalar mean/variance, axis = None).
temp_normalizer = tf.keras.layers.Normalization(axis = None, input_shape = (1,))
temp_normalizer.adapt(X_train_temp.ravel())

# Two hidden ReLU layers of 32 units, then a single output unit.
nn_model = tf.keras.Sequential()
nn_model.add(temp_normalizer)
nn_model.add(tf.keras.layers.Dense(32, activation = 'relu'))
nn_model.add(tf.keras.layers.Dense(32, activation = 'relu'))
# NOTE(review): ReLU on the output clamps predictions to >= 0 - confirm this is intended.
nn_model.add(tf.keras.layers.Dense(1, activation = 'relu'))

nn_model.compile(
    loss = "mean_squared_error",
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate = 0.001),
)

In [None]:
# Fit silently for 100 epochs, tracking validation loss each epoch.
history = nn_model.fit(
    x = X_train_temp,
    y = y_train_temp,
    validation_data = (X_val_temp, y_val_temp),
    epochs = 100,
    verbose = 0,
)

In [None]:
plot_loss(history = history) # loss curves for the temperature-only network

In [None]:
# With hidden ReLU layers the fitted curve is no longer a straight line.
x = tf.linspace(-20, 40, 100)
fit_curve = nn_model.predict(np.array(x).reshape(-1,1))

fig, ax = plt.subplots()
ax.scatter(X_train_temp, y_train_temp, label = "Data", color = "blue")
ax.plot(x, fit_curve, label = "Fit", color = "red", linewidth = 3)
ax.legend()
ax.set(title = "Bikes vs. temp", xlabel = "Temp", ylabel = "Num of bikes")
plt.show()

# The model is still not perfect, particularly in the leftward region where there is no data.
# It would probably have been better to leave that region out of the NN plot altogether.

In [None]:
# Per-feature normalization (axis = -1: one mean/variance per column).
# The feature count is read from the training matrix instead of being
# hard-coded, so this cell keeps working if the set of dropped columns changes.
n_features = X_train_all.shape[1]
all_normalizer = tf.keras.layers.Normalization(input_shape = (n_features,), axis = -1)
all_normalizer.adapt(X_train_all)

In [None]:
# Same architecture as the temperature-only network, but fed all features.
# This rebinds nn_model, replacing the temperature-only network.
nn_model = tf.keras.Sequential()
nn_model.add(all_normalizer)
nn_model.add(tf.keras.layers.Dense(32, activation = 'relu'))
nn_model.add(tf.keras.layers.Dense(32, activation = 'relu'))
# NOTE(review): ReLU on the output clamps predictions to >= 0 - confirm this is intended.
nn_model.add(tf.keras.layers.Dense(1, activation = 'relu'))

nn_model.compile(
    loss = "mean_squared_error",
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate = 0.001),
)

In [None]:
# Fit silently for 100 epochs on all features, tracking validation loss.
history = nn_model.fit(
    x = X_train_all,
    y = y_train_all,
    validation_data = (X_val_all, y_val_all),
    epochs = 100,
    verbose = 0,
)

In [None]:
plot_loss(history = history) # loss curves for the all-feature network

In [None]:
# Test-set predictions from both models, used below to compare the mean
# squared error of the linear regressor against the neural net.
y_pred_lr = all_reg.predict(X = X_test_all)
y_pred_nn = nn_model.predict(x = X_test_all)

In [None]:
def MSE(y_pred, y_real):
    """Return the mean squared error between predictions and true values.

    Parameters
    ----------
    y_pred, y_real : array-like
        Predicted and true values. If the two shapes differ only by
        singleton axes (e.g. ``(n,)`` vs ``(n, 1)``), both are squeezed
        first so the comparison stays element-wise.

    Returns
    -------
    float
        Mean of the squared element-wise differences.
    """
    y_pred = np.asarray(y_pred)
    y_real = np.asarray(y_real)
    if y_pred.shape != y_real.shape:
        # Without this, (n,) vs (n, 1) broadcasts to an (n, n) matrix and
        # silently yields the wrong error.
        squeezed_pred, squeezed_real = np.squeeze(y_pred), np.squeeze(y_real)
        if squeezed_pred.shape == squeezed_real.shape:
            y_pred, y_real = squeezed_pred, squeezed_real
    return np.square(y_pred - y_real).mean()

In [None]:
MSE(y_pred = y_pred_lr, y_real = y_test_all) # test MSE of the linear regressor

In [None]:
MSE(y_pred = y_pred_nn, y_real = y_test_all) # test MSE of the neural net
# Original observation: the neural net gives a larger error than the linear regressor here.

In [None]:
# Predicted vs. true bike counts for both models. Points on the red
# diagonal are perfect predictions.

ax = plt.axes(aspect = "equal")
plt.scatter(y_test_all, y_pred_lr, label = "Lin Reg Preds")
plt.scatter(y_test_all, y_pred_nn, label = "NN Preds")
plt.xlabel("True Values")
plt.ylabel("Predictions")
lims = [0, 1800]
# Call the limit setters. Assigning to plt.xlim / plt.ylims set no limits and
# replaced the pyplot function with a list.
plt.xlim(lims)
plt.ylim(lims)
plt.legend()
_ = plt.plot(lims, lims, c="red")
