In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, BatchNormalization
from tensorflow.keras import Sequential

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score

1.5.2


In [71]:
# Load the bike-rental records straight from the course repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bikes.csv'
)

# Cell output: (rows, columns).
df.shape

(112475, 12)

### Split up dates

In [68]:
# Convert the date strings to datetimes, then derive calendar features
# from the parsed column through a single .dt accessor.
df['dteday'] = pd.to_datetime(df['dteday'])
parsed_dates = df['dteday'].dt

df['day_of_week'] = parsed_dates.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['day'] = parsed_dates.day                # day of the month
df['month'] = parsed_dates.month            # 1 (January) ... 12 (December)

### Add new cols

In [77]:
# Number of rows recorded for each date that is flagged as a holiday.
df.loc[df['holiday'] == 1, 'dteday'].value_counts()

dteday
1/17/2011     24
12/25/2019    24
5/27/2019     24
7/4/2019      24
9/2/2019      24
              ..
11/27/2014    24
11/11/2014    24
10/13/2014    24
9/1/2014      24
10/9/2023     24
Name: count, Length: 142, dtype: int64

In [78]:
# Number of rows recorded for each date that is not a working day.
df.loc[df['workingday'] == 0, 'dteday'].value_counts()

dteday
1/1/2011     24
7/6/2019     24
8/4/2019     24
8/3/2019     24
7/28/2019    24
             ..
3/10/2019    23
3/13/2011    23
3/13/2016    23
3/10/2013    23
3/9/2014     23
Name: count, Length: 1480, dtype: int64

In [None]:
# Date window treated as the COVID-19 period (both endpoints are inclusive).
covid_start_date = pd.to_datetime('2020-03-14')
covid_end_date = pd.to_datetime('2021-12-31')

# Create a new column 'during_covid' that is 1 if the date is during the
# COVID-19 period, otherwise 0. The comparison is vectorized over the whole
# datetime column instead of calling a Python lambda once per row; missing
# dates compare as False and therefore map to 0, exactly as before.
in_covid_window = (df['dteday'] >= covid_start_date) & (df['dteday'] <= covid_end_date)
df['during_covid'] = in_covid_window.astype('int64')

# Every date-derived feature now exists, so the raw date column is removed.
df = df.drop(['dteday'], axis=1)

# Cell output: how many rows fall inside / outside the window.
df['during_covid'].value_counts()

during_covid
0    96373
1    16102
Name: count, dtype: int64

### Create target column

In [56]:
# The prediction target is the combined rider count: casual plus registered.
df['total'] = df['casual'].add(df['registered'])

# Remove the two components; together they determine the target exactly.
df.drop(columns=['casual', 'registered'], inplace=True)

df.head()

Unnamed: 0,hr,temp_c,feels_like_c,hum,windspeed,weathersit,season,holiday,workingday,day_of_week,day,month,total
0,0.0,3.0,3.0,0.7957,0.8,1,1,0,0,5,1,1,16
1,1.0,1.7,1.7,0.8272,0.8,1,1,0,0,5,1,1,38
2,2.0,1.9,1.9,0.8157,1.1,1,1,0,0,5,1,1,31
3,3.0,2.5,2.5,0.7831,0.8,1,1,0,0,5,1,1,12
4,4.0,2.0,2.0,0.8075,1.1,1,1,0,0,5,1,1,1


In [None]:
# Median of `total` for each calendar month, as a two-column frame (month, total).
monthly_median = df.groupby('month', as_index=False)['total'].median()

px.bar(
    monthly_median,
    x='month',
    y='total',
    title='Median Total Users per Month',
)

In [None]:
# Count the rows contributed by each month and order them January..December.
monthly_counts = (
    df['month']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='count')
    .sort_values('month')
)

# Bar chart of the per-month row counts.
fig = px.bar(
    monthly_counts,
    x='month',
    y='count',
    title='Total Number of Rows per Month',
)

fig.show()

In [57]:

# Median of `total` for each month.
monthly_median = df.groupby('month', as_index=False)['total'].median()

# Number of rows available for each month.
monthly_counts = (
    df['month']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='count')
)

# Join the two summaries on month, divide the median by the row count,
# and keep the months in calendar order.
merged_df = (
    monthly_median
    .merge(monthly_counts, on='month')
    .assign(ratio=lambda frame: frame['total'] / frame['count'])
    .sort_values('month')
)

px.bar(
    merged_df,
    x='month',
    y='ratio',
    title='Median Total Users per Month Adjusted for Number of Records',
)

In [52]:
# Human-readable names for the numeric weather codes.
weather_labels = {
    1: 'Clear',
    2: 'Mist',
    3: 'Light Storm',
    4: 'Heavy Storm'
}

# Median of `total` under each weather code, then swap the codes for their labels.
weather_effect = df.groupby('weathersit', as_index=False)['total'].median()
weather_effect['weathersit'] = weather_effect['weathersit'].map(weather_labels)

# Bar chart: one bar per weather condition.
axis_titles = {'weathersit': 'Weather Condition', 'total': 'Median Total Users'}
fig = px.bar(
    weather_effect,
    x='weathersit',
    y='total',
    title='Effect of Different Weather Conditions on Users',
    labels=axis_titles,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.show()

In [None]:
# Average `total` within each of (up to) 10 humidity bins.
humidity_fig = px.histogram(
    df,
    x='hum',
    y='total',
    histfunc='avg',
    nbins=10,
    title='Total Users Depending on Humidity',
    labels={'hum': 'Humidity', 'total': 'Total Users'},
)
humidity_fig

In [58]:
# Pairwise correlations between every column of the frame.
corr_matrix = df.corr()
axis_names = corr_matrix.columns

# Heatmap on a diverging red/blue scale pinned to the full [-1, 1] range.
px.imshow(
    corr_matrix,
    x=axis_names,
    y=axis_names,
    zmin=-1,
    zmax=1,
    color_continuous_scale=px.colors.diverging.RdBu,
    labels={'color': 'Correlation'},
    title='Correlation Matrix',
)

### Get data ready for training

In [None]:
# One-hot encode the calendar features. Each fitted encoder stays in its own
# notebook-level name so the same category mapping can be applied to other
# frames later with .transform().
dow_encoder = OneHotEncoder()
day_encoder = OneHotEncoder()
month_encoder = OneHotEncoder()

dow_encoded = dow_encoder.fit_transform(df[['day_of_week']]).toarray()
day_encoded = day_encoder.fit_transform(df[['day']]).toarray()
month_encoded = month_encoder.fit_transform(df[['month']]).toarray()

# Wrap the dense arrays in DataFrames that share df's index. pd.concat(axis=1)
# aligns rows by index label, so a fresh 0..n-1 index would silently produce
# NaN-padded, misaligned rows if df's index were ever not the default range.
dow_encoded = pd.DataFrame(dow_encoded, index=df.index,
                           columns=[f'day_of_week_{i}' for i in range(7)])
day_encoded = pd.DataFrame(day_encoded, index=df.index,
                           columns=[f'day_{i}' for i in range(1, 32)])
month_encoded = pd.DataFrame(month_encoded, index=df.index,
                             columns=[f'month_{i}' for i in range(1, 13)])

# Concatenate the encoded features with the original dataframe
df_encoded = pd.concat([df, dow_encoded, day_encoded, month_encoded], axis=1)

# Drop the integer date columns now that their one-hot versions are present.
df_encoded = df_encoded.drop(['day_of_week', 'day', 'month'], axis=1)

In [None]:
# Separate the features from the `total` target, then hold out 20% of the
# rows for testing (fixed random_state so the split is repeatable).
X = df_encoded.drop(columns='total')
y = df_encoded['total']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

In [47]:
# Learn the per-feature scaling from the training rows only.
norm = MinMaxScaler()
norm.fit(X_train)

# Apply that same fitted scaling to both splits; the names are rebound to
# the transformed outputs.
X_train, X_test = norm.transform(X_train), norm.transform(X_test)

### Make the model

In [15]:
# Tried:
# Model 1: 35000 MSE
# Dense(64, activation='sigmoid', input_dim=num_features),
# Dense(128, activation='sigmoid'),
# Dense(64, activation='sigmoid'),
# Dense(1, activation='swish')

# Model 2: 12719 MSE,
# Dense(128, activation='relu', input_dim=num_features),
# Dense(64, activation='relu'),
# Dense(32, activation='relu'),
# Dense(1)

# Model 3: 16359 MSE, 17664 MSE
# Dense(128, activation='relu', input_dim=num_features),
# BatchNormalization(),
# Dense(64, activation='relu'),
# BatchNormalization(),
# Dense(32, activation='relu'),
# BatchNormalization(),
# Dense(1)

# Model 4: val_rmse: 157
# Dense(128, activation='relu', input_dim=num_features),
# Dropout(.4),
# Dense(64, activation='relu'),
# Dropout(.4),
# Dense(32, activation='relu'),
# Dense(1)

# Model 5: val_rmse: 145. r2=0.80986
# Dense(128, activation='relu', input_dim=num_features),
# BatchNormalization(),
# Dropout(.3),
# Dense(64, activation='relu'),
# BatchNormalization(),
# Dropout(.3),
# Dense(32, activation='relu'),
# Dense(1)

# Model 6:
# Dense(256, activation='relu', input_dim=num_features),
# BatchNormalization(),
# Dropout(.2),
# Dense(128, activation='relu'),
# BatchNormalization(),
# Dropout(.2),
# Dense(64, activation='relu'),
# Dense(1)

# Number of input features = number of columns in the training matrix.
num_features = X_train.shape[1]

# Fully connected regression network: three hidden ReLU layers, with batch
# normalization and 20% dropout after the first two, and a single linear
# output unit. The input shape is declared with an explicit keras.Input
# entry rather than `input_dim=` on the first Dense layer, which current
# Keras versions deprecate and warn about.
model = Sequential([
  keras.Input(shape=(num_features,)),
  Dense(256, activation='relu'),
  BatchNormalization(),
  Dropout(.2),
  Dense(128, activation='relu'),
  BatchNormalization(),
  Dropout(.2),
  Dense(64, activation='relu'),
  Dense(1)
])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Adam optimizer; MSE is the training loss and RMSE is tracked as a metric.
opt = keras.optimizers.Adam(learning_rate=0.0004)
model.compile(loss='mse', optimizer=opt, metrics=[keras.metrics.RootMeanSquaredError()])

# Stop once validation RMSE has not improved for 15 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without
# it the model keeps the weights from the final epoch, i.e. 15 epochs past
# the best validation score.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_root_mean_squared_error',
    patience=15,
    mode='min',
    restore_best_weights=True,
)

# validation_split=.20 sets aside part of the training data for validation;
# shuffle=False keeps the training batch order fixed from epoch to epoch.
history = model.fit(X_train, y_train, epochs=500, validation_split=.20, batch_size=25, callbacks=[early_stop], shuffle=False)

# One row per epoch, one column per logged loss/metric.
hist = pd.DataFrame(history.history)

Epoch 1/500
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - loss: 126110.4844 - root_mean_squared_error: 350.4022 - val_loss: 49358.7227 - val_root_mean_squared_error: 222.1682
Epoch 2/500
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 55159.0820 - root_mean_squared_error: 234.8385 - val_loss: 43039.8438 - val_root_mean_squared_error: 207.4605
Epoch 3/500
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 49848.4688 - root_mean_squared_error: 223.2508 - val_loss: 38903.9414 - val_root_mean_squared_error: 197.2408
Epoch 4/500
[1m2465/2880[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 2ms/step - loss: 45994.9609 - root_mean_squared_error: 214.4462

### Plot loss

In [None]:
# Validation RMSE against the row index of the history frame (one row per epoch).
loss_fig = px.line(
    hist,
    x=hist.index,
    y='val_root_mean_squared_error',
)
loss_fig

### Predict values

In [None]:
# Predict on the held-out test rows, then round to one decimal place.
raw_predictions = model.predict(X_test)
predictions = np.round(raw_predictions, decimals=1)
predictions

In [None]:
# Root-mean-squared error of the rounded predictions on the test split.
result = root_mean_squared_error(y_true=y_test, y_pred=predictions)
result

In [None]:
# Coefficient of determination (R^2) of the rounded predictions on the test split.
result = r2_score(y_true=y_test, y_pred=predictions)
result

In [None]:
# pred = pd.DataFrame(predictions,columns=['predictions'])
# pred
# pred['actual'] = y_test.tolist()
# pred

# pred['difference'] = pred['actual'] - pred['predictions']
# pred

# import seaborn as sns
# import matplotlib.pyplot as plt

# xlims =(0,55)
# ax = sns.scatterplot(data=pred,x='actual',y='predictions')
# ax.plot(xlims,xlims, color='r')
# plt.show()

In [None]:
#model.save('levi_model.keras')

### Predict mini holdout

In [None]:
mini = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/biking_holdout_test_mini.csv')

# Parse the date column once and derive the same calendar features that
# were built for the training frame.
mini_dates = pd.to_datetime(mini['dteday'])
mini['day_of_week'] = mini_dates.dt.dayofweek
mini['day'] = mini_dates.dt.day
mini['month'] = mini_dates.dt.month

# Recreate the COVID-period flag. The training features include
# 'during_covid', so the holdout frame must carry it as well; otherwise the
# scaler is handed a different feature set than it was fitted on.
# covid_start_date / covid_end_date come from the earlier feature cell.
mini['during_covid'] = (
    (mini_dates >= covid_start_date) & (mini_dates <= covid_end_date)
).astype('int64')

# One-hot encode with the encoders that were fitted on the training data.
day_of_week_encoded = dow_encoder.transform(mini[['day_of_week']]).toarray()
day_encoded = day_encoder.transform(mini[['day']]).toarray()
month_encoded = month_encoder.transform(mini[['month']]).toarray()

# Concatenate the encoded features with the original dataframe. The encoded
# frames reuse mini's index so the column-wise concat aligns row for row.
mini_encoded = pd.concat([
                mini,
                pd.DataFrame(day_of_week_encoded, index=mini.index, columns=[f'day_of_week_{i}' for i in range(7)]),
                pd.DataFrame(day_encoded, index=mini.index, columns=[f'day_{i}' for i in range(1, 32)]),
                pd.DataFrame(month_encoded, index=mini.index, columns=[f'month_{i}' for i in range(1, 13)])
              ], axis=1)

# Drop the original features
mini_encoded.drop(['dteday', 'day_of_week', 'day', 'month'], axis=1, inplace=True)

# Keep exactly the columns the scaler was fitted on, in the fitted order.
# A missing feature raises a KeyError here instead of failing inside the scaler.
mini_encoded = mini_encoded[list(norm.feature_names_in_)]

print(mini_encoded.columns)

# transform data
mini_encoded = norm.transform(mini_encoded)

# Predict, round to one decimal place, and write a single 'predictions' column.
mini_pred = np.round(model.predict(mini_encoded), 1)

mini_pred = pd.DataFrame(mini_pred, columns = ['predictions'])
mini_pred.to_csv("levi-predictions.csv", index=False)