<a href="https://colab.research.google.com/github/iCarrin/WelcomeBike-DC/blob/main/Copy_of_starter_bikes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Source CSV of hourly bike-rental records, fetched over HTTP from the course repository
BIKES_URL = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bikes.csv'
bikes = pd.read_csv(BIKES_URL)

In [None]:
# Show the loaded DataFrame as this cell's output (pandas truncates the middle rows when rendering)
bikes

Unnamed: 0,dteday,hr,casual,registered,temp_c,feels_like_c,hum,windspeed,weathersit,season,holiday,workingday
0,1/1/2011,0.0,3,13,3.0,3.0,0.7957,0.8,1,1,0,0
1,1/1/2011,1.0,8,30,1.7,1.7,0.8272,0.8,1,1,0,0
2,1/1/2011,2.0,5,26,1.9,1.9,0.8157,1.1,1,1,0,0
3,1/1/2011,3.0,3,9,2.5,2.5,0.7831,0.8,1,1,0,0
4,1/1/2011,4.0,0,1,2.0,2.0,0.8075,1.1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
112470,10/31/2023,19.0,241,594,10.2,10.2,0.4516,8.4,2,4,0,1
112471,10/31/2023,20.0,171,450,8.9,7.5,0.5268,9.3,1,4,0,1
112472,10/31/2023,21.0,176,402,7.7,6.4,0.5756,7.8,1,4,0,1
112473,10/31/2023,22.0,106,257,8.0,8.0,0.5604,0.0,1,4,0,1


In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112475 entries, 0 to 112474
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   dteday        112475 non-null  object 
 1   hr            112475 non-null  float64
 2   casual        112475 non-null  int64  
 3   registered    112475 non-null  int64  
 4   temp_c        112475 non-null  float64
 5   feels_like_c  112475 non-null  float64
 6   hum           112475 non-null  float64
 7   windspeed     112475 non-null  float64
 8   weathersit    112475 non-null  int64  
 9   season        112475 non-null  int64  
 10  holiday       112475 non-null  int64  
 11  workingday    112475 non-null  int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 10.3+ MB


In [None]:
# Count missing values per column
bikes.isna().sum()

Unnamed: 0,0
dteday,0
hr,0
casual,0
registered,0
temp_c,0
feels_like_c,0
hum,0
windspeed,0
weathersit,0
season,0


In [None]:
# Parse the date strings into datetime values (the column is overwritten in place)
bikes['dteday'] = pd.to_datetime(bikes['dteday'])

# List the distinct codes that occur in each categorical column
for label, column in [
    ('Season', 'season'),
    ('Weathersit', 'weathersit'),
    ('Holiday', 'holiday'),
    ('Workingday', 'workingday'),
]:
    print(f"{label} unique values:", bikes[column].unique())

# Report the observed minimum and maximum of selected numeric columns
for label, column in [('Hour', 'hr'), ('Humidity', 'hum')]:
    print(f"{label} range:", bikes[column].min(), bikes[column].max())

Season unique values: [1 2 3 4]
Weathersit unique values: [1 2 3 4]
Holiday unique values: [0 1]
Workingday unique values: [0 1]
Hour range: 0.0 23.0
Humidity range: 0.0889 1.0


In [None]:
# Target column: casual riders plus registered riders for each row
bikes['total_rentals'] = bikes['casual'].add(bikes['registered'])

# Keep only the rows dated on or before 31 October 2023
on_or_before_cutoff = bikes['dteday'] <= '2023-10-31'
train_data = bikes.loc[on_or_before_cutoff]

# Define preprocessing function
def preprocess_data(df, scaler=None, fit_scaler=True):
    """Encode categorical columns and min-max scale the weather columns.

    The input frame is copied, so the caller's DataFrame is not modified.

    Steps:
      1. Derive ``day_of_week`` (0 = Monday ... 6 = Sunday) from ``dteday``.
      2. One-hot encode ``season``, ``weathersit``, ``holiday``,
         ``workingday`` and ``day_of_week`` against a FIXED set of levels,
         dropping the first level of each as the baseline.
      3. Scale ``temp_c``, ``feels_like_c``, ``hum`` and ``windspeed``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``dteday`` column plus the categorical and
        numerical columns named above.
    scaler : object with a ``transform`` method, optional
        Previously fitted scaler. Required when ``fit_scaler`` is False;
        ignored (replaced by a new ``MinMaxScaler``) when it is True.
    fit_scaler : bool, default True
        When True a new ``MinMaxScaler`` is fitted on ``df``; when False the
        supplied ``scaler`` is applied without refitting.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The transformed copy of ``df`` and the scaler that was used.

    Raises
    ------
    ValueError
        If ``fit_scaler`` is False and no ``scaler`` is supplied.
    """
    # Pin the category levels instead of letting get_dummies infer them from
    # whichever values happen to occur in this frame. With inferred levels,
    # drop_first=True removes the lowest level PRESENT, so a frame that lacks
    # the true baseline (for example one containing only season 4) would lose
    # its only indicator column and be encoded as the baseline.
    # The levels match the value ranges printed earlier in the notebook;
    # values outside them become all-zero indicator rows.
    categorical_levels = {
        'season': [1, 2, 3, 4],
        'weathersit': [1, 2, 3, 4],
        'holiday': [0, 1],
        'workingday': [0, 1],
        'day_of_week': [0, 1, 2, 3, 4, 5, 6],
    }
    numerical_cols = ['temp_c', 'feels_like_c', 'hum', 'windspeed']

    # Fail early with a clear message rather than an AttributeError on None.
    if not fit_scaler and scaler is None:
        raise ValueError("A fitted scaler must be provided when fit_scaler=False")

    df = df.copy()
    # Extract day of week
    df['day_of_week'] = df['dteday'].dt.dayofweek

    # One-hot encode categorical variables against the fixed levels
    for col, levels in categorical_levels.items():
        df[col] = pd.Categorical(df[col], categories=levels)
    df = pd.get_dummies(df, columns=list(categorical_levels), drop_first=True)

    # Scale numerical features
    if fit_scaler:
        scaler = MinMaxScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    else:
        df[numerical_cols] = scaler.transform(df[numerical_cols])
    return df, scaler

# Encode and scale the training rows; keep the fitted scaler for reuse on new data.
# NOTE(review): the scaler is fitted before the train/validation split below, so
# validation rows contribute to the min/max it learns — confirm this is acceptable.
train_data, scaler = preprocess_data(train_data, fit_scaler=True)

# Everything except the target, its two components and the raw date is a feature
non_feature_cols = ['total_rentals', 'casual', 'registered', 'dteday']
X_train_full = train_data.drop(columns=non_feature_cols)
y_train_full = train_data['total_rentals']

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)
for split_name, split_features in (("Training", X_train), ("Validation", X_val)):
    print(f"{split_name} shape:", split_features.shape)

Training shape: (89980, 19)
Validation shape: (22495, 19)


In [None]:
# Fetch the December hold-out file and parse its date column
bikes_december = pd.read_csv(
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bikes_december.csv'
).assign(dteday=lambda frame: pd.to_datetime(frame['dteday']))

# Reuse the scaler fitted on the training data; the returned scaler is not needed
bikes_december, _ = preprocess_data(bikes_december, scaler=scaler, fit_scaler=False)

# Drop the raw date, then force the same columns (and order) as the training
# features; columns absent from the December frame are filled with 0
X_dec = (
    bikes_december
    .drop(columns=['dteday'], errors='ignore')
    .reindex(columns=X_train_full.columns, fill_value=0)
)

print("December data shape:", X_dec.shape)

December data shape: (1465, 19)


In [None]:
# Total number of missing cells in each feature frame
for frame_name, frame in (("X_train", X_train), ("X_val", X_val), ("X_dec", X_dec)):
    print(f"Missing values in {frame_name}:", frame.isna().sum().sum())

# Column dtypes of the training features
print("X_train dtypes:\n", X_train.dtypes)

Missing values in X_train: 0
Missing values in X_val: 0
Missing values in X_dec: 0
X_train dtypes:
 hr               float64
temp_c           float64
feels_like_c     float64
hum              float64
windspeed        float64
season_2            bool
season_3            bool
season_4            bool
weathersit_2        bool
weathersit_3        bool
weathersit_4        bool
holiday_1           bool
workingday_1        bool
day_of_week_1       bool
day_of_week_2       bool
day_of_week_3       bool
day_of_week_4       bool
day_of_week_5       bool
day_of_week_6       bool
dtype: object


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling repeat on a fresh kernel run.
# (Some GPU kernels may still introduce small run-to-run differences.)
keras.utils.set_random_seed(42)

# Build model: two hidden ReLU layers and one linear output unit for regression.
# The input width is declared with an explicit Input object; passing
# `input_shape=` to the first Dense layer makes Keras emit a UserWarning.
model = Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])

# Compile model: optimise mean squared error and also report mean absolute error
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train model: stop once validation loss has not improved for 10 epochs and
# restore the weights from the best epoch seen
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1,
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 110600.3438 - mae: 233.2059 - val_loss: 57060.6797 - val_mae: 162.0734
Epoch 2/100
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 54881.5664 - mae: 157.4466 - val_loss: 50752.8516 - val_mae: 150.9995
Epoch 3/100
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 49969.0586 - mae: 145.8457 - val_loss: 47950.9805 - val_mae: 141.4180
Epoch 4/100
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 47623.3242 - mae: 140.8268 - val_loss: 46855.3555 - val_mae: 137.8887
Epoch 5/100
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 45928.1992 - mae: 137.0004 - val_loss: 46197.7773 - val_mae: 139.1097
Epoch 6/100
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 45970.3789 - mae: 136.1888 - val_loss: 45474.9258 - val_mae: 13

In [None]:
# Run the trained model on the December feature matrix
predictions = model.predict(X_dec)

# Build the output table: date, hour and the flattened prediction column
output = bikes_december[['dteday', 'hr']].copy()
output['total_rentals_pred'] = predictions.reshape(-1)

# Write the table to disk without the index column
output.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")

# Validation-set predictions, kept for the metrics computed afterwards
y_val_pred = model.predict(X_val)

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Predictions saved to predictions.csv
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

# Score the validation predictions with each metric, in the order they are reported
mse, mae, r2, rmse = (
    metric_fn(y_val, y_val_pred)
    for metric_fn in (
        mean_squared_error,
        mean_absolute_error,
        r2_score,
        root_mean_squared_error,
    )
)

# Print every score rounded to two decimals
for metric_label, metric_value in (("MSE", mse), ("MAE", mae), ("R²", r2), ("RMSE", rmse)):
    print(f"Validation {metric_label}: {metric_value:.2f}")

Validation MSE: 25339.20
Validation MAE: 104.22
Validation R²: 0.78
Validation RMSE: 159.18
