In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Load dataset
data = pd.read_csv('FloridaBikeRentals.csv',encoding="ISO-8859-1")

In [9]:
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day,temp_humidity_interaction
0,01-12-2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,-192.4
1,01-12-2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,-209.0
2,01-12-2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes,-234.0
3,01-12-2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,-248.0
4,01-12-2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes,-216.0


# Task 1: Feature Engineering

In [10]:
# Define numerical and categorical features
num_features = ['Temperature(°C)', 'Humidity(%)', 'Wind speed (m/s)', 
                'Visibility (10m)', 'Dew point temperature(°C)', 
                'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)']

categorical_features = ['Seasons', 'Holiday', 'Functioning Day']

# Handling missing values
# Fill numeric columns with median
data[num_features] = data[num_features].fillna(data[num_features].median())

# Fill categorical columns with mode
data[categorical_features] = data[categorical_features].fillna(data[categorical_features].mode().iloc[0])

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
])

# Preprocess and save the processed dataset
processed_data = preprocessor.fit_transform(data)

# Convert the processed data back to a DataFrame
processed_df = pd.DataFrame(processed_data, columns=num_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out()))
processed_df['Rented Bike Count']=data['Rented Bike Count']
# Save the cleaned data
processed_df.to_csv('bike_rental_features.csv', index=False)

print("Processed data saved successfully.")


Processed data saved successfully.


# Task 2: Model Building

In [12]:
X = processed_df.drop(columns=['Rented Bike Count'])
y = processed_df['Rented Bike Count']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Elastic Net': ElasticNet()
}

param_grid = {
    'Ridge': {'alpha': [0.1, 1, 10]},
    'Lasso': {'alpha': [0.1, 1, 10]},
    'Elastic Net': {'alpha': [0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9]}
}

for name, model in models.items():
    if name in param_grid:
        grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='r2')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
    else:
        best_model = model.fit(X_train, y_train)
    
    y_pred = best_model.predict(X_test)
    print(f"{name} Evaluation:")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R2 Score:", r2_score(y_test, y_pred))

Linear Regression Evaluation:
MAE: 344.0185344541952
MSE: 218093.86422772516
R2 Score: 0.47654906131579244
Ridge Evaluation:
MAE: 343.9508196879352
MSE: 218058.4184308561
R2 Score: 0.476634135399418
Lasso Evaluation:
MAE: 343.9394183178123
MSE: 218057.4565457506
R2 Score: 0.47663644403686245
Elastic Net Evaluation:
MAE: 342.38068984767983
MSE: 218570.859614375
R2 Score: 0.4754042162566524


# Task 3: Model Building with Polynomial Features

In [14]:
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(X_poly, y, test_size=0.2, random_state=42)

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train_poly)
y_pred_poly = poly_model.predict(X_test_poly)

print("Polynomial Regression Evaluation:")
print("MAE:", mean_absolute_error(y_test_poly, y_pred_poly))
print("MSE:", mean_squared_error(y_test_poly, y_pred_poly))
print("R2 Score:", r2_score(y_test_poly, y_pred_poly))

joblib.dump(poly_model, 'best_poly_model.pkl')

# Task 4: Model Evaluation and Validation
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f"{name} Cross-Validation R2 Score: {np.mean(scores)}")

# Task 5: Reporting and Insights
print("Key Findings:Example")
print("- Temperature and Humidity are significant factors influencing rentals.")
print("- Polynomial features improved the R2 score compared to linear models.")
print("- Ridge regression performed better than Lasso, indicating less need for feature selection.")
print("Recommendations:")
print("- Further feature engineering on time-based patterns may improve accuracy.")

Polynomial Regression Evaluation:
MAE: 289.09512920932684
MSE: 171378.44891588186
R2 Score: 0.5886715553740107
Linear Regression Cross-Validation R2 Score: -0.42338968058339627
Ridge Cross-Validation R2 Score: -0.41963527338440515
Lasso Cross-Validation R2 Score: -0.37906853793239464
Elastic Net Cross-Validation R2 Score: -0.7403288109971691
Key Findings:Example
- Temperature and Humidity are significant factors influencing rentals.
- Polynomial features improved the R2 score compared to linear models.
- Ridge regression performed better than Lasso, indicating less need for feature selection.
Recommendations:
- Further feature engineering on time-based patterns may improve accuracy.
