In [12]:
#necessary imports
import joblib
import pickle
import math
import statistics
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet

In [13]:
# Load the assignment dataset; later cells read its 'X' (features) and 'y' (targets) entries.
# NOTE(review): unpickling executes arbitrary code -- only load this file from a trusted source.
data = pd.read_pickle(filepath_or_buffer='appml-assignment1-dataset-v2.pkl')

In [14]:
#split data into features (X) and targets (y)

In [66]:
# Build the feature frame and the target without modifying the objects held in `data`.
# `.assign` returns a new DataFrame, so the frame stored under data['X'] stays exactly
# as it was loaded and this cell can be re-run safely in any order.
timestamps = pd.to_datetime(data['X']['date'])

X = data['X'].assign(
    date=timestamps,                       # parsed datetimes replace the raw 'date' column
    day_of_week=timestamps.dt.dayofweek,   # calendar features consumed by the pipeline later
    hour_of_day=timestamps.dt.hour,
)
y = data['y']

In [16]:
#train-test split

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is random -- if rows are time-ordered, confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
#preprocessing pipeline

In [69]:
# Calendar features derived from 'date'; these are one-hot encoded.
categorical_features = ['day_of_week', 'hour_of_day']

# Every numeric column except the calendar features above.
# select_dtypes(include='number') matches all integer/float widths (int32, float32, ...),
# unlike comparing dtypes against the Python builtins `int`/`float`, whose meaning depends
# on platform and numpy version. Excluding the categorical columns explicitly guarantees
# they are never both scaled and one-hot encoded.
numerical_features = [
    col
    for col in X.select_dtypes(include='number').columns
    if col not in categorical_features
]

# Numeric columns: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler())
])

# Categorical columns: one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Columns not listed in either group are not passed to any transformer here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

In [70]:
#elastic nets, cross validation

In [71]:
# Cross-validated search over a small grid for the best alpha and l1_ratio.
# The iteration cap is raised to address a convergence warning; it is kept in one
# constant so the search and the final fit use the same solver budget.
ENET_MAX_ITER = 5000

model = ElasticNetCV(
    alphas=[0.01, 0.1, 1.0],  #test
    l1_ratio=[0.1, 0.5, 0.9],
    cv=20,
    max_iter=ENET_MAX_ITER
)
model.fit(X_train_preprocessed, y_train)

# Create the final elastic net model with the selected alpha and l1_ratio.
best_alpha = model.alpha_
best_l1_ratio = model.l1_ratio_

# Pass max_iter here as well; otherwise the final fit silently falls back to the
# default iteration cap that the CV step above had to raise.
final_model = ElasticNet(
    alpha=best_alpha,
    l1_ratio=best_l1_ratio,
    max_iter=ENET_MAX_ITER
)
final_model.fit(X_train_preprocessed, y_train)

# RMSE on the held-out test split.
# Results are stored as rmse_test / rmse_train rather than `eval`, which would
# shadow the builtin eval() for the rest of the kernel session.
y_prediction = final_model.predict(X_test_preprocessed)
rmse_test = np.sqrt(mean_squared_error(y_test, y_prediction))
print(rmse_test)

# RMSE on the training split, for comparison against the test error.
y_prediction2 = final_model.predict(X_train_preprocessed)
rmse_train = np.sqrt(mean_squared_error(y_train, y_prediction2))
print(rmse_train)

0.0021500368943677445
0.0021152778996835608


In [73]:
#save model and pipeline -- DONT RUN UNTIL WE LIKE OUR EVAL VAL

In [74]:
# Persist the fitted estimator and the fitted preprocessing pipeline as separate files;
# both are needed to score new data (transform first, then predict).
joblib.dump(value=final_model, filename='model2.pkl')
joblib.dump(value=preprocessing_pipeline, filename='pipeline2.pkl')

['pipeline2.pkl']