In [1]:
import os
import pickle

import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

import mlflow
from mlflow.tracking import MlflowClient

from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK, Trials
from hyperopt.pyll import scope

In [2]:
# Load the April 2022 trip export; both timestamp columns are parsed as
# datetimes at read time so they can be subtracted directly later on.
df = pd.read_csv(
    './data/202204-capitalbikeshare-tripdata.csv',
    parse_dates=['started_at', 'ended_at'],
)

In [3]:
# Preview the first rows of the freshly loaded trip data.
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,8F5ADBABCB4EBE01,classic_bike,2022-04-15 10:09:53,2022-04-15 10:16:12,Maine Ave & 9th St SW,31646.0,Smithsonian-National Mall / Jefferson Dr & 12t...,31248.0,38.88044,-77.025236,38.888774,-77.028694,member
1,F502B0A7034DE321,classic_bike,2022-04-30 20:00:03,2022-05-01 14:27:41,Braddock Rd Metro North,31047.0,Braddock Rd Metro South,31969.0,38.814577,-77.052808,38.813474,-77.053734,casual
2,01BF0E3746A32678,classic_bike,2022-04-20 19:35:59,2022-04-20 19:41:04,11th & V st NW,31332.0,14th & Belmont St NW,31119.0,38.918199,-77.027171,38.921074,-77.031887,member
3,94BD7902E9889076,docked_bike,2022-04-15 17:23:21,2022-04-15 17:48:35,14th & D St NW / Ronald Reagan Building,31231.0,15th & W St NW,31125.0,38.894514,-77.031617,38.919019,-77.034449,casual
4,2CA1C29600E5F00A,classic_bike,2022-04-18 09:04:07,2022-04-18 09:05:12,11th & V st NW,31332.0,11th & V st NW,31332.0,38.918199,-77.027171,38.918199,-77.027171,member


In [4]:
# Check the column dtypes (in particular that the two timestamps were parsed).
df.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id             float64
end_station_name              object
end_station_id               float64
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object

In [5]:
# Trip duration in minutes. The timedelta accessor converts the whole column
# at once instead of calling a Python lambda once per row.
df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
0,8F5ADBABCB4EBE01,classic_bike,2022-04-15 10:09:53,2022-04-15 10:16:12,Maine Ave & 9th St SW,31646.0,Smithsonian-National Mall / Jefferson Dr & 12t...,31248.0,38.88044,-77.025236,38.888774,-77.028694,member,6.316667
1,F502B0A7034DE321,classic_bike,2022-04-30 20:00:03,2022-05-01 14:27:41,Braddock Rd Metro North,31047.0,Braddock Rd Metro South,31969.0,38.814577,-77.052808,38.813474,-77.053734,casual,1107.633333
2,01BF0E3746A32678,classic_bike,2022-04-20 19:35:59,2022-04-20 19:41:04,11th & V st NW,31332.0,14th & Belmont St NW,31119.0,38.918199,-77.027171,38.921074,-77.031887,member,5.083333
3,94BD7902E9889076,docked_bike,2022-04-15 17:23:21,2022-04-15 17:48:35,14th & D St NW / Ronald Reagan Building,31231.0,15th & W St NW,31125.0,38.894514,-77.031617,38.919019,-77.034449,casual,25.233333
4,2CA1C29600E5F00A,classic_bike,2022-04-18 09:04:07,2022-04-18 09:05:12,11th & V st NW,31332.0,11th & V st NW,31332.0,38.918199,-77.027171,38.918199,-77.027171,member,1.083333


In [None]:
# in future can calculate distance from the lat/long info

# Missing station ids become the sentinel -1.
df[['start_station_id', 'end_station_id']] = (
    df[['start_station_id', 'end_station_id']].fillna(-1)
)

# Keep only rides lasting between 1 and 120 minutes (both bounds inclusive).
df = df[df.duration.between(1, 120)]

In [None]:
categorical = ['rideable_type', 'start_station_id', 'end_station_id']
numerical = ['duration']

# The station ids are stored as floats. DictVectorizer one-hot encodes only
# string values and passes numbers through as a single numeric feature, so
# cast the categorical columns to str before building the feature dicts.
# `astype` returns a new frame, so nothing is assigned into the filtered slice.
df = df.astype({col: str for col in categorical})
df[categorical].dtypes

In [None]:
# One dict per trip, keyed by feature name, as input for the DictVectorizer.
train_dicts = df.loc[:, categorical].to_dict(orient='records')

In [None]:
# Learn the feature vocabulary from the training dicts, then encode them.
dv = DictVectorizer()
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)
X_train.shape

In [None]:
# Regression target: the duration column as a plain NumPy array.
target = 'duration'
y_train = df[target].to_numpy()
y_train

In [None]:
# Baseline model: ordinary least squares on the vectorised features.
lr = LinearRegression()
lr.fit(X_train, y_train)


In [None]:
# Predict on the training matrix itself (no held-out data at this point).
y_pred = lr.predict(X_train)

In [None]:
# RMSE taken as the square root of the MSE. This avoids the `squared`
# keyword, which newer scikit-learn releases deprecate and then remove.
mean_squared_error(y_train, y_pred) ** 0.5

In [None]:
def read_dataframe(filename):
    """Load a trip CSV and prepare it for modelling.

    Steps:
      1. Parse ``started_at`` / ``ended_at`` as datetimes.
      2. Add a ``duration`` column holding the trip length in minutes.
      3. Keep only trips lasting between 1 and 120 minutes (inclusive).
      4. Fill missing values in the categorical columns with ``-1`` and cast
         those columns to ``str``.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the trip file.
            It must contain the columns ``started_at``, ``ended_at``,
            ``rideable_type``, ``start_station_id`` and ``end_station_id``.

    Returns:
        pandas.DataFrame: the filtered trips; the original row index is kept.
    """
    categorical = ['rideable_type', 'start_station_id', 'end_station_id']

    df = pd.read_csv(filename)

    df['started_at'] = pd.to_datetime(df['started_at'])
    df['ended_at'] = pd.to_datetime(df['ended_at'])

    # Vectorised conversion to minutes instead of a per-row Python lambda.
    df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

    # Take an explicit copy so the column assignments below write to an
    # independent frame rather than to a slice of the unfiltered data
    # (which triggers pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 120)].copy()

    df[categorical] = df[categorical].fillna(-1)
    df[categorical] = df[categorical].astype(str)

    return df
    

In [None]:
# April 2022 is the training month, May 2022 the validation month.
df_train, df_val = map(
    read_dataframe,
    (
        './data/202204-capitalbikeshare-tripdata.csv',
        './data/202205-capitalbikeshare-tripdata.csv',
    ),
)

In [None]:
# Row counts of the training and validation frames.
df_train.shape[0], df_val.shape[0]

In [None]:
categorical = ['rideable_type', 'start_station_id', 'end_station_id']
numerical = ['duration']

dv = DictVectorizer()

# Build the training matrix from df_train, the frame returned by
# read_dataframe, not from the exploratory `df`. The rows then line up with
# y_train and the features are prepared the same way as for df_val.
train_dicts = df_train[categorical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dicts)

# The validation data is only transformed, using the vocabulary learned above.
val_dicts = df_val[categorical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [None]:
# Regression targets for both splits as NumPy arrays.
target = 'duration'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [None]:
# Ridge with its default settings, fitted on the training split and scored
# on the validation split.
lr = Ridge()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# RMSE taken as the square root of the MSE. This avoids the `squared`
# keyword, which newer scikit-learn releases deprecate and then remove.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Persist the fitted vectoriser together with the model so that inference can
# reproduce the exact feature encoding. Create the output directory first so
# that a fresh checkout does not fail with FileNotFoundError.
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [None]:

# mlflow.set_tracking_uri('http://127.0.0.1:5000')
# mlflow.set_experiment('duration-prediction')

# with mlflow.start_run():

#         mlflow.set_tag('developer', 'hanna')

#         mlflow.log_param('train-data-path', './data/202204-capitalbikeshare-tripdata.csv')
#         mlflow.log_param('valid-data-path', './data/202205-capitalbikeshare-tripdata.csv')

#         alpha = 0.1
#         mlflow.log_param('alpha', alpha)

#         lr = Ridge(alpha)
#         lr.fit(X_train, y_train)
        
#         y_pred = lr.predict(X_val)
#         rmse = mean_squared_error(y_val, y_pred, squared = False)
#         mlflow.log_metric('rmse', rmse)

#         mlflow.log_artifact(local_path = 'models/lin_reg.bin', artifact_path='models_pickle')

In [None]:
# List the estimator's hyperparameters to see which ones can be tuned.
lr.get_params()

In [None]:
# Candidate values sampled by the hyperparameter search in train_model_search.
# NOTE(review): the feature matrices come from DictVectorizer; some
# solver / fit_intercept combinations may be rejected by Ridge for sparse
# input — confirm against the installed scikit-learn version.
solvers = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
intercepts = [True, False]

def train_model_search(X_train, y_train, X_val, y_val, max_evals=1):
    """Search Ridge hyperparameters with hyperopt, logging each trial to MLflow.

    Every trial fits a ``Ridge`` model on the training data, scores it on the
    validation data and records its parameters and RMSE in a nested MLflow run.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_val: Validation feature matrix.
        y_val: Validation target values.
        max_evals: Number of hyperopt trials to run (default 1, as before).

    Returns:
        dict: The best hyperparameters found, as actual values keyed by
        ``'fit_intercept'``, ``'solver'`` and ``'alpha'``, so the result can
        be passed directly as ``Ridge(**result)``.
    """
    def objective(params):
        with mlflow.start_run(nested=True):
            mlflow.set_tag('model', 'ridge')
            mlflow.log_params(params)

            model = Ridge(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            # Square root of the MSE; avoids the `squared` keyword, which
            # newer scikit-learn releases deprecate and then remove.
            rmse = mean_squared_error(y_val, y_pred) ** 0.5
            mlflow.log_metric("rmse", rmse)

        return {'loss': rmse, 'status': STATUS_OK}

    search_space = {
        'fit_intercept': hp.choice('fit_intercept', intercepts),
        'solver': hp.choice('solver', solvers),
        # Sample alpha as a float. Wrapping a value from [0, 1) in scope.int
        # truncated it to 0, so the regularisation strength was never varied.
        'alpha': hp.uniform('alpha', 0, 1),
    }

    best_indices = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=Trials()
    )
    # fmin reports hp.choice parameters as indices into their option lists;
    # space_eval maps them back to the actual values.
    return space_eval(search_space, best_indices)

def train_best_model(X_train, y_train, X_val, y_val, dv, best_result):
    """Fit a Ridge model with the given hyperparameters and log it to MLflow.

    Inside one MLflow run this logs the hyperparameters, the validation RMSE,
    the pickled preprocessor and the fitted model.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_val: Validation feature matrix (used for scoring only).
        y_val: Validation target values.
        dv: Fitted preprocessor (the DictVectorizer) to store next to the model.
        best_result: Dict of ``Ridge`` keyword arguments. The values must be
            actual parameter values, not hyperopt choice indices.
    """
    with mlflow.start_run(nested=True):
        mlflow.log_params(best_result)

        # Fit on the training data only; the validation data is used purely
        # for scoring.
        model = Ridge(**best_result)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        # Square root of the MSE; avoids the `squared` keyword, which newer
        # scikit-learn releases deprecate and then remove.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric('rmse', rmse)

        # Create the output directory so the dump works on a fresh checkout.
        os.makedirs("models", exist_ok=True)
        with open("models/preprocessor.b", "wb") as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        # Log the model inside the run so it is stored with the params and
        # metric recorded above.
        mlflow.sklearn.log_model(model, artifact_path="models")

if __name__ == "__main__":

    # Call the setter. Assigning to `mlflow.set_tracking_uri` would replace
    # the function with a string and leave the tracking URI unchanged.
    mlflow.set_tracking_uri('http://127.0.0.1:5000')
    mlflow.set_experiment('bike-duration-prediction')

    # X_train, X_val, y_train, y_val and dv come from the cells above.
    best_result = train_model_search(X_train, y_train, X_val, y_val)
    train_best_model(X_train, y_train, X_val, y_val, dv, best_result)