In [5]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [6]:
# Run from the project root so the relative `data/` and `models/` paths used
# below resolve. The location can be overridden with the ZOOMCAMP_DIR
# environment variable; the original hard-coded path remains the default.
os.chdir(os.environ.get('ZOOMCAMP_DIR', '/Users/lauosgom/Desktop/zoomcamp/'))

In [7]:
import mlflow

# Select the active MLflow experiment by its id.
# NOTE(review): the id is specific to one tracking store — confirm it exists
# wherever this notebook is run.
mlflow.set_experiment(experiment_id='736462624464899721')



<Experiment: artifact_location='mlflow-artifacts:/736462624464899721', creation_time=1685469914785, experiment_id='736462624464899721', last_update_time=1685469914785, lifecycle_stage='active', name='my-first-experiment', tags={}>

In [8]:
# Load the 2022-01 yellow-taxi trips for training and the 2022-02 trips
# for validation.
df_train, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/yellow_tripdata_2022-01.parquet',
        'data/yellow_tripdata_2022-02.parquet',
    )
)

In [9]:
# Inspect the columns available in the raw training data.
df_train.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [10]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp.
# The vectorised `.dt.total_seconds()` accessor replaces a per-row Python
# lambda, which is needlessly slow on a frame with millions of rows.
df_train['duration'] = (
    df_train.tpep_dropoff_datetime - df_train.tpep_pickup_datetime
).dt.total_seconds() / 60

In [11]:
# Number of rows (records) before any outlier filtering.
# Note: shape[0] is the row count, not the column count; `a` is the
# denominator of the "fraction of records kept" computed further down.
a = df_train.shape[0]

In [12]:
# (rows, columns) of the training frame, including the derived `duration` column.
df_train.shape

(2463931, 20)

In [13]:
# Standard deviation of the trip duration (minutes), outliers still included.
df_train.duration.std()

46.44530513776802

In [14]:
# Summary statistics with upper-tail percentiles, used to choose the outlier cut-offs.
df_train['duration'].describe(percentiles=[0.95, 0.98, 0.99])

count    2.463931e+06
mean     1.421220e+01
std      4.644531e+01
min     -3.442400e+03
50%      1.018333e+01
95%      3.193333e+01
98%      4.215000e+01
99%      5.085000e+01
max      8.513183e+03
Name: duration, dtype: float64

In [15]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` makes the result an independent frame, so the column assignments
# performed on it later do not write to a slice of the unfiltered data
# (which would raise SettingWithCopyWarning).
df_train = df_train[df_train['duration'].between(1, 60)].copy()

In [16]:
# Row count after dropping the outliers; the numerator of the fraction of
# records that were kept.
b = len(df_train)

In [17]:
# Fraction of the original records that survived the duration filter.
b / a

0.9827547930522406

In [18]:
# Feature selection: pick-up / drop-off zone ids are used as categorical
# features, trip distance as the only numerical feature.
categorical = [
    'PULocationID',
    'DOLocationID',
]
numerical = [
    'trip_distance',
]

In [19]:
# Cast the location ids to strings so that DictVectorizer one-hot encodes them
# (producing features such as 'DOLocationID=1') instead of treating them as numbers.
df_train[categorical] = df_train[categorical].astype(str)

In [20]:
# One dict per trip ({column: value}), the input format DictVectorizer expects.
train_dicts = df_train[[*categorical, *numerical]].to_dict(orient='records')

In [21]:
# Fit the vectorizer on the training records and build the training feature matrix.
# `dv` must be kept: it is pickled together with the model further down.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [22]:
# Feature names learned by the vectorizer (one entry per one-hot encoded location id).
# NOTE(review): this displays the whole list; consider slicing it to keep the output short.
dv.feature_names_

['DOLocationID=1',
 'DOLocationID=10',
 'DOLocationID=100',
 'DOLocationID=101',
 'DOLocationID=102',
 'DOLocationID=105',
 'DOLocationID=106',
 'DOLocationID=107',
 'DOLocationID=108',
 'DOLocationID=109',
 'DOLocationID=11',
 'DOLocationID=111',
 'DOLocationID=112',
 'DOLocationID=113',
 'DOLocationID=114',
 'DOLocationID=115',
 'DOLocationID=116',
 'DOLocationID=117',
 'DOLocationID=118',
 'DOLocationID=119',
 'DOLocationID=12',
 'DOLocationID=120',
 'DOLocationID=121',
 'DOLocationID=122',
 'DOLocationID=123',
 'DOLocationID=124',
 'DOLocationID=125',
 'DOLocationID=126',
 'DOLocationID=127',
 'DOLocationID=128',
 'DOLocationID=129',
 'DOLocationID=13',
 'DOLocationID=130',
 'DOLocationID=131',
 'DOLocationID=132',
 'DOLocationID=133',
 'DOLocationID=134',
 'DOLocationID=135',
 'DOLocationID=136',
 'DOLocationID=137',
 'DOLocationID=138',
 'DOLocationID=139',
 'DOLocationID=14',
 'DOLocationID=140',
 'DOLocationID=141',
 'DOLocationID=142',
 'DOLocationID=143',
 'DOLocationID=144',

In [23]:
# Regression target: the trip duration in minutes, as a NumPy array.
target = 'duration'
y_train = df_train[target].to_numpy()

In [None]:
# Baseline model: ordinary least squares on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the training data itself (no validation set is used here).
y_pred = lr.predict(X_train)

# Training RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and has been
# removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_train, y_pred))

In [None]:
# Persist the fitted vectorizer together with the model so that both can be
# reloaded as a pair for inference.
# Create the target directory first: open(..., 'wb') fails if it is missing.
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin','wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [24]:
# Train a Lasso model inside an MLflow run, logging its provenance,
# hyper-parameter and error metric.
with mlflow.start_run():

    mlflow.set_tag('developer', 'lospina')

    # Record which data files this run refers to.
    mlflow.log_param('train-data-path', 'data/yellow_tripdata_2022-01.parquet')
    mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2022-02.parquet')

    # Regularisation strength of the Lasso model.
    alpha = 0.01
    mlflow.log_param('alpha', alpha)

    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_train)

    # NOTE: this RMSE is computed on the training data, not on the validation
    # file logged above. The square root is taken explicitly because the
    # `squared=False` argument of mean_squared_error is deprecated and has
    # been removed from recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    mlflow.log_metric('rmse', rmse)


In [None]:
# Overlay the distribution of predicted and actual durations.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation is its replacement.
sns.histplot(y_pred, label='prediction', kde=True, stat='density')
sns.histplot(y_train, label='actual', kde=True, stat='density')

plt.xlabel('duration (minutes)')
plt.legend();

In [None]:
# Mean squared error (not its square root) on the training data, for
# whichever model produced the current `y_pred`.
mean_squared_error(y_train, y_pred)

In [None]:
# TODO: validate the model on the February data (df_val)

In [None]:
#ml flow with hyperparameter tuning

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        booster = xgb.train(
            params = params,
            dtrain = train,
            num_boost_round = 1000,
            evals = [(valid, 'validation')],
            early_stopping_rounds = 50
        )
        y_pred = booster.predict(valid)
        