In [1]:
import pandas as pd
import pickle
import mlflow

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

In [2]:
from sklearn.pipeline import make_pipeline

In [3]:
def read_dataframe(path):
    """Load a trip parquet file and prepare it for duration modelling.

    Steps performed:
      1. Read the parquet file at ``path``.
      2. Add a ``duration`` column: drop-off minus pick-up time
         (``lpep_dropoff_datetime - lpep_pickup_datetime``) in minutes.
      3. Keep only rows whose duration is strictly between 1 and 60 minutes.
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame (original index labels are preserved) with the
        extra ``duration`` column and string-typed location ID columns.
    """
    df = pd.read_parquet(path)

    # Trip length in minutes, derived from the two timestamp columns.
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning into a boolean-mask
    # slice is what triggers pandas' SettingWithCopyWarning.
    df = df[(df.duration > 1) & (df.duration < 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

def prepare_dictionaries(df):
    """Turn a trips DataFrame into a list of feature dicts.

    Each record holds two features:
      * ``PU_DO`` - the ``PULocationID`` and ``DOLocationID`` values joined
        with an underscore (both columns must already be strings).
      * ``trip_distance`` - copied from the column of the same name.

    The input frame is left untouched: the combined ``PU_DO`` column is built
    on a separate feature frame rather than being written back into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID``, ``DOLocationID`` and ``trip_distance``.

    Returns
    -------
    list of dict
        One ``{'PU_DO': ..., 'trip_distance': ...}`` dict per row of ``df``.
    """
    categorical = ['PU_DO']
    numerical = ['trip_distance']

    # df[numerical] yields a new frame and .assign returns another new one,
    # so the caller's DataFrame never gains the PU_DO column.
    features = df[numerical].assign(
        PU_DO=df['PULocationID'] + '_' + df['DOLocationID']
    )

    return features[categorical + numerical].to_dict(orient='records')



In [4]:
# Directory holding the monthly green-taxi parquet files. Kept in one place so
# the notebook can be pointed at a different data location with a single edit.
DATA_DIR = 'D:/mlops-zoomcamp/data'

# The 2021-01 file is used for training, the 2021-02 file for validation.
train_df = read_dataframe(f'{DATA_DIR}/green_tripdata_2021-01.parquet')
val_df = read_dataframe(f'{DATA_DIR}/green_tripdata_2021-02.parquet')
val_df.head()


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2021-02-01 00:34:03,2021-02-01 00:51:58,N,1.0,130,205,5.0,3.66,14.0,...,0.5,10.0,0.0,,0.3,25.3,1.0,1.0,0.0,17.916667
1,2,2021-02-01 00:04:00,2021-02-01 00:10:30,N,1.0,152,244,1.0,1.1,6.5,...,0.5,0.0,0.0,,0.3,7.8,2.0,1.0,0.0,6.5
2,2,2021-02-01 00:18:51,2021-02-01 00:34:06,N,1.0,152,48,1.0,4.93,16.5,...,0.5,0.0,0.0,,0.3,20.55,2.0,1.0,2.75,15.25
3,2,2021-02-01 00:53:27,2021-02-01 01:11:41,N,1.0,152,241,1.0,6.7,21.0,...,0.5,0.0,0.0,,0.3,22.3,2.0,1.0,0.0,18.233333
4,2,2021-02-01 00:57:46,2021-02-01 01:06:44,N,1.0,75,42,1.0,1.89,8.5,...,0.5,2.45,0.0,,0.3,12.25,1.0,1.0,0.0,8.966667


In [5]:
# Regression target for each split: the trip duration column as an array.
train_target = train_df['duration'].values
val_target = val_df['duration'].values

# Feature records (one dict per trip) for each split.
train_dicts = prepare_dictionaries(train_df)
val_dicts = prepare_dictionaries(val_df)


In [6]:
# Where runs are recorded, and the experiment they are grouped under.
TRACKING_URI = 'http://127.0.0.1:5000'
EXPERIMENT_NAME = 'random-forest-experiment'

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='random-forest-experiment', tags={}>

In [7]:
# Train a random forest on the training split, score it on the validation
# split, and record parameters, metric, model and vectorizer in one MLflow run.
with mlflow.start_run():

    params = dict(max_depth=20,n_estimators=100,min_samples_leaf=10,random_state=0)

    mlflow.log_params(params)

    dv = DictVectorizer()

    # n_jobs only controls parallelism, so it is kept out of the logged params.
    model = RandomForestRegressor(**params,n_jobs=-1)

    # Fit the vectorizer on training records only, then reuse that fitted
    # mapping to transform the validation records.
    X_train = dv.fit_transform(train_dicts)
    model.fit(X_train,train_target)

    X_val = dv.transform(val_dicts)
    y_pred = model.predict(X_val)

    # RMSE as the square root of the MSE. The `squared=False` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so this form is the
    # one that works on every version.
    rmse = mean_squared_error(val_target,y_pred) ** 0.5

    print(params,rmse)

    mlflow.log_metric('rmse',rmse)


    mlflow.sklearn.log_model(model,"models")

    # The fitted vectorizer is needed to turn new records into model inputs,
    # so it is pickled and attached to the run as a separate artifact.
    with open('DictVectorizer.bin','wb') as f:
        pickle.dump(dv,f)

    mlflow.log_artifact('DictVectorizer.bin')

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 6.712849693388744


