In [44]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
import os, mlflow, requests
from mlflow.tracking.client import MlflowClient
from dotenv import load_dotenv

## I) Load data

In [3]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = './service-account-key.json'

In [12]:
def save_training_data_in_gcs(bucket_name):
    ''' Loads data to train the model and stores it as a parquet file in a GCS bucket '''
    download_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet'
    training_data_df = pd.read_parquet(download_url)
    training_data_df.to_parquet(f'gs://{bucket_name}/training_data/yellow_tripdata_2021-01.parquet')

In [24]:
bucket_name = 'deployment-project'
save_training_data_in_gcs(bucket_name)

In [4]:
def load_training_data(bucket_name):
    df = pd.read_parquet(f'gs://{bucket_name}/training_data/yellow_tripdata_2021-01.parquet')
    return df

In [5]:
bucket_name = 'deployment-project'
df = load_training_data(bucket_name)

In [6]:
df.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,


In [7]:
df = df.iloc[:10000]

In [8]:
df.shape

(10000, 19)

## II) Prepare data for training

In [9]:
# Compute trip duration in minutes using lpep_dropoff_datetime and lpep_pickup_datetime
def calculate_trip_duration_in_minutes(df):
    df["trip_duration_minutes"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
    df = df[(df["trip_duration_minutes"] >= 1) & (df["trip_duration_minutes"] <= 60)]
    return df

In [10]:
# Preprocess: Put location into one feature called trip_route
def preprocess(df):
    df = calculate_trip_duration_in_minutes(df)
    categorical_features = ["PULocationID", "DOLocationID"]
    df[categorical_features] = df[categorical_features].astype(str)
    df['trip_route'] = df["PULocationID"] + "_" + df["DOLocationID"]
    df = df[['trip_route', 'trip_distance', 'trip_duration_minutes']]
    y = df['trip_duration_minutes']
    X = df.drop(columns=['trip_duration_minutes'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
    return X_train, X_test, y_train, y_test

In [11]:
X_train, X_test, y_train, y_test = preprocess(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical_features] = df[categorical_features].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trip_route'] = df["PULocationID"] + "_" + df["DOLocationID"]


In [33]:
dv = DictVectorizer()

dv.fit(X_train.to_dict(orient="records"))
X_train = dv.transform(X_train.to_dict(orient="records"))
X_test = dv.transform(X_test.to_dict(orient="records"))

In [12]:
X_train.head(2)

Unnamed: 0,trip_route,trip_distance
4320,263_161,2.51
5066,100_230,0.68


In [15]:
y_train

4320     7.666667
5066     1.766667
609      5.483333
2287     6.700000
9236    12.950000
          ...    
5890     4.250000
5342     2.916667
5543     6.916667
879     28.050000
7458    11.833333
Name: trip_duration_minutes, Length: 7824, dtype: float64

##### I think I don't need a one-hot encoding of the input features for a random forest regressor

## III) Training

In [13]:
# Set up the connection to MLflow
MLFLOW_TRACKING_URI = 'http://34.107.124.157:5000/'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment('yellow-taxi-duration-random-forest-regression')

<Experiment: artifact_location='gs://mlflow-artifacts-karim/mle-api-ml-deployment/1', creation_time=1695901954870, experiment_id='1', last_update_time=1695901954870, lifecycle_stage='active', name='yellow-taxi-duration-random-forest-regression', tags={}>

In [14]:
# Set features and target variable for mlflow
features = ["PULocationID", "DOLocationID", "trip_distance"]
target = 'duration'

In [44]:
with mlflow.start_run():
    tags = {
        'model': 'random forest regression',
        'developer': 'karim',
        'dataset': 'yellow-taxi',
        'year': 2021,
        'month': 1,
        'features': features,
        'target': target
    }
    mlflow.set_tags(tags)

    model = RandomForestRegressor()
    # model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mlflow.log_metric('rmse', rmse)
    mlflow.sklearn.log_model(model, 'model')



## IV) Register model in MLFlow

In [None]:
# random-forest-duration

In [45]:
MLFLOW_TRACKING_URI = 'http://34.107.124.157:5000/'
RUN_ID = 'a7cb467a3b654b1795ef3e20a555d13d' # !!! STILL THE LINEAR REGRESSION!

In [46]:
# Register the model in MLFlow
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
model_uri = f"runs:/{RUN_ID}/model"
mlflow.register_model(model_uri=model_uri, name='random-forest-duration')

Registered model 'random-forest-duration' already exists. Creating a new version of this model...
2023/09/28 17:36:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest-duration, version 2
Created version '2' of model 'random-forest-duration'.


<ModelVersion: aliases=[], creation_timestamp=1695915447840, current_stage='None', description='', last_updated_timestamp=1695915447840, name='random-forest-duration', run_id='a7cb467a3b654b1795ef3e20a555d13d', run_link='', source='gs://mlflow-artifacts-karim/mle-api-ml-deployment/1/a7cb467a3b654b1795ef3e20a555d13d/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

In [51]:
# Investigate the model in MLFlow using the client
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
model_name = 'random-forest-duration'
latest_versions = client.get_latest_versions(name=model_name)
for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 2, stage: Production


In [50]:
# Push the model to production stage
model_version = 2
new_stage = 'Production'
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1695915447840, current_stage='Production', description='', last_updated_timestamp=1695915458779, name='random-forest-duration', run_id='a7cb467a3b654b1795ef3e20a555d13d', run_link='', source='gs://mlflow-artifacts-karim/mle-api-ml-deployment/1/a7cb467a3b654b1795ef3e20a555d13d/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

## V) Hyperparameter tuning

In [None]:
# instantiate model 
logreg = LogisticRegression()

# Define the hyperparameters and their possible values for the grid search
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}

# Perform the grid search with 5-fold cross-validation
grid_search = GridSearchCV(logreg, param_grid, cv=5)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

In [69]:
with mlflow.start_run():
    tags = {
        'model': 'random forest regression - GridSearchCV',
        'developer': 'karim',
        'dataset': 'yellow-taxi',
        'year': 2021,
        'month': 1,
        'features': features,
        'target': target
    }
    mlflow.set_tags(tags)

    model = RandomForestRegressor()
    param_grid = {
        'n_estimators': [20, 50, 100],
        'criterion': ['squared_error', 'absolute_error']
    }
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    
    y_score = best_model.score(X_test, y_test)
    y_pred = best_model.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mlflow.log_metric('rmse', rmse)
    mlflow.sklearn.log_model(best_model, 'model')



## VI) Test Request to API

In [36]:
X_test.iloc[:5]

Unnamed: 0,trip_route,trip_distance
1410,142_100,1.8
7875,263_263,0.0
4152,48_163,0.48
8107,163_107,2.29
98,264_264,7.11


In [37]:
y_test[:5]

1410     7.200000
7875     2.000000
4152     1.666667
8107     8.283333
98      23.466667
Name: trip_duration_minutes, dtype: float64

In [38]:
url = 'http://localhost:9696/predict'
data = {
    'ride_id': 'rideidTEST',
    'PULocationID': 142,
    'DOLocationID': 100,
    'trip_distance': 1.8
}
response = requests.post(url, json=data)

In [39]:
response.json()

{'ride_id': 'rideidTEST',
 'PULocationID': 142,
 'DOLocationID': 100,
 'trip_distance': 1.8,
 'predicted_duration': 5.79066666666667}