In [1]:
import pandas as pd
from zipfile import ZipFile
from geopy.distance import distance
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
import mlflow
import xgboost as xgb
from prefect import flow, task
from prefect_aws import S3Bucket
from prefect.artifacts import create_markdown_artifact
from datetime import date
from prefect_email import EmailServerCredentials, email_send_message
from prefect import runtime

In [7]:
def read_data(
    filename: str,
    data_path: str = "/home/ubuntu/notebooks/07-project/data",
) -> pd.DataFrame:
    """Load one zipped trip CSV and derive the columns used for modelling.

    Reads the member ``{filename}.csv`` from ``{data_path}/{filename}.zip``.

    Processing steps, in order:
      1. Parse ``started_at`` / ``ended_at`` as datetimes; unparseable values
         become NaT.
      2. Drop every row that has a missing value in any column.
      3. Add ``duration`` in minutes (``ended_at - started_at``) and keep only
         rides lasting at least one minute.
      4. Add ``trip_distance`` in kilometres, computed with geopy's
         ``distance`` between the start and end coordinates.
      5. Cast the station ids, ``rideable_type`` and ``member_casual`` to str.

    Parameters
    ----------
    filename : str
        Stem shared by the archive and the CSV inside it (no extension).
    data_path : str, optional
        Directory that contains the archive. Defaults to the location the
        notebook originally hard-coded, so existing calls keep working.

    Returns
    -------
    pd.DataFrame
        The cleaned frame with the extra ``duration`` and ``trip_distance``
        columns. May be empty if no ride survives the filters.

    Raises
    ------
    FileNotFoundError
        If the archive does not exist.
    KeyError
        If the archive has no ``{filename}.csv`` member or the CSV lacks one
        of the required columns.
    """
    # Context managers make sure both the archive and the member stream are
    # closed as soon as the CSV has been parsed.
    with ZipFile(f'{data_path}/{filename}.zip') as zip_file:
        with zip_file.open(f'{filename}.csv') as csv_file:
            df = pd.read_csv(csv_file)

    df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce')
    df['ended_at'] = pd.to_datetime(df['ended_at'], errors='coerce')
    df = df.dropna()

    # Vectorised timedelta -> minutes conversion (no per-row Python call).
    df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

    # Rides shorter than one minute are removed. The explicit copy makes the
    # column assignments below act on an independent frame instead of a
    # filtered view of the previous one.
    df = df[df['duration'] >= 1].copy()

    # Build the distances as an explicit float Series so that an empty frame
    # still gets a well-formed (empty) column.
    coordinates = zip(df['start_lat'], df['start_lng'], df['end_lat'], df['end_lng'])
    df['trip_distance'] = pd.Series(
        [
            distance((start_lat, start_lng), (end_lat, end_lng)).km
            for start_lat, start_lng, end_lat, end_lng in coordinates
        ],
        index=df.index,
        dtype="float64",
    )

    categorical = ['start_station_id', 'end_station_id', 'rideable_type', 'member_casual']
    df[categorical] = df[categorical].astype(str)

    return df

def add_features(
    df_train: pd.DataFrame, df_val: pd.DataFrame
) -> "tuple[list[dict], list[dict], np.ndarray, np.ndarray]":
    """Turn the training and validation frames into model inputs and targets.

    For each frame a ``SRC_DST`` column is built by joining
    ``start_station_id`` and ``end_station_id`` with an underscore. The
    feature columns (``SRC_DST``, ``rideable_type``, ``member_casual``,
    ``trip_distance``) are then exported as a list of per-row dicts, and the
    ``duration`` column is returned as the target array.

    Note: ``SRC_DST`` is written onto the frames that are passed in, so both
    inputs gain that column as a side effect.

    Parameters
    ----------
    df_train, df_val : pd.DataFrame
        Frames containing ``start_station_id``, ``end_station_id``,
        ``rideable_type``, ``member_casual``, ``trip_distance`` and
        ``duration``. The station id columns must hold strings for the
        concatenation to work.

    Returns
    -------
    X_train, X_val : list of dict
        One feature dict per row.
    y_train, y_val : np.ndarray
        The ``duration`` values of the respective frame.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    # The return annotation is a string so it is never evaluated at definition
    # time and stays valid on interpreters without built-in generic support.
    categorical = ['SRC_DST', 'rideable_type', 'member_casual']
    numerical = ['trip_distance']

    def _featurize(df: pd.DataFrame):
        # Same transformation for both splits: add the station-pair key,
        # export the feature columns as records, and pull out the target.
        df['SRC_DST'] = df['start_station_id'] + '_' + df['end_station_id']
        records = df[categorical + numerical].to_dict(orient="records")
        return records, df["duration"].values

    X_train, y_train = _featurize(df_train)
    X_val, y_val = _featurize(df_val)

    return X_train, X_val, y_train, y_val


In [8]:
# Archive stems (no extension): the first is used for fitting, the second for validation.
train_path, val_path = "202301-divvy-tripdata", "202302-divvy-tripdata"

# Both archives go through the same loading and cleaning routine.
df_train = read_data(filename=train_path)
df_val = read_data(filename=val_path)

# Feature dicts and duration targets for each split.
X_train, X_val, y_train, y_val = add_features(df_train=df_train, df_val=df_val)

In [12]:
# Inspect the validation targets returned by add_features (ride durations in minutes).
y_val

array([13.93333333,  5.33333333, 24.06666667, ..., 15.33333333,
       16.81666667, 12.        ])

In [13]:
# XGBoost settings for the regressor (presumably the result of an earlier
# hyperparameter search -- confirm the provenance before changing them).
best_params = dict(
    learning_rate=0.09002710203069089,
    max_depth=4,
    min_child_weight=7.370049044160041,
    objective="reg:squarederror",
    reg_alpha=0.007060723631897894,
    reg_lambda=0.11278036580451371,
    seed=42,
)

# DictVectorizer turns the per-row feature dicts into a matrix that the
# XGBoost regressor can consume; both steps are fitted together.
pipeline = make_pipeline(
    DictVectorizer(),
    xgb.XGBRegressor(**best_params),
)

pipeline.fit(X_train, y_train)

In [14]:
# Predict durations for the validation feature dicts with the fitted pipeline.
y_pred = pipeline.predict(X_val)

In [15]:
# Root-mean-squared error on the validation split, in the target's units.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form gives the same value on every version. float() keeps the result a
# plain Python number so it displays the same regardless of the NumPy version.
rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))

In [16]:
# Display the predictions made for the validation rides.
y_pred

array([20.539114,  7.629657, 23.928131, ..., 21.752247, 21.752247,
       16.637129], dtype=float32)

In [17]:
# Display the validation RMSE computed from y_val and y_pred.
rmse

29.687699815566035