# Duration Prediction

## Purpose
Taxi Data Prediction

## Notebook
#### nyc_taxi_analysis.ipynb
- Data Scraping
- Data Formatting
- Feature Engineering
- Training

## Methodology
- Pandas
- Numpy
- scikit-learn

## Versioning
- Conda (Notebook Versioning Packages)

## Notable TODOs:
--

## Results:
RMSE: 5.766549051507733

## Suggested next steps:
--

# Setup

### System Information

In [25]:
%%bash
# Report the conda / JupyterLab / Python setup this notebook runs in.

# Variables
# Root of the base conda install, parsed from the CONDA_ROOT line of `conda info -a`
# (value after the first ':', whitespace-trimmed).
BASE_ENV_PATH=$(conda info -a |\
                grep -i "CONDA_ROOT" |\
                cut -d ":" -f 2 |\
                xargs echo |\
                sed 's/ *$//g')
JUPY_PATH="$BASE_ENV_PATH/bin/jupyter"
# The unquoted echo folds the version output onto a single line.
JUPYLAB_VER=$(echo $("$JUPY_PATH" lab --version))

## System Info
printf "## GLOBAL INFO"
# System Python Version
printf "\nConda Python Version: "
conda info | grep -i "python version" | cut -d ":" -f 2
# Conda Base
# Values are passed as printf arguments so a '%' or '\' inside them is not
# interpreted as part of the format string.
printf "Conda Base Path: %s" "$BASE_ENV_PATH"
# Jupyter Lab Version
printf "\nJupyter Lab Version: %s" "$JUPYLAB_VER"
# Jupyter Lab Extensions
printf "\nEnabled Extensions:"
echo ''
echo $("$JUPY_PATH" labextension list)

## Environment Info
printf "## ENVIRONMENT INFO"
# Python Env Version
printf "\nEnvironment Python Version: "
# No trailing `||` here: it would chain the next printf to this command, so the
# "Environment Python Path: " label would only print when `python --version` fails.
python --version
# Python Env Path
printf "Environment Python Path: "
which python
# Active Mamba Environment
printf "Active Environment: "
conda info -a | grep -i "active environment" | cut -d ":" -f 2

## GLOBAL INFO
Conda Python Version:  3.10.9.final.0
Conda Base Path: /home/leobit/miniconda3
Jupyter Lab Version: 3.5.3
Enabled Extensions:


Config option `kernel_spec_manager_class` not recognized by `ListLabExtensionsApp`.
[W 2023-06-18 17:16:05.320 LabApp] Config option `kernel_spec_manager_class` not recognized by `LabApp`.
JupyterLab v3.5.3
/home/leobit/miniconda3/share/jupyter/labextensions
        jupyterlab_templates v0.4.0 [32menabled[0m [31m X[0m (python, jupyterlab_templates)
        jupyterlab-system-monitor v0.8.0 [32menabled[0m [32mOK[0m (python, jupyterlab-system-monitor)
        jupyterlab-execute-time v2.3.1 [32menabled[0m [32mOK[0m (python, jupyterlab_execute_time)
        nbdime-jupyterlab v2.1.1 [32menabled[0m [32mOK[0m
        jupyterlab-topbar-extension v0.6.1 [32menabled[0m [32mOK[0m (python, jupyterlab-topbar)
        @krassowski/jupyterlab-lsp v3.10.2 [32menabled[0m [32mOK[0m (python, jupyterlab-lsp)
        @lckr/jupyterlab_variableinspector v3.0.9 [32menabled[0m [32mOK[0m (python, lckr_jupyterlab_variableinspector)
        @jupyterlab/git v0.41.0 [32menabled[0m [32mO


## ENVIRONMENT INFO
Environment Python Version: Python 3.10.11
/home/leobit/miniconda3/envs/cond_mlopszoomcamp_env/bin/python
Active Environment:  cond_mlopszoomcamp_env


### Path

In [26]:
import sys
from pathlib import Path

project_paths = ['data', 'notebook', 'scripts', 'model', 'src', 'pipeline']

# Project root: the parent of the directory the notebook is run from.
B_PATH = Path.cwd().parents[0]

# Create Project Folders and Set Paths
# Each folder is exposed as a global named after its first letter
# (D_PATH, N_PATH, S_PATH, M_PATH, P_PATH). When an earlier folder in the list
# already claimed that short name ('src' collides with 'scripts' on S_PATH),
# the full lowercase name is used instead (src_path).
# Collisions are tracked in a local set rather than by probing globals(), so
# re-running the cell produces the same names instead of spilling extra
# data_path / notebook_path / ... aliases into the namespace.
_short_names_taken = set()
for _path in project_paths:
    _folder = B_PATH / _path
    _folder.mkdir(exist_ok=True)
    _short_name = f"{_path[:1]}_path".upper()
    if _short_name not in _short_names_taken:
        _short_names_taken.add(_short_name)
        globals()[_short_name] = _folder
    else:
        globals()[f"{_path}_path"] = _folder

# Custom Paths
RAW_DT_PATH = D_PATH / 'raw'
IMG_DT_PATH = D_PATH / 'images'
ASSETS_PATH = B_PATH / 'assets'

# Script Path
# Make the helper modules in scripts/ importable; guarded so re-running the
# cell does not append the same entry to sys.path again.
_scripts_dir = S_PATH.as_posix()
if _scripts_dir not in sys.path:
    sys.path.append(_scripts_dir)

### Installation

In [None]:
%%bash
# Packages used by nyc_taxi_analysis.ipynb.
# The %%bash cell magic has to be the first line of the cell to be recognised,
# and every mamba call passes -y because the cell cannot answer a
# confirmation prompt.
mamba install -y pandas
mamba install -y fastparquet
pip install sweetviz
mamba install -y pyarrow
mamba install -y seaborn
mamba install -y scikit-learn
mamba install -y mlflow
mamba install -y xgboost
pip install hyperopt
mamba install -y -c conda-forge optuna

In [1]:
%%bash
# Non-interactive install (-y), matching the other mamba installs in this notebook.
mamba install -y -c conda-forge prefect


Looking for: ['prefect']


Pinned packages:
  - python 3.10.*


Transaction

  Prefix: /home/leobit/miniconda3/envs/cond_mlopszoomcamp_env

  Updating specs:

   - prefect
   - ca-certificates
   - certifi
   - openssl


  Package                  Version  Build            Channel                    Size
──────────────────────────────────────────────────────────────────────────────────────
  Install:
──────────────────────────────────────────────────────────────────────────────────────

  + aiofiles                23.1.0  pyhd8ed1ab_1     conda-forge/noarch         18kB
  + aiohttp                  3.8.4  py310h1fa729e_0  conda-forge/linux-64      445kB
  + aiosignal                1.3.1  pyhd8ed1ab_0     conda-forge/noarch         13kB
  + aiosqlite               0.19.0  pyhd8ed1ab_0     conda-forge/noarch         19kB
  + anyio                    3.7.0  pyhd8ed1ab_1     conda-forge/noarch         97kB
  + apprise                  1.4.0  pyhd8ed1ab_0     conda-forge/noarch          1

### Import

In [27]:
import re
import time
import random
import warnings
import mlflow
import pandas as pd
import pickle as pkl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from utils import save_obj, load_obj, run_api
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

### Configuration / API

In [28]:
# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Pandas
pd.set_option('max_colwidth', 999)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.5f' % x)


# Future Warning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# # # Torch Version
# print(f"Torch Version: {torch. __version__}")

# # # Torch Cuda Available
# torch.cuda.is_available()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# Scratch check of the model directory path. The surrounding braces make this a
# one-element set literal, which is why the result is displayed as {'...'}.
{M_PATH.as_posix()}

{'/home/leobit/Development/mlops-zoomcamp/model'}

In [65]:
# Scratch check: show the concrete pathlib class of M_PATH.
type(M_PATH)

pathlib.PosixPath

In [64]:
# Scratch check: parent of the model folder, i.e. the project root (B_PATH).
M_PATH.parent

PosixPath('/home/leobit/Development/mlops-zoomcamp')

In [56]:
# MLFlow API Run
# Tracking store: a SQLite database file named mlflow.db inside the model folder.
MLFLOW_SQL_PATH = "sqlite:///" + M_PATH.as_posix() + "/mlflow.db"
# Launch the MLflow UI against that store through the run_api helper from utils
# (presumably a background process that streams the server log into the cell
# output -- confirm against scripts/utils.py).
cmd = "mlflow ui --backend-store-uri " + MLFLOW_SQL_PATH
run_api(cmd)

[2023-06-18 18:06:08 -0300] [208564] [INFO] Starting gunicorn 20.1.0

[2023-06-18 18:06:08 -0300] [208564] [INFO] Listening at: http://127.0.0.1:5000 (208564)

[2023-06-18 18:06:08 -0300] [208564] [INFO] Using worker: sync

[2023-06-18 18:06:08 -0300] [208567] [INFO] Booting worker with pid: 208567

[2023-06-18 18:06:08 -0300] [208568] [INFO] Booting worker with pid: 208568

[2023-06-18 18:06:08 -0300] [208569] [INFO] Booting worker with pid: 208569

[2023-06-18 18:06:08 -0300] [208593] [INFO] Booting worker with pid: 208593



Process Process-4:
Traceback (most recent call last):
  File "/home/leobit/miniconda3/envs/cond_mlopszoomcamp_env/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/leobit/miniconda3/envs/cond_mlopszoomcamp_env/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/leobit/Development/mlops-zoomcamp/scripts/utils.py", line 69, in call
    for line in iter(p.stdout.readline, b''):
KeyboardInterrupt


In [None]:
# MLFLow Experiment Set
# Point the MLflow client at the same SQLite store the UI command is started
# with (MLFLOW_SQL_PATH), then select the experiment by name.
# The tracking URI is set first so the experiment is looked up in that store.
mlflow.set_tracking_uri(MLFLOW_SQL_PATH)
mlflow.set_experiment("nyc-taxi-experiment")

In [54]:
# Prefect API Run
# Start the local Prefect server through the run_api helper from utils; the
# command is a fixed string, so no f-string formatting is involved.
cmd = "prefect server start"
run_api(cmd)



 ___ ___ ___ ___ ___ ___ _____ 

| _ \ _ \ __| __| __/ __|_   _| 

|  _/   / _|| _|| _| (__  | |  

|_| |_|_\___|_| |___\___| |_|  



Configure Prefect to communicate with the server with:



    prefect config set PREFECT_API_URL=http://127.0.0.1:4200/api



View the API reference documentation at http://127.0.0.1:4200/docs



Check out the dashboard at http://127.0.0.1:4200









Process Process-3:
Traceback (most recent call last):
  File "/home/leobit/miniconda3/envs/cond_mlopszoomcamp_env/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/leobit/miniconda3/envs/cond_mlopszoomcamp_env/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/leobit/Development/mlops-zoomcamp/scripts/utils.py", line 69, in call
    for line in iter(p.stdout.readline, b''):
KeyboardInterrupt


In [55]:
# Prefect Set Api Url
# Point the Prefect client at the local server; the address is the one printed
# in the server's startup banner.
!prefect config set PREFECT_API_URL=http://127.0.0.1:4200/api

Set 'PREFECT_API_URL' to 'http://127.0.0.1:4200/api'.
[32mUpdated profile 'default'.[0m


# Topic 3.2

In [14]:
# Folder holding the orchestration scripts for topic 3.2.
topic_3_2_path = S_PATH.joinpath('orchestration', '3.2')

In [18]:
!python {(S_PATH / 'orchestration' / '3.2' / 'cat_facts.py').as_posix()}

19:20:02.190 | [36mINFO[0m    | prefect.engine - Created flow run[35m 'woodoo-chupacabra'[0m for flow[1;35m 'fetch'[0m
19:20:02.191 | [36mINFO[0m    | Flow run[35m 'woodoo-chupacabra'[0m - View at [94mhttp://127.0.0.1:4200/flow-runs/flow-run/731a5b0e-2bf6-4af6-b78f-846830f20043[0m
19:20:02.256 | [36mINFO[0m    | Flow run[35m 'woodoo-chupacabra'[0m - Created task run 'fetch_cat_fact-0' for task 'fetch_cat_fact'
19:20:02.256 | [36mINFO[0m    | Flow run[35m 'woodoo-chupacabra'[0m - Executing 'fetch_cat_fact-0' immediately...
19:20:05.541 | [36mINFO[0m    | Task run 'fetch_cat_fact-0' - On average, cats spend 2/3 of every day sleeping. That means a nine-year-old cat has been awake for only three years of its life.
19:20:05.556 | [36mINFO[0m    | Task run 'fetch_cat_fact-0' - Finished in state [32mCompleted[0m()
19:20:05.570 | [36mINFO[0m    | Flow run[35m 'woodoo-chupacabra'[0m - Finished in state [32mCompleted[0m('All states completed.')


In [19]:
!python {(S_PATH / 'orchestration' / '3.2' / 'cat_dog_facts.py').as_posix()}

21:04:01.688 | [36mINFO[0m    | prefect.engine - Created flow run[35m 'boisterous-alligator'[0m for flow[1;35m 'animal-facts'[0m
21:04:01.689 | [36mINFO[0m    | Flow run[35m 'boisterous-alligator'[0m - View at [94mhttp://127.0.0.1:4200/flow-runs/flow-run/32fc4c76-2537-4191-b744-5730b90e51ba[0m
21:04:01.784 | [36mINFO[0m    | Flow run[35m 'boisterous-alligator'[0m - Created subflow run[35m 'belligerent-beetle'[0m for flow[1;35m 'fetch-cat-fact'[0m
21:04:01.784 | [36mINFO[0m    | Flow run[35m 'belligerent-beetle'[0m - View at [94mhttp://127.0.0.1:4200/flow-runs/flow-run/638d3381-16fc-4793-a8d2-e943bacf1e0c[0m
21:04:02.630 | [36mINFO[0m    | Flow run[35m 'belligerent-beetle'[0m - Finished in state [32mCompleted[0m()
21:04:02.678 | [36mINFO[0m    | Flow run[35m 'boisterous-alligator'[0m - Created subflow run[35m 'cherry-skua'[0m for flow[1;35m 'fetch-dog-fact'[0m
21:04:02.679 | [36mINFO[0m    | Flow run[35m 'cherry-skua'[0m - View at [94mhttp://

# Topic 3.3

In [57]:
!python {(S_PATH / 'orchestration' / '3.3' / 'orchestrate_pre_prefect.py').as_posix()}

[0]	validation-rmse:19.48425
[1]	validation-rmse:17.95635
[2]	validation-rmse:16.59103
[3]	validation-rmse:15.37412
[4]	validation-rmse:14.29029
[5]	validation-rmse:13.32807
[6]	validation-rmse:12.47571
[7]	validation-rmse:11.72138
[8]	validation-rmse:11.05825
[9]	validation-rmse:10.47535
[10]	validation-rmse:9.96449
[11]	validation-rmse:9.51756
[12]	validation-rmse:9.12641
[13]	validation-rmse:8.78569
[14]	validation-rmse:8.49089
[15]	validation-rmse:8.23445
[16]	validation-rmse:8.01207
[17]	validation-rmse:7.81886
[18]	validation-rmse:7.65169
[19]	validation-rmse:7.50672
[20]	validation-rmse:7.38153
[21]	validation-rmse:7.27299
[22]	validation-rmse:7.17863
[23]	validation-rmse:7.09718
[24]	validation-rmse:7.02603
[25]	validation-rmse:6.96344
[26]	validation-rmse:6.90935
[27]	validation-rmse:6.86285
[28]	validation-rmse:6.82157
[29]	validation-rmse:6.78486
[30]	validation-rmse:6.75283
[31]	validation-rmse:6.72433
[32]	validation-rmse:6.69838
[33]	validation-rmse:6.67599
[34]	validatio

In [59]:
!python {(S_PATH / 'orchestration' / '3.3' / 'orchestrate.py').as_posix()}

18:18:04.355 | [36mINFO[0m    | prefect.engine - Created flow run[35m 'elastic-skylark'[0m for flow[1;35m 'main-flow'[0m
18:18:04.357 | [36mINFO[0m    | Flow run[35m 'elastic-skylark'[0m - View at [94mhttp://127.0.0.1:4200/flow-runs/flow-run/113217e6-dfeb-4c86-ab78-ad5beebc24b5[0m
2023/06/18 18:18:04 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.
18:18:04.446 | [36mINFO[0m    | Flow run[35m 'elastic-skylark'[0m - Created task run 'read_data-0' for task 'read_data'
18:18:04.446 | [36mINFO[0m    | Flow run[35m 'elastic-skylark'[0m - Executing 'read_data-0' immediately...
18:18:04.665 | [36mINFO[0m    | Task run 'read_data-0' - Finished in state [32mCompleted[0m()
18:18:04.678 | [36mINFO[0m    | Flow run[35m 'elastic-skylark'[0m - Created task run 'read_data-1' for task 'read_data'
18:18:04.678 | [36mINFO[0m    | Flow run[35m 'elastic-skylark'[0m - Executing 'read_data-1' immediately...
18:

# Topic 3.4

In [60]:
# NOTE(review): a bare `!` passes an empty command to the shell. The directory
# listing recorded under this cell looks like leftover `ls` output -- confirm,
# then restore the intended command or delete the cell.
!

df.xlsx      mlruns  module_1_2_nyc_taxi_analysis.ipynb  sqlite:
mlartifacts  models  module_3.ipynb


# Data Visualization

# Data Wrangling

# Feature Engineering

# Modeling

# Evaluation