In [5]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# Read the combined features-and-target table, stored as a Parquet file
# under TRANSFORMED_DATA_DIR, into a single DataFrame.
features_and_target = pd.read_parquet(TRANSFORMED_DATA_DIR / 'yellow_tripdata_features_target.parquet')

In [50]:
# Separate the label column from the model inputs; the source frame is left untouched.
target = features_and_target['target']
features = features_and_target.drop(columns=['target'])


In [49]:
# Keep an independent copy of only the rows whose PULocationID equals 43.
features_ = features.loc[features['PULocationID'].eq(43)].copy()

In [52]:
# Select the targets that share index labels with the filtered features and
# wrap them in a one-column DataFrame named 'target'.
target_ = target.loc[features_.index].to_frame(name='target')

In [31]:
# Display the target column for a quick visual check; the rendered output
# below is abbreviated (first and last rows only).
target

Unnamed: 0,target
0,3.0
1,0.0
2,6.0
3,0.0
4,3.0
...,...
6937651,0.0
6937652,0.0
6937653,0.0
6937654,0.0


In [57]:
from typing import Optional
from datetime import timedelta

import plotly.express as px

def plot_one_sample(
        features: pd.DataFrame,
        target: pd.Series,
        sample_idx: int,
        predictions: Optional[pd.Series] = None,
        display_title: Optional[bool] = True
        ) -> 'plotly.graph_objects.Figure':
    """Plot one sample: its time-series feature values followed by its target.

    The values of every column whose name starts with 'feature_' (in column
    order) are drawn as an hourly line ending at the sample's 'pickup_hour',
    where the target value is placed and highlighted.

    Args:
        features: One row per sample. Must contain 'pickup_hour' and the
            'feature_*' columns; 'PULocationID' is read when the title is
            displayed.
        target: Value to predict for each row of `features`, aligned by
            position. A single-column DataFrame is accepted as well.
        sample_idx: Positional (`iloc`) index of the sample to plot.
        predictions: Optional predicted values, aligned by position with
            `features` (Series or single-column DataFrame). When given, the
            prediction is drawn as an 'x' marker at 'pickup_hour'.
        display_title: Whether to set a title showing the pickup hour and
            the location id.

    Returns:
        The plotly figure. It is returned, not shown, so a notebook cell that
        ends with this call renders it.
    """
    # With a one-column DataFrame, `.iloc[sample_idx]` yields a row (Series)
    # instead of a scalar, which would then be embedded in the plotted values.
    # Unwrap such frames to a Series first; anything else passes through as is.
    if isinstance(target, pd.DataFrame):
        target = target.squeeze(axis='columns')
    if isinstance(predictions, pd.DataFrame):
        predictions = predictions.squeeze(axis='columns')

    # PULocationID is the location_id
    feature_sample = features.iloc[sample_idx].copy()
    target_sample = target.iloc[sample_idx]

    ts_columns = [column for column in features.columns if column.startswith('feature_')]
    ts_values = [feature_sample[column] for column in ts_columns] + [target_sample]

    # One hourly timestamp per feature value, plus pickup_hour itself for the
    # target, so ts_dates and ts_values have the same length.
    ts_dates = pd.date_range(
        start=feature_sample['pickup_hour'] - timedelta(hours=len(ts_columns)),
        end=feature_sample['pickup_hour'],
        freq='h'
    )

    # line plot with past values
    title = f'Pick up hour = {feature_sample["pickup_hour"]}, location_id={feature_sample["PULocationID"]}'  if display_title else None
    fig = px.line(x=ts_dates,
                  y=ts_values,
                  template='plotly_dark',
                  markers=True,
                  title=title)

    if target_sample is not None:
        # green dot for the value we want to predict
        fig.add_scatter(x=[ts_dates[-1]],
                        y=[target_sample],
                        line_color='green',
                        mode='markers',
                        marker_size=10,
                        name='actual value')

    if predictions is not None:
        # red 'x' for the predicted value
        fig.add_scatter(x=[ts_dates[-1]],
                        y=[predictions.iloc[sample_idx]],
                        line_color='red',
                        mode='markers',
                        marker_size=15,
                        marker_symbol='x',
                        name='Predicted value')

    return fig

In [64]:
# Smoke-test plot_one_sample on the row at position 4 of the filtered data;
# the returned figure is the cell's output.
plot_one_sample(features=features_, target=target_, sample_idx=4)

In [65]:
def plot_ts(
        ts_data: pd.DataFrame,
        locations: Optional[list] = None,
    ):
    """Show a line chart of 'rides' over 'pickup_hour', one line per 'PULocationID'.

    Args:
        ts_data: Frame with the columns 'pickup_hour', 'rides' and
            'PULocationID'.
        locations: PULocationID values to keep. When None, every row of
            `ts_data` is plotted.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    if locations is None:
        selected_rows = ts_data.copy()
    else:
        keep_mask = ts_data['PULocationID'].isin(locations)
        selected_rows = ts_data[keep_mask].copy()

    line_options = dict(
        x='pickup_hour',
        y='rides',
        color='PULocationID',
        template='none',
    )
    px.line(selected_rows, **line_options).show()