In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

In [None]:
# locations of the three monthly trip files (2016-01 through 2016-03)

df_jan_path = "/Users/aniket_ml/Documents/data/yellow_tripdata_2016-01.csv"
df_feb_path = "/Users/aniket_ml/Documents/data/yellow_tripdata_2016-02.csv"
df_mar_path = "/Users/aniket_ml/Documents/data/yellow_tripdata_2016-03.csv"

# every month is read with exactly the same options: only the columns used
# downstream are loaded and the pickup timestamp is parsed as a datetime

read_options = dict(
    assume_missing=True,
    usecols=['trip_distance', 'tpep_pickup_datetime', 'pickup_longitude',
             'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'fare_amount'],
    parse_dates=["tpep_pickup_datetime"],
)

# load the dataframes (one dask dataframe per month)

df_jan, df_feb, df_mar = (
    dd.read_csv(csv_path, **read_options)
    for csv_path in (df_jan_path, df_feb_path, df_mar_path)
)


In [None]:
# preview the January frame returned by dd.read_csv
df_jan

In [None]:
# stack the three monthly frames row-wise into a single frame

monthly_frames = [df_jan, df_feb, df_mar]
df_final = dd.concat(monthly_frames, axis=0)

In [None]:
# preview the concatenated frame
df_final

In [None]:
# set the values of coordinates
# These bounds feed the filtering cells below, where every range is applied
# inclusively via between(..., inclusive="both").

# latitude window, applied to both the pickup and the dropoff point
min_latitude = 40.60
max_latitude = 40.85

# longitude window, applied to both the pickup and the dropoff point
min_longitude = -74.05
max_longitude = -73.70

# accepted range for fare_amount
# NOTE(review): units and the origin of these cut-offs are not shown here — confirm
min_fare_amount_val = 0.50
max_fare_amount_val = 81.0

# accepted range for trip_distance
# NOTE(review): units and the origin of these cut-offs are not shown here — confirm
min_trip_distance_val = 0.25
max_trip_distance_val = 24.43

In [None]:
# keep only trips whose pickup AND dropoff both lie inside the
# latitude/longitude window (bounds inclusive)

pickup_in_window = (
    df_final["pickup_latitude"].between(min_latitude, max_latitude, inclusive="both")
    & df_final["pickup_longitude"].between(min_longitude, max_longitude, inclusive="both")
)
dropoff_in_window = (
    df_final["dropoff_latitude"].between(min_latitude, max_latitude, inclusive="both")
    & df_final["dropoff_longitude"].between(min_longitude, max_longitude, inclusive="both")
)

df_final = df_final.loc[pickup_in_window & dropoff_in_window, :]

In [None]:
# keep only trips whose fare and distance fall inside the accepted ranges
# (bounds inclusive)

fare_in_range = df_final["fare_amount"].between(
    min_fare_amount_val, max_fare_amount_val, inclusive="both"
)
distance_in_range = df_final["trip_distance"].between(
    min_trip_distance_val, max_trip_distance_val, inclusive="both"
)

df_final = df_final.loc[fare_in_range & distance_in_range]


In [None]:
# these columns were only needed for the range filters above; what remains is
# the pickup timestamp and the pickup coordinates

filter_only_columns = ['trip_distance', 'dropoff_longitude', 'dropoff_latitude', 'fare_amount']
df_final = df_final.drop(columns=filter_only_columns)


In [None]:
# preview the frame after filtering and dropping columns
df_final

In [None]:
# run the dask pipeline built above and rebind df_final to the computed result.
# Because the name is reused, re-running this cell would call .compute() on the
# already-computed object rather than on the dask frame.
df_final = df_final.compute()

In [None]:
# save the dataframe
# Written as CSV without the index column; the same relative path is read back
# in chunks further down when fitting the scaler and the clustering model.

save_path = "../data/interim/processing_data.csv"

df_final.to_csv(save_path, index=False)

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# location of the interim CSV that is streamed in chunks below
data_path = "../data/interim/processing_data.csv"

In [None]:
# chunked reader (100000 rows per chunk) restricted to the two pickup
# coordinate columns. With chunksize set, read_csv returns an iterator that can
# be consumed only once.
df_reader = pd.read_csv(data_path, chunksize=100000, usecols=["pickup_latitude","pickup_longitude"])

In [None]:
# train the standard scaler
#
# The scaler is fitted incrementally, one CSV chunk at a time. The chunked
# reader is opened inside this cell instead of consuming a one-shot iterator
# created elsewhere: an already-exhausted iterator would make the loop a no-op
# and silently leave a fresh, unfitted scaler behind when the cell is re-run.
# The `with` block also closes the underlying file once the loop finishes.

scaler = StandardScaler()

with pd.read_csv(
    data_path,
    chunksize=100000,
    usecols=["pickup_latitude", "pickup_longitude"],
) as chunk_reader:
    for chunk in chunk_reader:
        # fit the scaler on this chunk (updates the running statistics)
        scaler.partial_fit(chunk)

In [None]:
# show the fitted scaler object
scaler

In [None]:
# open a fresh chunked reader over the same two columns; a reader that has
# already been iterated to the end yields no further chunks
df_reader = pd.read_csv(data_path, chunksize=100000, usecols=["pickup_latitude","pickup_longitude"])


In [None]:
# train the model
#
# MiniBatchKMeans is trained incrementally on standardized pickup coordinates,
# one CSV chunk at a time. The chunked reader is opened inside this cell
# instead of consuming a one-shot iterator created elsewhere: with an
# already-exhausted iterator, re-running the cell would create a new model and
# never train it. The `with` block closes the file when the loop is done.
#
# NOTE(review): training goes through partial_fit only — check the
# scikit-learn documentation on whether n_init has any effect on this path.

mini_batch = MiniBatchKMeans(n_clusters=30, n_init=10, random_state=42)

with pd.read_csv(
    data_path,
    chunksize=100000,
    usecols=["pickup_latitude", "pickup_longitude"],
) as chunk_reader:
    for chunk in chunk_reader:
        # scale the chunk with the previously fitted scaler
        scaled_chunk = scaler.transform(chunk)
        # update the cluster centres with this chunk
        mini_batch.partial_fit(scaled_chunk)

In [None]:
# show the trained clustering model object
mini_batch

In [None]:
# centroids of the model
# (these live in the scaled feature space, since the model was trained on the
# output of scaler.transform)

mini_batch.cluster_centers_

In [None]:
# undo the scaling to express the centroids in the original coordinate values
scaler.inverse_transform(mini_batch.cluster_centers_)

In [None]:
# perform predictions and assign clusters
#
# The pickup coordinate columns are selected by name rather than with a
# positional slice (columns[1:]). A positional slice silently includes any
# column added to df_final later (e.g. 'region') when this cell is re-run, and
# it only works while the timestamp happens to be the first column. Iterating
# over df_final.columns preserves the frame's own column order.

coordinate_names = {"pickup_latitude", "pickup_longitude"}
location_subset = df_final[[col for col in df_final.columns if col in coordinate_names]]

location_subset


In [None]:
# scale the input data
# (uses the already-fitted scaler; nothing is refitted here)

scaled_location_subset = scaler.transform(location_subset)

scaled_location_subset

In [None]:
# get the cluster predictions
# (one predicted cluster per row of the scaled coordinates)

cluster_predictions = mini_batch.predict(scaled_location_subset)

# shape check of the prediction array
cluster_predictions.shape

In [None]:
# save the cluster predictions in data
# NOTE: this adds a 'region' column to df_final in place, so df_final has one
# more column from this cell onwards.

df_final['region'] = cluster_predictions

df_final

In [None]:
# drop the latitude and longitude columns from data
# (the result is a new frame; df_final itself keeps its coordinate columns)

time_series_data = df_final.drop(columns=["pickup_latitude","pickup_longitude"])

time_series_data

In [None]:
# save the time series data
# (CSV without the index column; note that save_path is rebound here)

save_path = "../data/interim/time_series.csv"

time_series_data.to_csv(save_path, index=False)

In [None]:
# column dtypes of the time series frame
time_series_data.dtypes

In [None]:
# use the pickup timestamp as the index so the rows can be resampled over time

time_series_data = time_series_data.set_index('tpep_pickup_datetime')

time_series_data

In [None]:
# group the time series data based on regions
# (region_grp is a groupby object; nothing is aggregated yet)

region_grp = time_series_data.groupby("region")

region_grp

In [None]:
# check for missing values
# (number of NaN entries per column)

time_series_data.isna().sum()

In [None]:

# resample the time series in 15 minute intervals
# Within each region group, rows are bucketed into 15-minute bins on the
# datetime index and the entries of the 'region' column are counted per bin,
# i.e. the number of rows that fall into each (region, interval) pair.

resampled_data = (
    region_grp['region']
    .resample("15min")
    .count()
)

resampled_data

In [None]:
# label the per-interval counts; later cells access them as the
# 'total_pickups' column
resampled_data.name = "total_pickups"

In [None]:
# move the first index level out of the index and into a regular column.
# The name is rebound, so re-running this cell applies reset_index to the
# result of the previous run rather than to the original series.
resampled_data = resampled_data.reset_index(level=0)

resampled_data

In [None]:
# zeros in the data
# (number of rows whose total_pickups is exactly 0)

(resampled_data['total_pickups'] == 0).sum()

In [None]:
# value substituted for every total_pickups entry that is exactly 0
# NOTE(review): presumably this keeps zero counts out of the percentage-error
# denominators used below — confirm that 10 is the intended replacement
epsilon_val = 10

resampled_data['total_pickups'] = resampled_data['total_pickups'].replace(0, epsilon_val)

In [None]:
# re-count the zeros to confirm the replacement took effect
(resampled_data['total_pickups'] == 0).sum()

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
# candidate rolling-window lengths: 3 up to and including 10
window_values = [size for size in range(3, 11)]
window_values

In [None]:
def calculate_best_window_value(windows):
    """Print the MAPE of a per-region rolling-mean smoother for each window.

    ``resampled_data`` holds the series of every region stacked in one frame,
    so the rolling mean is computed separately inside each region. A plain
    ``rolling`` over the whole column would average the tail of one region
    with the head of the next one.

    Rows where the window is not yet full (the first ``window - 1`` rows of
    each region) have no rolling mean and are excluded from the comparison.

    Parameters
    ----------
    windows : iterable of int
        Rolling window lengths (in rows) to evaluate.

    Returns
    -------
    None
        One line per window is printed; nothing is returned.

    Notes
    -----
    Reads the notebook-level ``resampled_data`` frame (columns ``region`` and
    ``total_pickups``). The rolling mean at a row includes that row's own
    value, so the reported error describes how closely the smoothed series
    tracks the data, not an out-of-sample forecast error.
    """
    actual = resampled_data['total_pickups'].to_numpy()
    pickups_by_region = resampled_data.groupby('region')['total_pickups']

    for window in windows:
        # transform keeps the original row order, so `smoothed` lines up
        # position-by-position with `actual`
        smoothed = pickups_by_region.transform(
            lambda pickups: pickups.rolling(window=window).mean()
        ).to_numpy()
        # NaN marks rows whose window is still incomplete
        window_full = ~np.isnan(smoothed)
        error = mean_absolute_percentage_error(actual[window_full], smoothed[window_full])
        print(f"For window value {window}, the MAPE is {error:.2f}")

In [None]:
# print the MAPE for every candidate window length
calculate_best_window_value(window_values)

In [None]:
# exploratory look at exponentially weighted smoothing with alpha=0.9.
# NOTE(review): this runs over the whole total_pickups column without grouping
# by region, so the smoothing carries over from one region's rows to the next.
resampled_data['total_pickups'].ewm(alpha=0.9).mean()

In [None]:
# candidate smoothing factors 0.2, 0.3, ..., 0.9
# np.linspace with an explicit count is used instead of np.arange with a
# fractional step: with arange the number of elements depends on
# floating-point rounding of (stop - start) / step, whereas linspace fixes both
# the end point and the length.
smoothing_values = np.linspace(0.2, 0.9, num=8)
smoothing_values

In [None]:
def calculate_best_smoothing_value(values):
    """Print the MAPE of a per-region exponentially weighted mean for each alpha.

    ``resampled_data`` holds the series of every region stacked in one frame,
    so the exponentially weighted mean is computed separately inside each
    region. A plain ``ewm`` over the whole column would let the smoothing
    state of one region carry over into the first rows of the next one.

    Parameters
    ----------
    values : iterable of float
        Smoothing factors passed to ``ewm(alpha=...)``.

    Returns
    -------
    None
        One line per smoothing factor is printed; nothing is returned.

    Notes
    -----
    Reads the notebook-level ``resampled_data`` frame (columns ``region`` and
    ``total_pickups``). The smoothed value at a row includes that row's own
    observation, so the error shrinks towards zero as alpha approaches 1; it
    describes how closely the smoothed series tracks the data, not an
    out-of-sample forecast error.
    """
    actual = resampled_data['total_pickups'].to_numpy()
    pickups_by_region = resampled_data.groupby('region')['total_pickups']

    for value in values:
        # transform keeps the original row order, so `smoothed` lines up
        # position-by-position with `actual`
        smoothed = pickups_by_region.transform(
            lambda pickups: pickups.ewm(alpha=value).mean()
        ).to_numpy()
        error = mean_absolute_percentage_error(actual, smoothed)
        print(f"For smoothing value {value:.1f}, the MAPE is {error:.2f}")

In [None]:
# print the MAPE for every candidate smoothing factor
calculate_best_smoothing_value(smoothing_values)

In [None]:
# dataset with pickup smoothing applied
#
# The exponentially weighted mean (alpha=0.4) is computed separately for each
# region. resampled_data stacks the series of all regions in one frame, so
# smoothing the raw column directly would let the last intervals of one region
# influence the first intervals of the next. transform keeps the original row
# order; .to_numpy() assigns the result positionally.

resampled_data["avg_pickups"] = (
    resampled_data
    .groupby("region")["total_pickups"]
    .transform(lambda pickups: pickups.ewm(alpha=0.4).mean())
    .round()
    .to_numpy()
)

resampled_data

In [None]:
# save the resampled data
# (index=True keeps the frame's index as a column in the CSV)

resampled_data_save_path = "../data/interim/final_data.csv"

resampled_data.to_csv(resampled_data_save_path, index=True)

In [None]:
# shape of the data
# (rows, columns) of the final resampled frame

resampled_data.shape