# Exploratory Data Analysis

In [None]:
# Load the nb_black IPython extension (presumably auto-formats cells with Black — confirm).
%load_ext nb_black

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.integrate
import seaborn as sns

## Data Cleaning

In [None]:
# Read the raw pricing records from the CSV file into a DataFrame.
pricing_data_df = pd.read_csv(filepath_or_buffer="data/test.csv")

During upfront pricing, the following factors are available to us:

1. Type of vehicle - premium, XL, go, etc.
2. Customer profile
    - Fraud score
    - Lifetime value
    - Number of previous cancellations by driver within journey
3. Geography
    - Distance
    - Starting location
    - Ending destination
    - Tolls
4. Traffic
    - Wait time due to incoming traffic
5. Surge
    - Time of day, i.e. rush hour
    - High demand/low supply
    - Bad weather

In [None]:
# Parse the calc_created column into datetime values.
pricing_data_df["calc_created"] = pricing_data_df["calc_created"].pipe(pd.to_datetime)

* Removing all UIDs and tokens, as we can't feed them into a model and they are generated uniquely for each session.
* The ticket ID for resolution isn't useful, as we don't have any ticket information.

In [None]:
# Remove the device identifier, device token and ticket id columns.
pricing_data_df = pricing_data_df.drop(
    columns=["driver_device_uid_new", "device_token", "ticket_id_new"]
)

Can we remove all the states if all the rides are `finished`?

In [None]:
# Frequency of each distinct b_state value.
pricing_data_df.loc[:, "b_state"].value_counts()

In [None]:
# Frequency of each distinct order_state value.
pricing_data_df.loc[:, "order_state"].value_counts()

In [None]:
# Frequency of each distinct order_try_state value.
pricing_data_df.loc[:, "order_try_state"].value_counts()

In [None]:
# Remove the three state columns inspected in the cells above.
pricing_data_df = pricing_data_df.drop(
    columns=["b_state", "order_state", "order_try_state"]
)

All the orders are finished, hence this information is redundant.

We can remove `order_try_id_new` since we already have `order_id_new` available. Furthermore, `dest_change_number` lets us know how many times the destination was changed.

In [None]:
# Remove the order_try_id_new column.
pricing_data_df = pricing_data_df.drop(columns=["order_try_id_new"])

In [None]:
# Discard rows that are exact duplicates of an earlier row.
pricing_data_df = pricing_data_df.drop_duplicates()

In [None]:
# Renumber the rows from 0 and discard the old index.
pricing_data_df = pricing_data_df.reset_index(drop=True)

In [None]:
# Print the column names, non-null counts and dtypes of the frame after the steps above.
pricing_data_df.info()

# Upfront Pricing Exploration

Keep only the orders that have an upfront price. Rides with a distance of 0 are excluded later, in the Distance section.

In [None]:
# Keep only the orders that have an upfront price.
# The explicit .copy() makes this an independent frame: later cells drop a
# column from it and assign new columns to it, which would otherwise operate
# on a filtered slice of pricing_data_df and raise SettingWithCopyWarning.
upfront_pricing_data_df = pricing_data_df.loc[
    (pricing_data_df["upfront_price"].notnull()),
    :,
].copy()

In [None]:
# Frequency of each prediction_price_type among the upfront-priced orders.
upfront_pricing_data_df.loc[:, "prediction_price_type"].value_counts()

Since all upfront prices have prediction price type as upfront, we can drop `prediction_price_type`.

In [None]:
# Remove the prediction_price_type column from the upfront-priced orders.
upfront_pricing_data_df = upfront_pricing_data_df.drop(
    columns=["prediction_price_type"]
)

## Problem Scope

Does a deviation actually exist?

In [None]:
# Signed difference between upfront and metered price, expressed as a
# percentage of the upfront price.
upfront_pricing_data_df["upfront_price_deviation_perc"] = (
    upfront_pricing_data_df["upfront_price"]
    .sub(upfront_pricing_data_df["metered_price"])
    .div(upfront_pricing_data_df["upfront_price"])
    .mul(100)
)
# Magnitude of that deviation, ignoring its sign.
upfront_pricing_data_df["abs_upfront_price_deviation_perc"] = upfront_pricing_data_df[
    "upfront_price_deviation_perc"
].abs()

In [None]:
# Summary statistics of the absolute price deviation, with extra upper percentiles.
upfront_pricing_data_df.loc[:, "abs_upfront_price_deviation_perc"].describe(
    percentiles=[0.25, 0.5, 0.75, 0.85, 0.9, 0.95, 0.99]
)

Roughly 50% of the orders deviate by less than 20% from the upfront price.

In [None]:
# Kernel density estimate of the signed upfront-price deviation, with a vertical
# marker at the median estimated from the density curve.
fig, ax = plt.subplots(figsize=(16, 6), dpi=120)
p = sns.kdeplot(data=upfront_pricing_data_df["upfront_price_deviation_perc"], ax=ax)
x, y = p.get_lines()[0].get_data()
# Integrate the density curve to approximate the CDF. cumulative_trapezoid is
# the supported name; the cumtrapz alias is deprecated and removed in SciPy 1.14.
cdf = scipy.integrate.cumulative_trapezoid(y, x, initial=0)
# The grid point whose CDF value is closest to 0.5 approximates the median.
nearest_05 = np.abs(cdf - 0.5).argmin()
x_median = x[nearest_05]
y_median = y[nearest_05]
ax.vlines(x_median, 0, y_median, colors="black")
ax.set(
    title="Upfront price deviation (KDE) with estimated median",
    xlabel="upfront_price_deviation_perc",
    ylabel="Density",
)
ax.grid()
plt.show()

In [None]:
# Kernel density estimate of the absolute upfront-price deviation, with a
# vertical marker at the median estimated from the density curve.
fig, ax = plt.subplots(figsize=(16, 6), dpi=120)
p = sns.kdeplot(data=upfront_pricing_data_df["abs_upfront_price_deviation_perc"], ax=ax)
x, y = p.get_lines()[0].get_data()
# Integrate the density curve to approximate the CDF. cumulative_trapezoid is
# the supported name; the cumtrapz alias is deprecated and removed in SciPy 1.14.
cdf = scipy.integrate.cumulative_trapezoid(y, x, initial=0)
# The grid point whose CDF value is closest to 0.5 approximates the median.
nearest_05 = np.abs(cdf - 0.5).argmin()
x_median = x[nearest_05]
y_median = y[nearest_05]
ax.vlines(x_median, 0, y_median, colors="black")
ax.set(
    title="Absolute upfront price deviation (KDE) with estimated median",
    xlabel="abs_upfront_price_deviation_perc",
    ylabel="Density",
)
ax.grid()
plt.show()

The distribution of the price deviation is right-skewed with a long tail. Our focus will be on identifying the source of 50% of the error.

## Problem Impact

How many customers does this deviation impact?

In [None]:
# (rows, columns) of pricing_data_df after the cleaning cells above.
pricing_data_df.shape

Assuming our population consists of 4270 customers.

In [None]:
# Fraction of all orders that have an upfront price.
len(upfront_pricing_data_df) / len(pricing_data_df)

In [None]:
# Number of orders that have an upfront price.
len(upfront_pricing_data_df)

Around 70% of the customers have suffered from some form of a deviation between upfront and metered pricing on the app.

In [None]:
# Summary statistics of the signed price deviation, with extra percentiles.
upfront_pricing_data_df.loc[:, "upfront_price_deviation_perc"].describe(
    percentiles=[0.25, 0.35, 0.5, 0.55, 0.75, 0.85, 0.9, 0.95, 0.99]
)

In [None]:
# Fraction of upfront-priced orders whose signed deviation is below 0.
len(
    upfront_pricing_data_df.loc[
        upfront_pricing_data_df["upfront_price_deviation_perc"].lt(0)
    ]
) / len(upfront_pricing_data_df)

Around 60% of the customers see a price higher than the one that is shown upfront.

In [None]:
# Fraction of upfront-priced orders whose signed deviation is below -20%.
len(
    upfront_pricing_data_df.loc[
        upfront_pricing_data_df["upfront_price_deviation_perc"].lt(-20)
    ]
) / len(upfront_pricing_data_df)

Around 35% of the customers get charged more than 20% above the upfront price at the end of the journey.

We're going to assume that anyone who created an `overpaid_ride_ticket` and didn't pay more for a ride, did it by accident. 

In [None]:
# Relative frequency of overpaid_ride_ticket values among orders whose signed
# deviation is below -20%.
upfront_pricing_data_df.loc[
    upfront_pricing_data_df["upfront_price_deviation_perc"].lt(-20),
    "overpaid_ride_ticket",
].value_counts(normalize=True)

4% of customers who were charged more than 20% above the upfront price (i.e. with a deviation below -20%) complained about an overpaid ride.

### Outliers

There seem to be outliers with the negative deviation with the lowest value going down to -1000%.

In [None]:
# (largest, smallest) signed price deviation.
(
    upfront_pricing_data_df.loc[:, "upfront_price_deviation_perc"].max(),
    upfront_pricing_data_df.loc[:, "upfront_price_deviation_perc"].min(),
)

In [None]:
# First and third quartile of the signed price deviation.
# Series.quantile skips missing values, whereas np.percentile returns NaN as
# soon as one deviation is NaN, which would turn both fences into NaN and make
# the outlier filter silently select nothing.
upfront_price_dev_q1, upfront_price_dev_q3 = upfront_pricing_data_df[
    "upfront_price_deviation_perc"
].quantile([0.25, 0.75])
upfront_price_dev_iqr = upfront_price_dev_q3 - upfront_price_dev_q1
# Outlier fences: 1.5 interquartile ranges beyond each quartile.
upfront_price_dev_ul = upfront_price_dev_q3 + 1.5 * upfront_price_dev_iqr
upfront_price_dev_ll = upfront_price_dev_q1 - 1.5 * upfront_price_dev_iqr

In [None]:
# Orders whose signed price deviation lies outside the upper/lower fences.
upfront_pricing_deviation_outliers_df = upfront_pricing_data_df.loc[
    upfront_pricing_data_df["upfront_price_deviation_perc"].gt(upfront_price_dev_ul)
    | upfront_pricing_data_df["upfront_price_deviation_perc"].lt(upfront_price_dev_ll)
]

In [None]:
# Summary statistics of the signed price deviation among the outliers only.
upfront_pricing_deviation_outliers_df.loc[:, "upfront_price_deviation_perc"].describe(
    percentiles=[0.25, 0.35, 0.5, 0.55, 0.75, 0.85, 0.9, 0.95, 0.99]
)

In [None]:
# Show five outlier orders, transposed so each order is a column.
# A fixed random_state keeps the displayed rows identical across re-runs
# (an unseeded sample shows different rows on every Restart & Run All).
upfront_pricing_deviation_outliers_df.sample(5, random_state=42).T

* These cases should be flagged. 
* There's a deviation of -1100% to 100% in the pricing.

### Distance

In [None]:
# Signed difference between predicted and recorded distance, expressed as a
# percentage of the predicted distance.
upfront_pricing_data_df["predicted_distance_deviation_perc"] = (
    upfront_pricing_data_df["predicted_distance"]
    .sub(upfront_pricing_data_df["distance"])
    .div(upfront_pricing_data_df["predicted_distance"])
    .mul(100)
)
# Magnitude of that deviation, ignoring its sign.
upfront_pricing_data_df[
    "abs_predicted_distance_deviation_perc"
] = upfront_pricing_data_df["predicted_distance_deviation_perc"].abs()

In [None]:
# Summary statistics of the recorded distance.
upfront_pricing_data_df.loc[:, "distance"].describe(
    percentiles=[0.15, 0.25, 0.35, 0.5, 0.55, 0.75, 0.85, 0.9, 0.95, 0.99]
)

In [None]:
# Summary statistics of the predicted distance.
upfront_pricing_data_df.loc[:, "predicted_distance"].describe(
    percentiles=[0.15, 0.25, 0.35, 0.5, 0.55, 0.75, 0.85, 0.9, 0.95, 0.99]
)

There seem to be people who've travelled no distance, whereas the minimum we're predicting is 21.

In [None]:
# Distance-deviation statistics for the rides whose recorded distance is 0.
upfront_pricing_data_df.loc[
    upfront_pricing_data_df["distance"].eq(0), "predicted_distance_deviation_perc"
].describe()

In [None]:
# Price-deviation statistics for the rides whose recorded distance is 0.
upfront_pricing_data_df.loc[
    upfront_pricing_data_df["distance"].eq(0), "upfront_price_deviation_perc"
].describe()

In [None]:
# Percentage of upfront-priced rides with a recorded distance of 0.
# The count is derived from the data rather than typed in by hand, so the
# figure stays correct if the input file or the cleaning steps change.
zero_distance_ride_count = int((upfront_pricing_data_df["distance"] == 0).sum())
zero_distance_ride_count * 100.0 / upfront_pricing_data_df.shape[0]

* 1% of the deviations or 35 cases have issues where the rides didn't start. 
* Hence this isn't a distance prediction issue but a ride incomplete issue, i.e., the ride was completed without starting

Few reasons why this could happen:
* GPS was malfunctioning and the actual distance didn't get recorded.
* The driver took the ride off of the app.

In [None]:
# Summary statistics of the recorded duration.
upfront_pricing_data_df.loc[:, "duration"].describe(
    percentiles=[0.15, 0.25, 0.35, 0.5, 0.55, 0.75, 0.85, 0.9, 0.95, 0.99]
)

There are some rides with 0 duration.

In [None]:
# Distance and duration of the rides whose recorded duration is 0.
upfront_pricing_data_df.loc[
    upfront_pricing_data_df["duration"].eq(0), ["distance", "duration"]
]

These seem to be a subset of 0 distance. These could possibly be drivers who took the ride "off the app".

In [None]:
# Upfront-priced rides with a recorded distance greater than 0.
upfront_pricing_data_ignored_0_dist_df = upfront_pricing_data_df.loc[
    upfront_pricing_data_df["distance"].gt(0)
]

In [None]:
# Distance-deviation statistics once the zero-distance rides are excluded.
upfront_pricing_data_ignored_0_dist_df.loc[
    :, "predicted_distance_deviation_perc"
].describe(percentiles=[0.15, 0.25, 0.35, 0.5, 0.55, 0.75, 0.85, 0.9, 0.95, 0.99])

In [None]:
# Non-zero-distance rides whose distance deviation is negative, i.e. the
# predicted distance is smaller than the recorded one.
predicted_dist_less_than_actual_df = upfront_pricing_data_ignored_0_dist_df.loc[
    upfront_pricing_data_ignored_0_dist_df["predicted_distance_deviation_perc"].lt(0)
]

### Duration

In [None]:
# Signed difference between predicted and recorded duration, expressed as a
# percentage of the predicted duration.
upfront_pricing_data_df["predicted_duration_deviation_perc"] = (
    upfront_pricing_data_df["predicted_duration"]
    .sub(upfront_pricing_data_df["duration"])
    .div(upfront_pricing_data_df["predicted_duration"])
    .mul(100)
)
# Magnitude of that deviation, ignoring its sign.
upfront_pricing_data_df[
    "abs_predicted_duration_deviation_perc"
] = upfront_pricing_data_df["predicted_duration_deviation_perc"].abs()

In [None]:
# Summary statistics of the signed duration deviation.
upfront_pricing_data_df.loc[:, "predicted_duration_deviation_perc"].describe(
    percentiles=[0.15, 0.25, 0.35, 0.5, 0.55, 0.75, 0.85, 0.9, 0.95, 0.99]
)

In [None]:
 = upfront_pricing_data_df[
    (upfront_pricing_data_df["predicted_distance_deviation_perc"] < 0)
    & (upfront_pricing_data_df["duration"] > 0)
]

In [None]:
# Relative frequency of overpaid_ride_ticket values within that subset.
predicted_dura_less_than_actual_df.loc[:, "overpaid_ride_ticket"].value_counts(
    normalize=True
)

In [None]:
# Price-deviation statistics within that subset.
predicted_dura_less_than_actual_df.loc[:, "upfront_price_deviation_perc"].describe(
    percentiles=[0.15, 0.25, 0.35, 0.5, 0.55, 0.75, 0.85, 0.9, 0.95, 0.99]
)

### How do the price deviations compare?

In [None]:
# Bin edges for the deviation percentages: steps of 20 between -100 and 100,
# with two wider catch-all bins on each side.
bins = [-10000, -1000, *range(-100, 101, 20), 1000, 10000]

In [None]:
# Re-select the rides with a recorded distance greater than 0, so the frame
# also carries the duration-deviation columns added since the earlier filter.
# The .copy() makes it an independent frame: the next cell adds columns to it,
# which on a filtered slice raises SettingWithCopyWarning.
upfront_pricing_data_ignored_0_dist_df = upfront_pricing_data_df[
    upfront_pricing_data_df["distance"] > 0
].copy()

In [None]:
# Bucket each deviation percentage into the shared `bins` edges and store the
# result in a "bins_<column>" column.
# assign() returns a new frame, so the columns are not written onto a filtered
# slice (no SettingWithCopyWarning), and the three formerly copy-pasted
# statements collapse into one comprehension over the column names.
upfront_pricing_data_ignored_0_dist_df = upfront_pricing_data_ignored_0_dist_df.assign(
    **{
        f"bins_{deviation_col}": pd.cut(
            upfront_pricing_data_ignored_0_dist_df[deviation_col], bins=bins
        )
        for deviation_col in (
            "predicted_distance_deviation_perc",
            "predicted_duration_deviation_perc",
            "upfront_price_deviation_perc",
        )
    }
)

In [None]:
# Number of orders in every (price-deviation bin, distance-deviation bin) pair.
# observed=False is stated explicitly: it keeps the current behaviour of
# listing every bin combination (empty ones with a count of 0) and avoids the
# FutureWarning pandas >= 2.1 raises when grouping categoricals without it.
df = (
    upfront_pricing_data_ignored_0_dist_df.groupby(
        ["bins_upfront_price_deviation_perc", "bins_predicted_distance_deviation_perc"],
        observed=False,
    )["order_id_new"]
    .count()
    .reset_index()
)

In [None]:
# For each price-deviation bin, the row with the highest order count, i.e. the
# most common distance-deviation bin within that price bin.
# observed=False is stated explicitly to keep the current grouping behaviour
# and avoid the categorical-groupby FutureWarning in pandas >= 2.1.
df.loc[
    df.groupby("bins_upfront_price_deviation_perc", observed=False)[
        "order_id_new"
    ].idxmax()
]

In [None]:
# Pairwise correlation between the distance, duration and price deviations.
upfront_pricing_data_ignored_0_dist_df.loc[
    :,
    [
        "predicted_distance_deviation_perc",
        "predicted_duration_deviation_perc",
        "upfront_price_deviation_perc",
    ],
].corr()

In [None]:
def _bin_population_share(frame, bin_col, bins_name, share_name):
    """Return the share of rows of ``frame`` falling into each bin of ``bin_col``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains the binned column.
    bin_col : str
        Name of the binned column to count.
    bins_name : str
        Name of the output column that holds the bins.
    share_name : str
        Name of the output column that holds each bin's share of the rows.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``bins_name`` and ``share_name``, in value_counts order.
    """
    return (
        frame[bin_col]
        .value_counts(normalize=True)
        # Name the index and the values explicitly instead of renaming the
        # columns reset_index() happens to produce: pandas 2.0 changed those
        # defaults ("index" became the column name, the values "proportion"),
        # which made the old rename label the bins column as the share column.
        .rename_axis(bins_name)
        .reset_index(name=share_name)
    )


# Share of non-zero-distance rides per distance-deviation bin.
dist_bin_df = _bin_population_share(
    upfront_pricing_data_ignored_0_dist_df,
    "bins_predicted_distance_deviation_perc",
    "bins_distance_deviation_perc",
    "distance_deviation_perc_pop",
)
# Share of non-zero-distance rides per duration-deviation bin.
dura_bin_df = _bin_population_share(
    upfront_pricing_data_ignored_0_dist_df,
    "bins_predicted_duration_deviation_perc",
    "bins_duration_deviation_perc",
    "duration_deviation_perc_pop",
)
# Share of non-zero-distance rides per price-deviation bin.
price_bin_df = _bin_population_share(
    upfront_pricing_data_ignored_0_dist_df,
    "bins_upfront_price_deviation_perc",
    "bins_price_deviation_perc",
    "price_deviation_perc_pop",
)