In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Record the directory the kernel started in the first time this cell runs and
# always change directory relative to that record. A bare os.chdir("../../")
# climbs two more levels every time the cell is re-run, which breaks the
# relative data paths used by the cells below.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "../../"))

# Project-local helper; imported after the directory change, as in the original cell.
from src import display_df

In [2]:
# Load the cleaned, merged dataset. The path is relative, so it depends on the
# working directory set in the first cell.
# Alternative input kept for reference:
# df = pd.read_csv("data/features/features_data.csv")
df = pd.read_csv("data/final/clean_merged_df.csv")

In [3]:
display_df(df)

| order_id   | trip_duration   | trip_distance   | is_weekend   | is_holiday   | hour_of_day   | driver_id   | driver_action   |
|:-----------|:----------------|:----------------|:-------------|:-------------|:--------------|:------------|:----------------|
| 392001     | 217             | 30060.4         | 0            | 0            | 9             | 243828      | accepted        |
| 392001     | 217             | 30060.4         | 0            | 0            | 9             | 243588      | rejected        |
| 392001     | 217             | 30060.4         | 0            | 0            | 9             | 243830      | rejected        |
| 392001     | 217             | 30060.4         | 0            | 0            | 9             | 243539      | rejected        |
| 392001     | 217             | 30060.4         | 0            | 0            | 9             | 171653      | rejected        |


#### Drop irrelevant columns

In [37]:
# Columns left out of the modelling frame: the two identifier columns and
# `is_holiday`.
# NOTE(review): the notebook does not record why `is_holiday` is dropped — confirm.
drop_col = ['driver_id', 'order_id', 'is_holiday']
# An earlier, longer drop list (trip start/end dates and times, trip origin /
# destination and their coordinates, driver lat/lon, `id`) was commented out here.
# Apart from `order_id` and `driver_id`, none of those columns appear in the
# 8-column frame loaded above.

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1557726 entries, 0 to 1557725
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   order_id       1557726 non-null  int64  
 1   trip_duration  1557726 non-null  float64
 2   trip_distance  1557726 non-null  float64
 3   is_weekend     1557726 non-null  int64  
 4   is_holiday     1557726 non-null  int64  
 5   hour_of_day    1557726 non-null  int64  
 6   driver_id      1557726 non-null  float64
 7   driver_action  1557726 non-null  object 
dtypes: float64(3), int64(4), object(1)
memory usage: 95.1+ MB


In [39]:
# Build the modelling frame without the columns listed in `drop_col`.
# `drop` returns a new frame, so `df` itself is left as loaded.
data = df.drop(columns=drop_col)
display_df(data)

| trip_duration   | trip_distance   | is_weekend   | hour_of_day   | driver_action   |
|:----------------|:----------------|:-------------|:--------------|:----------------|
| 217             | 30060.4         | 0            | 9             | accepted        |
| 217             | 30060.4         | 0            | 9             | rejected        |
| 217             | 30060.4         | 0            | 9             | rejected        |
| 217             | 30060.4         | 0            | 9             | rejected        |
| 217             | 30060.4         | 0            | 9             | rejected        |


In [40]:
# Remove duplicates
# NOTE(review): with the identifier columns gone, all driver responses to the
# same trip collapse into one row per distinct (features, driver_action)
# combination — the info() outputs show 1,557,726 rows before and 50,022 after.
# This discards how often each action occurred for a given trip and appears to
# leave the two classes almost perfectly balanced; confirm this is intended
# before using the frame for modelling.
data = data.drop_duplicates()

In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50022 entries, 0 to 1557664
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   trip_duration  50022 non-null  float64
 1   trip_distance  50022 non-null  float64
 2   is_weekend     50022 non-null  int64  
 3   hour_of_day    50022 non-null  int64  
 4   driver_action  50022 non-null  object 
dtypes: float64(2), int64(2), object(1)
memory usage: 2.3+ MB


#### Make the data numeric. We can use label encoding for the categorical columns.

###### Display the non numeric columns

In [42]:
import numpy as np

# Encode on a copy so that `data` keeps its original values for later cells.
struct_data = data.copy()

# Every column whose dtype is not numeric still has to be encoded.
categorical_frame = struct_data.select_dtypes(exclude=[np.number])
non_numeric_columns = categorical_frame.columns.tolist()

print(non_numeric_columns)

['driver_action']


In [43]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, keyed by column name. Refitting a single shared
# encoder inside the loop keeps only the mapping of the last column, so earlier
# columns could no longer be decoded with inverse_transform.
label_encoders = {}

for col in non_numeric_columns:
    # LabelEncoder numbers the sorted class labels from 0, which is why
    # 'accepted' becomes 0 and 'rejected' becomes 1 in the table below.
    le = LabelEncoder()
    struct_data[col] = le.fit_transform(struct_data[col])
    label_encoders[col] = le

display_df(struct_data)

| trip_duration   | trip_distance   | is_weekend   | hour_of_day   | driver_action   |
|:----------------|:----------------|:-------------|:--------------|:----------------|
| 217             | 30060.4         | 0            | 9             | 0               |
| 217             | 30060.4         | 0            | 9             | 1               |
| 2055            | 11905.3         | 0            | 10            | 0               |
| 2055            | 11905.3         | 0            | 10            | 1               |
| 3681            | 6844.1          | 0            | 6             | 0               |


### Apply NOTEARS algorithm to learn the structure of the data

In [47]:
from causalnex.structure.notears import from_pandas

# Directed edges that NOTEARS is not allowed to include in the learned graph,
# written as (from, to) pairs.
forbidden_edges = [
    ("driver_action", "trip_distance"),
    ("driver_action", "trip_duration"),
    ("trip_duration", "trip_distance"),
]

# Learn the structure from the label-encoded frame. `w_threshold` and
# `max_iter` are passed straight through to causalnex.
sm = from_pandas(
    struct_data,
    tabu_edges=forbidden_edges,
    w_threshold=0.8,
    max_iter=1000000,
)


#### Visualize the learned StructureModel model using the plot function

In [48]:
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

# Render the structure model with the library's WEAK node and edge presets.
weak_styling = {
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz = plot_structure(sm, **weak_styling)

# Turn physics off on the plot, then write it to an HTML file.
viz.toggle_physics(False)
viz.show("fully_connected.html")

fully_connected.html


In [None]:
# Edges added by hand to the structure model, written as (parent, child) pairs.
manual_edges = [
    ("trip_distance", "trip_duration"),
    ("trip_distance", "driver_action"),
    ("hour_of_day", "driver_action"),
    ("is_weekend", "driver_action"),
]

for parent, child in manual_edges:
    sm.add_edge(parent, child)

In [50]:

# Re-draw the structure model in its current state, again with the WEAK presets.
# (Pruning via sm.remove_edges_below_threshold(0.8) is intentionally not applied here.)
modified_styling = {
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz = plot_structure(sm, **modified_styling)
viz.show("modified.html")

modified.html


## Reducing cardinality

In [51]:
# Number of equal-width bins used for the continuous columns
num_bins = 10

# All recoding happens on a copy; `data` itself is not modified
discretised_data = data.copy()

# Distinct values observed in each column of `data`
data_vals = {}
for column_name in data.columns:
    data_vals[column_name] = data[column_name].unique()

# driver_action keeps its labels: every observed value maps to itself
driver_action_map = dict(zip(data_vals['driver_action'], data_vals['driver_action']))

# hour_of_day: values up to and including 12 become 'morning', the rest 'afternoon/night'
# NOTE(review): hour 12 is labelled 'morning' by this rule — confirm that is intended.
hour_of_day_map = {}
for hour in data_vals['hour_of_day']:
    hour_of_day_map[hour] = 'morning' if hour <= 12 else 'afternoon/night'

# is_weekend: 0 becomes 'weekday', any other value 'weekend'
is_weekend_map = {}
for flag in data_vals['is_weekend']:
    is_weekend_map[flag] = 'weekday' if flag == 0 else 'weekend'

# Recode the three categorical columns through their lookup tables
category_maps = {
    "driver_action": driver_action_map,
    "hour_of_day": hour_of_day_map,
    "is_weekend": is_weekend_map,
}
for column_name, lookup in category_maps.items():
    discretised_data[column_name] = discretised_data[column_name].map(lookup)

# Preview the first 20 recoded rows as a markdown table
preview = discretised_data[['driver_action', 'hour_of_day', 'is_weekend']].head(20)
print(preview.to_markdown(index=False, numalign="left", stralign="left"))

# Replace each continuous column by the integer index of its equal-width bin
for column_name in ('trip_distance', 'trip_duration'):
    discretised_data[column_name] = pd.cut(discretised_data[column_name], bins=num_bins, labels=False)

# Rows per bin for trip_distance, ordered by bin index
distance_counts = discretised_data['trip_distance'].value_counts().sort_index()
print("Value counts for discretized trip_distance:")
print(distance_counts.to_markdown(numalign="left", stralign="left"))

# Rows per bin for trip_duration, ordered by bin index
duration_counts = discretised_data['trip_duration'].value_counts().sort_index()
print("\nValue counts for discretized trip_duration:")
print(duration_counts.to_markdown(numalign="left", stralign="left"))

| driver_action   | hour_of_day   | is_weekend   |
|:----------------|:--------------|:-------------|
| accepted        | morning       | weekday      |
| rejected        | morning       | weekday      |
| accepted        | morning       | weekday      |
| rejected        | morning       | weekday      |
| accepted        | morning       | weekday      |
| rejected        | morning       | weekday      |
| accepted        | morning       | weekday      |
| rejected        | morning       | weekday      |
| accepted        | morning       | weekday      |
| rejected        | morning       | weekday      |
| accepted        | morning       | weekday      |
| rejected        | morning       | weekday      |
| accepted        | morning       | weekday      |
| rejected        | morning       | weekday      |
| accepted        | morning       | weekday      |
| rejected        | morning       | weekday      |
| accepted        | morning       | weekday      |
| rejected        | morning    

In [52]:
display_df(discretised_data)

| trip_duration   | trip_distance   | is_weekend   | hour_of_day   | driver_action   |
|:----------------|:----------------|:-------------|:--------------|:----------------|
| 2               | 8               | weekday      | morning       | accepted        |
| 2               | 8               | weekday      | morning       | rejected        |
| 4               | 3               | weekday      | morning       | accepted        |
| 4               | 3               | weekday      | morning       | rejected        |
| 5               | 1               | weekday      | morning       | accepted        |


In [53]:
from sklearn.model_selection import train_test_split

# Keep 90% of the discretised rows for fitting and hold out 10% for evaluation;
# the fixed seed makes the split repeatable.
split_options = {"train_size": 0.9, "test_size": 0.1, "random_state": 7}
train, test = train_test_split(discretised_data, **split_options)

In [54]:
from causalnex.network import BayesianNetwork
from causalnex.structure.structuremodel import StructureModel

# `sm` is the structure model built in the earlier cells (learned with NOTEARS,
# then extended with the manually added edges).

# Wrap it in a BayesianNetwork so that node states and conditional probability
# distributions can be fitted to it in the following cells.
bn = BayesianNetwork(sm)

In [55]:
# Node states are fitted on the full discretised frame (train and test rows
# together), not on the training split alone.
bn = bn.fit_node_states(discretised_data)  # Fit node states first

In [56]:
# Fit the conditional probability distributions on the training split only,
# using the "BayesianEstimator" method with the "K2" prior.
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

In [57]:
bn.cpds["driver_action"]

hour_of_day,afternoon/night,afternoon/night,afternoon/night,afternoon/night,afternoon/night,afternoon/night,afternoon/night,afternoon/night,afternoon/night,afternoon/night,...,morning,morning,morning,morning,morning,morning,morning,morning,morning,morning
is_weekend,weekday,weekday,weekday,weekday,weekday,weekday,weekday,weekday,weekday,weekday,...,weekend,weekend,weekend,weekend,weekend,weekend,weekend,weekend,weekend,weekend
trip_distance,0,1,2,3,4,5,6,7,8,9,...,0,1,2,3,4,5,6,7,8,9
driver_action,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
accepted,0.502915,0.504282,0.5,0.500822,0.509222,0.503289,0.492651,0.50858,0.514493,0.510089,...,0.504,0.470588,0.512821,0.483425,0.511905,0.506993,0.515152,0.513636,0.526946,0.514286
rejected,0.497085,0.495718,0.5,0.499178,0.490778,0.496711,0.507349,0.49142,0.485507,0.489911,...,0.496,0.529412,0.487179,0.516575,0.488095,0.493007,0.484848,0.486364,0.473054,0.485714


In [58]:
from causalnex.evaluation import classification_report

# Evaluate the network's predictions of `driver_action` on the held-out split.
# NOTE(review): the recorded output shows an accuracy of about 0.49 on two
# classes of nearly equal support, i.e. no better than guessing — worth
# revisiting the features and the de-duplication step before interpreting it.
classification_report(bn, test, "driver_action")

{'driver_action_accepted': {'precision': 0.5020408163265306,
  'recall': 0.8203216947822676,
  'f1-score': 0.6228775692582663,
  'support': 2549.0},
 'driver_action_rejected': {'precision': 0.45346062052505964,
  'recall': 0.15484922575387122,
  'f1-score': 0.23086269744835966,
  'support': 2454.0},
 'accuracy': 0.4939036578053168,
 'macro avg': {'precision': 0.47775071842579514,
  'recall': 0.48758546026806937,
  'f1-score': 0.426870133353313,
  'support': 5003.0},
 'weighted avg': {'precision': 0.4782119535448377,
  'recall': 0.4939036578053168,
  'f1-score': 0.43059204149062474,
  'support': 5003.0}}

## Machine learning

In [60]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m540.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:10[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [63]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score



def train_random_forest_model(X, y, test_size=0.1, random_state=42):
    """
    Trains a Random Forest classifier on the given data and evaluates its performance.

    The data is split into a training and a test part, the model is fitted on
    the training part, and accuracy and ROC-AUC measured on the test part are
    printed.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        test_size (float): Proportion of data for testing (default=0.1).
        random_state (int): Random seed used for both the train/test split and
            the forest itself, so repeated calls give the same model (default=42).

    Returns:
        RandomForestClassifier: The trained Random Forest model.
    """

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Seed the forest as well; without it every call trains a different model
    # even though the split is fixed.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    # Accuracy is computed from the hard class predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # ROC-AUC measures how well the scores rank the samples, so it needs the
    # predicted probabilities. Feeding it hard labels reduces the curve to a
    # single operating point and misreports the metric.
    y_score = model.predict_proba(X_test)
    if len(model.classes_) == 2:
        # Binary case: score of the class listed second in model.classes_
        roc_auc = roc_auc_score(y_test, y_score[:, 1])
    else:
        # More than two classes: one-vs-rest average over the full probability matrix
        roc_auc = roc_auc_score(y_test, y_score, multi_class="ovr", labels=model.classes_)

    print("Accuracy:", accuracy)
    print("ROC-AUC:", roc_auc)

    return model
