In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import os
os.chdir("../../")

from src import display_df

In [2]:
df = pd.read_csv("data/features/features_data.csv")

In [3]:
display_df(df)

| order_id   | trip_origin         | trip_destination    | trip_start_time     | trip_end_time       | id   | driver_id   | driver_action   | drivers_lat   | drivers_lon   | day_of_week   | hour_of_day   | day_of_month   | month   | trip_start_date   | trip_end_date   | trip_origin_latitude   | trip_origin_longitude   | trip_destination_latitude   | trip_destination_longitude   | is_weekend   | is_holiday   |
|:-----------|:--------------------|:--------------------|:--------------------|:--------------------|:-----|:------------|:----------------|:--------------|:--------------|:--------------|:--------------|:---------------|:--------|:------------------|:----------------|:-----------------------|:------------------------|:----------------------------|:-----------------------------|:-------------|:-------------|
| 392001     | 6.6010417,3.2766339 | 6.4501069,3.3916154 | 2021-07-01 09:30:59 | 2021-07-01 09:34:36 | 1    | 243828      | accepted        | 6.60221       | 3.27046       | 

#### Drop irrelevant columns

In [8]:
# Drop columns not needed for training
drop_col = ['order_id', 
            'trip_start_time', 
            'trip_end_time',
            "trip_origin",
            "trip_destination",
            "id", 
            "driver_id", 
            "drivers_lat", 
            "drivers_lon", 
            "trip_start_date", 
            "trip_end_date",
          ]

In [9]:
data = df.drop(columns=drop_col)
display_df(data)

| driver_action   | day_of_week   | hour_of_day   | day_of_month   | month   | trip_origin_latitude   | trip_origin_longitude   | trip_destination_latitude   | trip_destination_longitude   | is_weekend   | is_holiday   |
|:----------------|:--------------|:--------------|:---------------|:--------|:-----------------------|:------------------------|:----------------------------|:-----------------------------|:-------------|:-------------|
| accepted        | Thursday      | 9             | 1              | July    | 6.60104                | 3.27663                 | 6.45011                     | 3.39162                      | 0            | 0            |
| rejected        | Thursday      | 9             | 1              | July    | 6.60104                | 3.27663                 | 6.45011                     | 3.39162                      | 0            | 0            |
| rejected        | Thursday      | 9             | 1              | July    | 6.60104                | 3.27663     

#### Make the data numeric. We can use label encoding for the categorical columns.

###### Display the non numeric columns

In [10]:
import numpy as np

struct_data = data.copy()
non_numeric_columns = list(struct_data.select_dtypes(exclude=[np.number]).columns)

print(non_numeric_columns)

['driver_action', 'day_of_week', 'month']


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in non_numeric_columns:
    struct_data[col] = le.fit_transform(struct_data[col])

display_df(struct_data)

| driver_action   | day_of_week   | hour_of_day   | day_of_month   | month   | trip_origin_latitude   | trip_origin_longitude   | trip_destination_latitude   | trip_destination_longitude   | is_weekend   | is_holiday   |
|:----------------|:--------------|:--------------|:---------------|:--------|:-----------------------|:------------------------|:----------------------------|:-----------------------------|:-------------|:-------------|
| 0               | 4             | 9             | 1              | 3       | 6.60104                | 3.27663                 | 6.45011                     | 3.39162                      | 0            | 0            |
| 1               | 4             | 9             | 1              | 3       | 6.60104                | 3.27663                 | 6.45011                     | 3.39162                      | 0            | 0            |
| 1               | 4             | 9             | 1              | 3       | 6.60104                | 3.27663     

### Apply NOTEARS algorithm to learn the structure of the data

##### Split the data into train and test sets

In [None]:

# 1. Split Data into Training and Hold-out Set
train_data, holdout_data = train_test_split(df, test_size=0.2, random_state=42)


In [6]:
from causalnex.structure.data_info import DataInfo

# Assuming 'df' is your DataFrame

# Identify categorical columns based on their data type (object or category)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Create a DataInfo object to store information about the data
data_info = DataInfo(df)

# Update the data_info object to mark the identified columns as categorical
for col in categorical_cols:
    data_info = data_info.set_categorical(col)

# Now, you can use this data_info object when creating your causal graph
# For example, with the StructureModel class:
from causalnex.structure import StructureModel
sm = StructureModel(data_info=data_info)


ModuleNotFoundError: No module named 'causalnex.structure.data_info'

In [5]:
from causalnex.structure import StructureModel
from causalnex.structure.notears import from_pandas

# Convert categorical variables to numeric codes
for col in ['trip_origin', 'Trip Destination', 'day_of_week', 'month']:
    train_data[col] = pd.Categorical(train_data[col]).codes

# Convert datetime variables to timestamps
for col in ['Trip Start Time', 'Trip End Time', 'Trip Start Date']:
    train_data[col] = pd.to_datetime(train_data[col]).astype(int) / 10**9


KeyError: 'Trip Origin'

In [None]:
display_df(train_data)

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time,day_of_week,hour_of_day,day_of_month,month,Trip Origin_latitude,Trip Origin_longitude,Trip Destination_latitude,Trip Destination_longitude,trip_duration,is_holiday,is_weekend,Trip Start Date
121,392249,222,216,1625098000.0,1625132000.0,1,8,1,0,6.559589,3.385647,6.531492,3.335939,2570.0,0,0,1625098000.0
499,392904,81,271,1625098000.0,1625137000.0,1,10,1,0,6.45872,3.442086,6.576325,3.347018,2684.0,0,0,1625098000.0
20,392040,126,138,1625098000.0,1625148000.0,1,10,1,0,6.500608,3.598194,6.480557,3.280383,13018.0,0,0,1625098000.0
188,392375,240,224,1625098000.0,1625134000.0,1,9,1,0,6.569703,3.262063,6.538651,3.332084,3294.0,0,0,1625098000.0
71,392148,61,109,1625098000.0,1625129000.0,1,8,1,0,6.449082,3.40379,6.459446,3.370149,2183.0,0,0,1625098000.0
106,392217,166,295,1625098000.0,1625132000.0,1,9,1,0,6.524381,3.3888,6.592106,3.338791,2378.0,0,0,1625098000.0
270,392521,303,278,1625098000.0,1625133000.0,1,9,1,0,6.601529,3.36338,6.581541,3.360005,2194.0,0,0,1625098000.0
348,392660,93,72,1625098000.0,1625140000.0,1,9,1,0,6.471358,3.278997,6.447023,3.462735,8250.0,0,0,1625098000.0
435,392802,237,102,1625098000.0,1625147000.0,1,10,1,0,6.568494,3.379576,6.456731,3.53094,12977.0,0,0,1625098000.0
102,392208,318,164,1625098000.0,1625131000.0,1,8,1,0,6.605953,3.386028,6.497595,3.383098,4141.0,0,0,1625098000.0


In [41]:

from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

# Learn structure
sm = from_pandas(train_data)


# Visualize the graph
viz = plot_structure(
    sm,
    # graph_attributes={"scale": "0.5"},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK)

viz.toggle_physics(False)

filename = "./causal_example.html"
viz.show(filename)
# Image(filename)

./causal_example.html
