In [1]:
import pandas as pd
import numpy as np
import warnings

# Filtering warning for the append method being replaced by pd.concat
warnings.filterwarnings("ignore")

In [2]:
import os
import io

folder_path = "../data/traffic"

text_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through each text file and create a DataFrame from its content
for file_name in text_files:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r') as file:
        file_content = file.read()
        df = pd.read_csv(io.StringIO(file_content), delimiter=';')
        dataframes.append(df)

# Concatenate all DataFrames into one
final_dataframe = pd.concat(dataframes, ignore_index=True)

### In the _traffic_ dataframe
#### _"etat_traffic"_ column: 

- 0: unknown
- 1: fluid (occupation_rate between 0 and 0.15)
- 2: pre-saturation (occupation_rate between 0.15 and 0.30)
- 3: saturated (occupation_rate between 0.30 and 0.50)
- 4: blocked (occupation_rate greater than 0.50)

#### _"etat_barre"_ column (tells us whether a road is closed or not):

- 0: unknown
- 1: open
- 2: closed
- 3: invalid

In [3]:
final_dataframe

Unnamed: 0,iu_ac,libelle,iu_nd_amont,libelle_nd_amont,iu_nd_aval,libelle_nd_aval,t_1h,q,k,etat_trafic,etat_barre,dessin
0,799,Bd_Kellermann,460,Bd_Kellermann-Moulin_Pointe,459,Bd_Kellermann-Damesme,2020-01-01 01:00:00,,,0,3,"""<PLINE COURBE=""""1""""><PT X=""""601352"""" Y=""""1245..."
1,799,Bd_Kellermann,460,Bd_Kellermann-Moulin_Pointe,459,Bd_Kellermann-Damesme,2020-01-01 02:00:00,,,0,3,"""<PLINE COURBE=""""1""""><PT X=""""601352"""" Y=""""1245..."
2,799,Bd_Kellermann,460,Bd_Kellermann-Moulin_Pointe,459,Bd_Kellermann-Damesme,2020-01-01 03:00:00,,,0,3,"""<PLINE COURBE=""""1""""><PT X=""""601352"""" Y=""""1245..."
3,799,Bd_Kellermann,460,Bd_Kellermann-Moulin_Pointe,459,Bd_Kellermann-Damesme,2020-01-01 04:00:00,,,0,3,"""<PLINE COURBE=""""1""""><PT X=""""601352"""" Y=""""1245..."
4,799,Bd_Kellermann,460,Bd_Kellermann-Moulin_Pointe,459,Bd_Kellermann-Damesme,2020-01-01 05:00:00,,,0,3,"""<PLINE COURBE=""""1""""><PT X=""""601352"""" Y=""""1245..."
...,...,...,...,...,...,...,...,...,...,...,...,...
118018906,5047,Daumesnil,2664,Daumesnil-Abel-Parrot,2666,Daumesnil-Traversiere,2023-12-31 20:00:00,671.0,,0,3,"""<PLINE><PT X=""""602845"""" Y=""""127426""""/><PT X=""..."
118018907,5047,Daumesnil,2664,Daumesnil-Abel-Parrot,2666,Daumesnil-Traversiere,2023-12-31 21:00:00,751.0,,0,3,"""<PLINE><PT X=""""602845"""" Y=""""127426""""/><PT X=""..."
118018908,5047,Daumesnil,2664,Daumesnil-Abel-Parrot,2666,Daumesnil-Traversiere,2023-12-31 22:00:00,640.0,,0,3,"""<PLINE><PT X=""""602845"""" Y=""""127426""""/><PT X=""..."
118018909,5047,Daumesnil,2664,Daumesnil-Abel-Parrot,2666,Daumesnil-Traversiere,2023-12-31 23:00:00,453.0,,0,3,"""<PLINE><PT X=""""602845"""" Y=""""127426""""/><PT X=""..."


In [4]:
traffic = final_dataframe.drop(columns=['etat_barre', 'etat_trafic', 'dessin', 'iu_ac', 'iu_nd_amont', 'libelle_nd_amont',
       'iu_nd_aval', 'libelle_nd_aval', 'libelle'])

flow = traffic.groupby("t_1h").sum("q")
occupation_rate = traffic.groupby("t_1h").mean("k")
traffic = traffic.drop_duplicates(subset=["t_1h"])

traffic = traffic.merge(flow, on="t_1h", how="left")
traffic = traffic.merge(occupation_rate, on="t_1h", how="left")

traffic = traffic.drop_duplicates()
traffic = traffic.rename(columns={"q": "flow", "k": "occupation_rate", "t_1h": "date"})
traffic = traffic.drop(columns=["q_x", "k_x", "q_y", "k_y"])

traffic["date"] = pd.to_datetime(traffic["date"])

KeyError: "['dessin', 'iu_ac', 'iu_nd_amont', 'libelle_nd_amont', 'iu_nd_aval', 'libelle_nd_aval'] not found in axis"

In [8]:
traffic

Unnamed: 0,date,flow,occupation_rate
0,2020-01-01 01:00:00,665.634534,7.436377
1,2020-01-01 02:00:00,812.483517,10.620825
2,2020-01-01 03:00:00,825.396832,9.824548
3,2020-01-01 04:00:00,748.093018,7.255026
4,2020-01-01 05:00:00,646.331628,5.412504
...,...,...,...
34930,2023-12-31 20:00:00,817.656959,10.942673
34931,2023-12-31 21:00:00,809.873308,10.271545
34932,2023-12-31 22:00:00,744.325419,7.861960
34933,2023-12-31 23:00:00,696.066817,7.405008


In [43]:
path_traffic2 = "../data/comptages-routiers-permanents.csv"
traffic2 = pd.read_csv(path_traffic2, sep=';').drop_duplicates()

In [44]:
traffic2["Date et heure de comptage"] = pd.to_datetime(traffic2["Date et heure de comptage"], utc=True)
traffic2 = traffic2.set_index("Date et heure de comptage").tz_localize(None).reset_index()

In [45]:
traffic2 = traffic2.rename(columns={"Débit horaire": "flow", "Taux d'occupation": "occupation_rate", "Date et heure de comptage": "date"})

flow2 = traffic2.groupby("date").sum("flow")
occupation_rate2 = traffic2.groupby("date").mean("occupation_rate")
traffic2 = traffic2.drop_duplicates(subset=["date"])

traffic2 = traffic2.merge(flow2, on="date", how="left")
traffic2 = traffic2.merge(occupation_rate2, on="date", how="left")

traffic2 = traffic2.drop_duplicates()
traffic2 = traffic2.drop(columns=["Identifiant arc", "Libelle", "Etat trafic", "Identifiant noeud amont", "Libelle noeud amont",
                                  "Identifiant noeud aval", "Libelle noeud aval", "Etat arc", "Date debut dispo data", "Date fin dispo data",
                                  "geo_point_2d", "geo_shape", "flow_x", "flow_y", "occupation_rate_x", "occupation_rate_y", "Identifiant arc_x",
                                  "Identifiant noeud amont_x", "Identifiant noeud aval_x", "Identifiant arc_y", "Identifiant noeud amont_y",
                                  "Identifiant noeud aval_y"])

In [46]:
threshold_date = "2024-01-01"
date_filter = pd.to_datetime(threshold_date)

traffic2 = traffic2[traffic2["date"] >= date_filter]

In [47]:
traffic_final = pd.concat([traffic, traffic2])

In [48]:
traffic_final

Unnamed: 0,date,flow,occupation_rate
0,2020-01-01 01:00:00,665.634534,7.436377
1,2020-01-01 02:00:00,812.483517,10.620825
2,2020-01-01 03:00:00,825.396832,9.824548
3,2020-01-01 04:00:00,748.093018,7.255026
4,2020-01-01 05:00:00,646.331628,5.412504
...,...,...,...
2359,2024-07-13 01:00:00,482.097842,3.493852
2360,2024-07-13 21:00:00,764.547682,5.884027
2361,2024-07-13 22:00:00,894.775573,6.114897
2362,2024-07-13 09:00:00,835.008974,5.640619


In [49]:
# Exporting the traffic data set in order to be able to use it later without running everything again (very long concatenation)
traffic_final.to_csv("../data/traffic_data.csv", sep=";")