# Fichier RDF

## EDA

In [48]:
from rdflib import Graph
import pandas as pd


RDF => DataFrame

In [49]:
# monthly RDF to DataFrame
def rdf_monthly_chicago_to_df(file_path):
    """Load the monthly bus-ridership RDF/XML export into a DataFrame.

    Each distinct RDF subject becomes one row. For every triple, the last
    "/"-separated segment of the predicate is used as the field name; only
    the fields listed in ``kept_fields`` are retained, all other predicates
    are ignored.

    Parameters
    ----------
    file_path
        Location of the RDF file; passed unchanged to
        ``Graph.parse(file_path, format="xml")``.

    Returns
    -------
    pandas.DataFrame
        One row per subject with the columns ``subject``, ``city`` (always
        ``"Chicago"``) and the kept fields. ``month_beginning`` is converted
        with ``pd.to_datetime`` and the ride counts with ``pd.to_numeric``,
        both using ``errors="coerce"``. Columns that never appear in the
        graph (including the case of an empty graph) are present and filled
        with NaN/NaT instead of raising ``KeyError``.
    """
    kept_fields = (
        "route",
        "routename",
        "month_beginning",
        "avg_weekday_rides",
        "avg_saturday_rides",
        "avg_sunday_holiday_rides",
        "monthtotal",
    )
    numeric_cols = (
        "avg_weekday_rides",
        "avg_saturday_rides",
        "avg_sunday_holiday_rides",
        "monthtotal",
    )

    graph = Graph()
    graph.parse(file_path, format="xml")

    # subject URI -> partially filled row; triples of one subject may arrive in any order
    records = {}
    for subject, predicate, obj in graph:
        subject = str(subject)
        record = records.setdefault(subject, {"subject": subject, "city": "Chicago"})

        field = str(predicate).split("/")[-1]
        if field in kept_fields:
            record[field] = str(obj)

    df = pd.DataFrame(list(records.values()))

    # Guarantee the full schema so the conversions below (and downstream cells)
    # never hit a KeyError when a predicate is absent from the file.
    for column in ("subject", "city", *kept_fields):
        if column not in df.columns:
            df[column] = float("nan")

    df["month_beginning"] = pd.to_datetime(df["month_beginning"], errors="coerce")

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df

# Parse the monthly bus-route RDF export into a DataFrame.
df_Monthly = rdf_monthly_chicago_to_df(
    file_path="../Data/CTA Chicago - Ridership - Bus Routes - Monthly Day-Type Averages & Totals (RDF).rdf"
)

In [50]:
# daily RDF to DataFrame
def rdf_daily_to_df(file_path):
    """Load the daily bus-ridership RDF/XML export into a DataFrame.

    Each distinct RDF subject becomes one row. For every triple, the last
    "/"-separated segment of the predicate selects the output column:
    ``route``, ``date`` and ``daytype`` keep their name, ``rides`` is stored
    as ``ridership``; all other predicates are ignored.

    Parameters
    ----------
    file_path
        Location of the RDF file; passed unchanged to
        ``Graph.parse(file_path, format="xml")``.

    Returns
    -------
    pandas.DataFrame
        One row per subject with the columns ``subject``, ``city`` (always
        ``"Chicago"``), ``route``, ``date``, ``daytype`` and ``ridership``.
        ``date`` is converted with ``pd.to_datetime`` and ``ridership`` with
        ``pd.to_numeric``, both using ``errors="coerce"``. Columns that never
        appear in the graph (including the case of an empty graph) are
        present and filled with NaN/NaT instead of raising ``KeyError``.
    """
    # predicate suffix -> DataFrame column name
    predicate_to_column = {
        "route": "route",
        "date": "date",
        "daytype": "daytype",
        "rides": "ridership",
    }

    graph = Graph()
    graph.parse(file_path, format="xml")

    # subject URI -> partially filled row; triples of one subject may arrive in any order
    records = {}
    for subject, predicate, obj in graph:
        subject = str(subject)
        record = records.setdefault(subject, {"subject": subject, "city": "Chicago"})

        column = predicate_to_column.get(str(predicate).split("/")[-1])
        if column is not None:
            record[column] = str(obj)

    df = pd.DataFrame(list(records.values()))

    # Guarantee the full schema so the conversions below (and downstream cells)
    # never hit a KeyError when a predicate is absent from the file.
    for column in ("subject", "city", *predicate_to_column.values()):
        if column not in df.columns:
            df[column] = float("nan")

    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["ridership"] = pd.to_numeric(df["ridership"], errors="coerce")

    return df
# Parse the daily bus-route RDF export into a DataFrame.
df_Daily = rdf_daily_to_df(
    file_path="../Data/CTA Chicago - Ridership - Bus Routes - Daily Type Averages & Totals (RDF).rdf"
)


Afficher les 5 premières lignes

In [51]:
# Preview the first five daily records.
df_Daily.head(n=5)


Unnamed: 0,subject,city,ridership,route,daytype,date
0,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,21184,20,W,2001-01-03
1,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,273,90N,W,2001-01-03
2,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,515,169,W,2001-01-04
3,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,880,33,W,2001-01-05
4,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,8511,152,W,2001-01-02


In [52]:
# Preview the first five monthly records.
df_Monthly.head(n=5)

Unnamed: 0,subject,city,route,month_beginning,avg_weekday_rides,avg_saturday_rides,monthtotal,routename,avg_sunday_holiday_rides
0,https://data.cityofchicago.org/resource/bynn-g...,Chicago,204,2001-01-01,1445.4,0.0,31798,Dodge,0.0
1,https://data.cityofchicago.org/resource/bynn-g...,Chicago,8,2001-02-01,20741.7,12528.9,502067,Halsted,9279.6
2,https://data.cityofchicago.org/resource/bynn-g...,Chicago,8,2001-01-01,19582.2,12420.0,521892,Halsted,8280.8
3,https://data.cityofchicago.org/resource/bynn-g...,Chicago,127,2001-02-01,214.5,0.0,4290,Madison/Roosevelt Circulator,0.0
4,https://data.cityofchicago.org/resource/bynn-g...,Chicago,6,2001-02-01,19337.2,12335.8,467758,Jackson Park Express,7917.8


Description de la Dataframe


In [53]:
# Print the title on its own, then let the notebook render the summary as the
# cell value: passing the frame to print() inserts a separator space after the
# newline, which shifts the header row out of alignment with the data rows.
print("Description de la DataFrame Daily \n")
df_Daily.describe()

Description de la DataFrame Daily 
           ridership                        date
count    500.000000                         500
mean    6044.176000  2001-01-02 18:40:19.200000
min        2.000000         2001-01-01 00:00:00
25%     1203.250000         2001-01-02 00:00:00
50%     4013.500000         2001-01-03 00:00:00
75%     8925.000000         2001-01-04 00:00:00
max    27956.000000         2001-01-05 00:00:00
std     6091.957502                         NaN


In [54]:

# Print the title on its own, then let the notebook render the summary as the
# cell value: print() shows the table as wrapped plain text and inserts a
# separator space after the newline, misaligning the header row.
print("Description de la DataFrame Monthly \n")
df_Monthly.describe()

Description de la DataFrame Monthly 
                   month_beginning  avg_weekday_rides  avg_saturday_rides  \
count                         500         500.000000          500.000000   
mean   2001-02-12 22:59:31.200000        7586.930800         4717.066000   
min           2001-01-01 00:00:00           0.000000            0.000000   
25%           2001-01-01 00:00:00        1492.400000            0.000000   
50%           2001-02-01 00:00:00        5292.350000         2784.800000   
75%           2001-03-01 00:00:00       11869.575000         7921.900000   
max           2001-04-01 00:00:00       32205.800000        26330.600000   
std                           NaN        7351.498665         5607.979314   

          monthtotal  avg_sunday_holiday_rides  
count     500.000000                500.000000  
mean   194781.898000               3006.342400  
min       148.000000                  0.000000  
25%     34269.250000                  0.000000  
50%    134509.000000            

Afficher les infos de DataFrame 

In [55]:

# Label the output, then print the per-column dtype / non-null summary of the daily frame.
print("Info sur la DataFrame Daily \n")
df_Daily.info()

Info sur la DataFrame Daily 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   subject    500 non-null    object        
 1   city       500 non-null    object        
 2   ridership  500 non-null    int64         
 3   route      500 non-null    object        
 4   daytype    500 non-null    object        
 5   date       500 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 23.6+ KB


In [56]:
# Label the output, then print the per-column dtype / non-null summary of the monthly frame.
print("Info sur  la DataFrame Monthly \n")
df_Monthly.info()

Info sur  la DataFrame Monthly 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   subject                   500 non-null    object        
 1   city                      500 non-null    object        
 2   route                     500 non-null    object        
 3   month_beginning           500 non-null    datetime64[ns]
 4   avg_weekday_rides         500 non-null    float64       
 5   avg_saturday_rides        500 non-null    float64       
 6   monthtotal                500 non-null    int64         
 7   routename                 500 non-null    object        
 8   avg_sunday_holiday_rides  500 non-null    float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 35.3+ KB


Taille de DataFrame

In [57]:
# Report (rows, columns) for both frames, one line each.
print(
    *(
        f"{title} {frame.shape}"
        for title, frame in (
            ("Daily DataFrame shape:", df_Daily),
            ("Monthly DataFrame shape:", df_Monthly),
        )
    ),
    sep="\n",
)

Daily DataFrame shape: (500, 6)
Monthly DataFrame shape: (500, 9)


Vérifier les valeurs nulles

In [58]:
# Per-column count of missing values for each frame.
# sep="" stops print() from inserting a space after the title's newline, which
# otherwise indents the first row of the counts and breaks the column alignment.
print("Les valeurs nulles dans le DataFrame Daily :\n", df_Daily.isnull().sum(), sep="")
print("Les valeurs nulles dans le DataFrame Monthly :\n", df_Monthly.isnull().sum(), sep="")

Les valeurs nulles dans le DataFrame Daily :
 subject      0
city         0
ridership    0
route        0
daytype      0
date         0
dtype: int64
Les valeurs nulles dans le DataFrame Monthly :
 subject                     0
city                        0
route                       0
month_beginning             0
avg_weekday_rides           0
avg_saturday_rides          0
monthtotal                  0
routename                   0
avg_sunday_holiday_rides    0
dtype: int64


Vérifier les doublons

In [59]:
# Number of fully duplicated rows in each frame, one line per frame.
print(
    *(
        f"{title} {frame.duplicated().sum()}"
        for title, frame in (
            ("Les doublons dans le DataFrame Daily:", df_Daily),
            ("Les doublons dans le DataFrame Monthly:", df_Monthly),
        )
    ),
    sep="\n",
)


Les doublons dans le DataFrame Daily: 0
Les doublons dans le DataFrame Monthly: 0


## Nettoyage 

### Remarque : aucun nettoyage des valeurs n’a été appliqué, toutes étant considérées comme valides et cohérentes. Seules les colonnes `subject` et `city` sont supprimées.

In [60]:
# Drop the subject and city columns from both DataFrames.
# errors="ignore" keeps the cell re-runnable: without it, a second execution
# raises KeyError because the columns have already been removed.
df_Daily = df_Daily.drop(columns=["subject", "city"], errors="ignore")
df_Monthly = df_Monthly.drop(columns=["subject", "city"], errors="ignore")

##  Exportation du DataFrame final au format CSV


 Fusionner les données de fréquentation journalières et mensuelles par itinéraire et par mois

In [61]:
# Ensure the daily date column is a real datetime before deriving the join key.
df_Daily['date'] = pd.to_datetime(df_Daily['date'], errors='coerce')
# Snap each service date to the first day of its month so it matches the
# monthly table's `month_beginning` key.
df_Daily['month_beginning'] = df_Daily['date'].dt.to_period('M').dt.to_timestamp()

df_Monthly['month_beginning'] = pd.to_datetime(df_Monthly['month_beginning'], errors='coerce')

# Left join: keep every daily record and attach the monthly figures of the same
# route and month. validate='many_to_one' makes pandas raise a MergeError if the
# monthly table ever contains the same (route, month_beginning) twice, instead
# of silently duplicating daily rows in the exported file.
df_merged = pd.merge(
    df_Daily,
    df_Monthly,
    on=['route', 'month_beginning'],
    how='left',
    validate='many_to_one',
)


Exporter comme un CSV

In [63]:
# Write the merged daily/monthly table to disk without the row index.
df_merged.to_csv(
    path_or_buf="../Data clean/Chicago_route.csv",
    index=False,
)

# Fichier Excel

In [None]:
# Load the daily boarding totals workbook (first sheet, pandas defaults).
df_Mode = pd.read_excel(
    io="../Data/cta-ridership-daily-boarding-totals-20260203-69820a3f9df63091665572.xlsx"
)

## EDA

Afficher les 5 premières lignes

In [None]:
# Preview the first five boarding-total records.
df_Mode.head(n=5)

Unnamed: 0,service_date,day_type,bus,rail_boardings,total_rides
0,2001-01-01,U,297192,126455,423647
1,2001-01-02,W,780827,501952,1282779
2,2001-01-03,W,824923,536432,1361355
3,2001-01-04,W,870021,550011,1420032
4,2001-01-05,W,890426,557917,1448343


Taille de DataFrame

In [None]:
# (number of rows, number of columns) of df_Mode
df_Mode.shape

(9100, 5)

Description de la DataFrame

In [None]:
# Print a title, then show the summary statistics table as the cell's value.
print("Description de la DataFrame Mode: \n")
df_Mode.describe()

Description de la DataFrame Mode: 



Unnamed: 0,service_date,bus,rail_boardings,total_rides
count,2526,2526.0,2526.0,2526.0
mean,2022-06-16 12:00:00,447388.56057,335343.043943,782731.6
min,2019-01-01 00:00:00,80783.0,23544.0,110047.0
25%,2020-09-23 06:00:00,307571.5,203752.0,503023.2
50%,2022-06-16 12:00:00,441554.5,322465.0,763159.0
75%,2024-03-08 18:00:00,566131.5,405493.0,970749.0
max,2025-11-30 00:00:00,905477.0,816086.0,1677559.0
std,,175712.572371,174488.280933,345684.8


Afficher les infos de la DataFrame

In [None]:
# Label the output, then print the per-column dtype / non-null summary of df_Mode.
print("Info sur la DataFrame Mode \n")
df_Mode.info()

Info sur la DataFrame Mode 

<class 'pandas.core.frame.DataFrame'>
Index: 2526 entries, 6574 to 9099
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   service_date    2526 non-null   datetime64[ns]
 1   day_type        2526 non-null   object        
 2   bus             2526 non-null   int64         
 3   rail_boardings  2526 non-null   int64         
 4   total_rides     2526 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 118.4+ KB


Vérifier les valeurs nulles

In [None]:
# Per-column count of missing values in df_Mode.
# The title previously named the "Route" DataFrame although the counts shown
# are those of df_Mode.
print("Les valeurs nulles dans le DataFrame Mode :")
df_Mode.isna().sum()

Les valeurs nulles dans le DataFrame Route :


service_date      0
day_type          0
bus               0
rail_boardings    0
total_rides       0
dtype: int64

Vérifier les doublons

In [None]:
# Number of fully duplicated rows in df_Mode.
print(
    "Les doublons dans le DataFrame Mode:",
    df_Mode.duplicated().sum(),
)


Les doublons dans le DataFrame Mode: 0


## Nettoyage

Filtrer par date : ne conserver que les dates de service postérieures au 31/12/2018

In [None]:
# Keep only the rows whose service date is strictly after 2018-12-31.
df_Mode = df_Mode.loc[df_Mode["service_date"] > "2018-12-31"]

##  Exportation du DataFrame final au format CSV


In [None]:
# Write the filtered boarding totals to disk.
# index=False keeps the leftover row labels of the filtered frame (they no
# longer start at 0) out of the file as an unnamed first column, and matches
# how the route CSV is exported.
df_Mode.to_csv("../Data clean/Chicago_mode.csv", index=False)