# EDA

### Chicago


In [40]:
from rdflib import Graph
import pandas as pd


RDF => DataFrame

In [41]:
# monthly RDF to DataFrame
def rdf_monthly_chicago_to_df(file_path):
    g = Graph()
    g.parse(file_path, format="xml")

    data = {}

    for s, p, o in g:
        s = str(s)
        p = str(p)
        o = str(o)

        if s not in data:
            data[s] = {
                "subject": s,
                "city": "Chicago"
            }

        field = p.split("/")[-1]

        if field in [
            "route",
            "routename",
            "month_beginning",
            "avg_weekday_rides",
            "avg_saturday_rides",
            "avg_sunday_holiday_rides",
            "monthtotal"
        ]:
            data[s][field] = o

    df = pd.DataFrame(data.values())

    df["month_beginning"] = pd.to_datetime(df["month_beginning"], errors="coerce")

    numeric_cols = [
        "avg_weekday_rides",
        "avg_saturday_rides",
        "avg_sunday_holiday_rides",
        "monthtotal"
    ]

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df

df_Monthly = rdf_monthly_chicago_to_df("CTA Chicago - Ridership - Bus Routes - Monthly Day-Type Averages & Totals (RDF).rdf")


In [42]:
# daily RDF to DataFrame
def rdf_daily_to_df(file_path):
    g = Graph()
    g.parse(file_path, format="xml")

    rows = {}

    for s, p, o in g:
        s = str(s)
        p = str(p)
        o = str(o)

        if s not in rows:
            rows[s] = {
                "subject": s,
                "city": "Chicago"
            }

        col = p.split("/")[-1]

        if col == "route":
            rows[s]["route"] = o
        elif col == "date":
            rows[s]["date"] = o
        elif col == "daytype":
            rows[s]["daytype"] = o
        elif col == "rides":
            rows[s]["ridership"] = o

    df = pd.DataFrame(rows.values())

    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["ridership"] = pd.to_numeric(df["ridership"], errors="coerce")

    return df
df_Daily = rdf_daily_to_df("CTA Chicago - Ridership - Bus Routes - Daily Type Averages & Totals (RDF).rdf")


In [43]:
df_Daily.head()


Unnamed: 0,subject,city,daytype,ridership,date,route
0,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,5932,2001-01-02,71
1,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,2445,2001-01-03,37
2,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,25393,2001-01-03,9
3,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,403,2001-01-03,56A
4,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,4096,2001-01-03,73


In [44]:
df_Monthly.head()

Unnamed: 0,subject,city,route,month_beginning,routename,avg_saturday_rides,avg_sunday_holiday_rides,monthtotal,avg_weekday_rides
0,https://data.cityofchicago.org/resource/bynn-g...,Chicago,4,2001-02-01,Cottage Grove,17637.0,11797.6,595896,23907.9
1,https://data.cityofchicago.org/resource/bynn-g...,Chicago,56,2001-01-01,Milwaukee,9160.4,6174.4,381555,14274.6
2,https://data.cityofchicago.org/resource/bynn-g...,Chicago,62,2001-04-01,Archer,7784.5,5453.1,379009,15266.9
3,https://data.cityofchicago.org/resource/bynn-g...,Chicago,147,2001-03-01,Outer Drive Express,6144.3,0.0,244988,9739.4
4,https://data.cityofchicago.org/resource/bynn-g...,Chicago,79,2001-01-01,79th,23808.6,15573.7,833507,30018.4


Description


In [45]:
print("Description de la DataFrame Daily \n",df_Daily.describe())

Description de la DataFrame Daily 
           ridership                        date
count    500.000000                         500
mean    6044.176000  2001-01-02 18:40:19.200000
min        2.000000         2001-01-01 00:00:00
25%     1203.250000         2001-01-02 00:00:00
50%     4013.500000         2001-01-03 00:00:00
75%     8925.000000         2001-01-04 00:00:00
max    27956.000000         2001-01-05 00:00:00
std     6091.957502                         NaN


In [46]:

print("Description de la DataFrame Monthly \n",df_Monthly.describe())

Description de la DataFrame Monthly 
                   month_beginning  avg_saturday_rides  \
count                         500          500.000000   
mean   2001-02-12 22:59:31.200000         4717.066000   
min           2001-01-01 00:00:00            0.000000   
25%           2001-01-01 00:00:00            0.000000   
50%           2001-02-01 00:00:00         2784.800000   
75%           2001-03-01 00:00:00         7921.900000   
max           2001-04-01 00:00:00        26330.600000   
std                           NaN         5607.979314   

       avg_sunday_holiday_rides     monthtotal  avg_weekday_rides  
count                500.000000     500.000000         500.000000  
mean                3006.342400  194781.898000        7586.930800  
min                    0.000000     148.000000           0.000000  
25%                    0.000000   34269.250000        1492.400000  
50%                 1410.500000  134509.000000        5292.350000  
75%                 5344.325000  310161.

In [47]:

print("Info sur la DataFrame Daily \n")
df_Daily.info()

Info sur la DataFrame Daily 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   subject    500 non-null    object        
 1   city       500 non-null    object        
 2   daytype    500 non-null    object        
 3   ridership  500 non-null    int64         
 4   date       500 non-null    datetime64[ns]
 5   route      500 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 23.6+ KB


In [48]:
print("Info sur  la DataFrame Monthly \n")
df_Monthly.info()

Info sur  la DataFrame Monthly 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   subject                   500 non-null    object        
 1   city                      500 non-null    object        
 2   route                     500 non-null    object        
 3   month_beginning           500 non-null    datetime64[ns]
 4   routename                 500 non-null    object        
 5   avg_saturday_rides        500 non-null    float64       
 6   avg_sunday_holiday_rides  500 non-null    float64       
 7   monthtotal                500 non-null    int64         
 8   avg_weekday_rides         500 non-null    float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 35.3+ KB


Taille de DataFrame

In [49]:
print("Daily DataFrame shape:", df_Daily.shape)
print("Monthly DataFrame shape:", df_Monthly.shape)

Daily DataFrame shape: (500, 6)
Monthly DataFrame shape: (500, 9)


Vérifier les valeurs nulls

In [50]:
print("Les valeurs nulles dans le DataFrame Daily :\n", df_Daily.isnull().sum())
print("Les valeurs nulles dans le DataFrame Monthly :\n", df_Monthly.isnull().sum())

Les valeurs nulles dans le DataFrame Daily :
 subject      0
city         0
daytype      0
ridership    0
date         0
route        0
dtype: int64
Les valeurs nulles dans le DataFrame Monthly :
 subject                     0
city                        0
route                       0
month_beginning             0
routename                   0
avg_saturday_rides          0
avg_sunday_holiday_rides    0
monthtotal                  0
avg_weekday_rides           0
dtype: int64


Les doublons

In [51]:
print("Les doublons dans le DataFrame Daily:",df_Daily.duplicated().sum())
print("Les doublons dans le DataFrame Monthly:",df_Monthly.duplicated().sum())


Les doublons dans le DataFrame Daily: 0
Les doublons dans le DataFrame Monthly: 0


# Nettoyage 

### Remarque : aucun nettoyage de données n’a été appliqué. Toutes les valeurs sont considérées comme valides et logiques.

In [52]:
#supprimer colonnes subject et city des deux DataFrames
df_Daily = df_Daily.drop(columns=["subject", "city"])
df_Monthly = df_Monthly.drop(columns=["subject", "city"])

Exporter comme un CSV

In [53]:
df_Daily['date'] = pd.to_datetime(df_Daily['date'], errors='coerce')
df_Daily['month_beginning'] = df_Daily['date'].dt.to_period('M').dt.to_timestamp()

df_Monthly['month_beginning'] = pd.to_datetime(df_Monthly['month_beginning'], errors='coerce')

df_merged = pd.merge(
    df_Daily,
    df_Monthly,
    on=['route', 'month_beginning'],
    how='left'
)


In [54]:
df_merged.to_csv("Chicago_Transit_Authority_Bus_Ridership.csv", index=False)