## Import Libraries

In [1]:
import sys
import os
import pandas as pd
# Make the parent of the current working directory importable so that the
# project-local `scripts` package resolves.
# NOTE(review): this assumes the kernel's working directory is a direct
# sub-directory of the project root — confirm.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(project_root)

# Project-local helpers; these imports rely on the sys.path entry added above.
from scripts.data_downloader import DataDownloader
from scripts.utils import rename_traffic_columns

# Build the downloader from the dataset listing CSV. The path is relative to the
# kernel's working directory, not to the project root.
downloader = DataDownloader('../data/DGCTA - Flussi di Traffico.csv')

Loading data from: ../data/DGCTA - Flussi di Traffico.csv
Successfully loaded 196 records


## Load Data

In [None]:
# Show the metadata entry stored under a single (year, month) key: January 2021.
downloader.data_map[(2021, 1)]

{'dataset_id': 'DS-1944',
 'url': 'https://rerdatahub.blob.core.windows.net/open-datahub/DS-1944/2021/01_Gennaio_2021.csv',
 'frequency': 'SETTIMANALE',
 'status_code': 'S',
 'timestamp': '2024-08-02 17:43:31',
 'availability': 'OPEN'}

In [None]:
# Download the January 2020 file and load it into a DataFrame.
df = downloader.download_and_load_data(
    year=2020,
    month=1,
)

Downloading data for 2020-01...
URL: https://rerdatahub.blob.core.windows.net/open-datahub/DS-1944/2020/01_Gennaio_2020.csv
Progress: 100.0%
Successfully downloaded to: downloads/01_Gennaio_2020.csv
Data loaded successfully. Shape: (3568869, 8)


In [7]:
# Run the project's column-renaming helper and rebind `df` to its result.
# Later cells read columns such as 'Day' from this frame.
# NOTE(review): the exact source-to-target column mapping lives in scripts.utils — verify there.
df = rename_traffic_columns(df)

## Preprocess Data

In [None]:
# 'Day' arrives as a YYYYMMDD stamp; parse it once into real datetimes.
# NOTE: this cell is not re-runnable in isolation. Once it has executed, 'Day'
# holds the day of month and no longer matches '%Y%m%d', so re-run the
# load/rename cells first.
parsed_dates = pd.to_datetime(df['Day'], format='%Y%m%d')

# Overwrite 'Day' with the day of month and set 'Month' from the parsed dates.
# assign() avoids creating a temporary 'date' column that must be dropped
# afterwards. The previous version tried to drop a 'year' column, which is not
# among the frame's columns, instead of the temporary 'date' column.
df = df.assign(
    Day=parsed_dates.dt.day,
    Month=parsed_dates.dt.month,
)

In [31]:
# Display the dtype of every column to confirm the result of the preprocessing cell.
df.dtypes

Year              int64
Month             int32
Day               int32
HourMinute       object
MTSStationID      int64
DirectionCode     int64
VehicleType       int64
TransitCount      int64
dtype: object

## Explore Data

In [32]:
# Print the frame summary: index range, per-column dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3568869 entries, 0 to 3568868
Data columns (total 8 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   Year           int64 
 1   Month          int32 
 2   Day            int32 
 3   HourMinute     object
 4   MTSStationID   int64 
 5   DirectionCode  int64 
 6   VehicleType    int64 
 7   TransitCount   int64 
dtypes: int32(2), int64(5), object(1)
memory usage: 190.6+ MB


In [36]:
# Summary statistics for the numeric columns.
# NOTE(review): MTSStationID, DirectionCode and VehicleType look like identifiers/codes,
# so their mean/std are probably not meaningful — confirm before interpreting.
df.describe()

Unnamed: 0,Year,Month,Day,MTSStationID,DirectionCode,VehicleType,TransitCount
count,3568869.0,3568869.0,3568869.0,3568869.0,3568869.0,3568869.0,3568869.0
mean,2020.0,1.0,16.71508,332.417,0.5002459,4.553951,19.79028
std,0.0,0.0,8.773471,179.6472,0.5,2.312943,43.08847
min,2020.0,1.0,1.0,6.0,0.0,0.0,1.0
25%,2020.0,1.0,9.0,174.0,0.0,2.0,1.0
50%,2020.0,1.0,17.0,305.0,1.0,4.0,3.0
75%,2020.0,1.0,24.0,441.0,1.0,6.0,14.0
max,2020.0,1.0,31.0,676.0,1.0,10.0,793.0


In [34]:
# Distinct day-of-month values present in the frame.
df['Day'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
      dtype=int32)

In [35]:
# Distinct month values present in the frame.
df['Month'].unique()

array([1], dtype=int32)

In [37]:
# Peek at the last ten rows of the frame.
df.tail(n=10)

Unnamed: 0,Year,Month,Day,HourMinute,MTSStationID,DirectionCode,VehicleType,TransitCount
3568859,2020,1,31,23:45,675,0,4,2
3568860,2020,1,31,23:45,675,0,7,1
3568861,2020,1,31,23:45,675,0,8,3
3568862,2020,1,31,23:45,675,1,2,7
3568863,2020,1,31,23:45,675,1,4,5
3568864,2020,1,31,23:45,675,1,8,4
3568865,2020,1,31,23:45,676,0,2,37
3568866,2020,1,31,23:45,676,0,8,2
3568867,2020,1,31,23:45,676,1,2,46
3568868,2020,1,31,23:45,676,1,4,2


In [41]:
# Last five rows as a nested {column: {index: value}} dict.
df.tail(5).to_dict()

{'Year': {3568864: 2020,
  3568865: 2020,
  3568866: 2020,
  3568867: 2020,
  3568868: 2020},
 'Month': {3568864: 1, 3568865: 1, 3568866: 1, 3568867: 1, 3568868: 1},
 'Day': {3568864: 31, 3568865: 31, 3568866: 31, 3568867: 31, 3568868: 31},
 'HourMinute': {3568864: '23:45',
  3568865: '23:45',
  3568866: '23:45',
  3568867: '23:45',
  3568868: '23:45'},
 'MTSStationID': {3568864: 675,
  3568865: 676,
  3568866: 676,
  3568867: 676,
  3568868: 676},
 'DirectionCode': {3568864: 1, 3568865: 0, 3568866: 0, 3568867: 1, 3568868: 1},
 'VehicleType': {3568864: 8, 3568865: 2, 3568866: 8, 3568867: 2, 3568868: 4},
 'TransitCount': {3568864: 4,
  3568865: 37,
  3568866: 2,
  3568867: 46,
  3568868: 2}}

In [38]:
# Distinct station identifiers present in the frame.
df['MTSStationID'].unique()

array([  6,   7,   9,  12,  16,  25,  52,  53,  98, 100, 104, 107, 108,
       112, 115, 116, 117, 122, 123, 124, 125, 126, 127, 128, 130, 131,
       132, 133, 134, 137, 138, 139, 140, 143, 146, 147, 149, 150, 151,
       152, 153, 154, 155, 157, 158, 170, 171, 173, 174, 176, 177, 182,
       184, 186, 188, 219, 220, 221, 222, 223, 227, 228, 236, 238, 239,
       243, 244, 245, 246, 247, 252, 256, 258, 259, 260, 262, 263, 274,
       275, 276, 277, 278, 279, 281, 282, 285, 287, 288, 289, 290, 291,
       292, 294, 298, 300, 303, 304, 305, 307, 308, 311, 312, 314, 318,
       323, 328, 331, 332, 334, 336, 337, 339, 340, 342, 343, 344, 349,
       350, 354, 355, 356, 357, 376, 377, 383, 389, 390, 391, 393, 396,
       400, 401, 402, 403, 404, 405, 410, 411, 425, 426, 427, 428, 429,
       431, 432, 436, 437, 438, 439, 441, 442, 445, 446, 447, 448, 449,
       450, 451, 452, 453, 504, 505, 601, 602, 605, 607, 608, 610, 612,
       613, 614, 615, 616, 617, 618, 620, 621, 623, 624, 627, 62