In [1]:
from pathlib import Path
import pandas as pd
import sys
import os
# add project root to sys.path
sys.path.append(os.path.abspath(".."))

from cdrlib.pipeline import load_all_days, build_master_parquet




ROOT = Path.cwd().parent           # repo root 
DATA_DIR = ROOT / "data"
OUT_PARQUET = ROOT / "milan_cdr_master.parquet"


In [6]:
files = load_all_days(DATA_DIR, pattern="sms-call-internet-mi-*.csv")
len(files), files[0], files[-1]

(7,
 WindowsPath('c:/Users/tempadmin.DESKTOP-C50JUBO/Desktop/DS Roadmap/CDR Mobility & Traffic Analytics Project/data/sms-call-internet-mi-2013-11-01.csv'),
 WindowsPath('c:/Users/tempadmin.DESKTOP-C50JUBO/Desktop/DS Roadmap/CDR Mobility & Traffic Analytics Project/data/sms-call-internet-mi-2013-11-07.csv'))

In [7]:
cdr = build_master_parquet(files, OUT_PARQUET)
cdr.head(3)

Unnamed: 0,square_id,time_interval,sms_in,sms_out,call_in,call_out,internet
0,1,2013-11-01 00:00:00,2.0843,1.1047,0.5919,0.4293,57.799
1,1,2013-11-01 01:00:00,1.1637,0.77,0.1906,0.1942,44.0469
2,1,2013-11-01 02:00:00,0.4156,0.3004,0.0279,0.136,41.2071


In [8]:
print("Rows:", len(cdr))
print("Date range:", cdr["time_interval"].min(), "→", cdr["time_interval"].max())
print("Unique cells:", cdr["square_id"].nunique())


Rows: 1679994
Date range: 2013-11-01 00:00:00 → 2013-11-07 23:00:00
Unique cells: 10000


In [9]:
import geopandas as gpd

gdf = gpd.read_file(DATA_DIR / "milano-grid.geojson")
gdf["square_id"] = gdf["cellId"].astype("int32")

cent = gdf.to_crs(4326).centroid
centroids = pd.DataFrame({
    "square_id": gdf["square_id"].astype("int32"),
    "lon": cent.x.round(6),
    "lat": cent.y.round(6),
})
centroids.to_csv(ROOT / "milan_cell_centroids.csv", index=False)
centroids.head()



  cent = gdf.to_crs(4326).centroid


Unnamed: 0,square_id,lon,lat
0,1,9.012991,45.357743
1,2,9.015991,45.357743
2,3,9.018992,45.357743
3,4,9.021992,45.357742
4,5,9.024993,45.357741


## 📌 Conclusions (Notebook 1 — Data Loading & Preparation)

- Successfully loaded all **raw daily CSVs** of Milan mobile phone activity.  
- Built a unified **Parquet master dataset** (`milan_cdr_master.parquet`) with consistent schema.  
- Extracted the official **grid cell centroids** from the GeoJSON and exported as CSV.  

➡ Notebook 1 establishes the **data foundation**: clean, standardized, and geospatially enabled — ready for EDA and clustering in Notebook 2.
