# Data Loading - Paris Gentrification Analysis

**Purpose:** Download and load all raw data sources (FILOSOFI, EDUCATION, CENSUS, DVF, IRIS) for data preparation pipeline

**Output:** Raw datasets stored in `raw_datasets/` folder, ready for cleaning and aggregation

---

## Workflow Overview

1. **Load FILOSOFI** - Income data (2013, 2017, 2021)
2. **Load EDUCATION** - Higher education statistics (2013, 2017, 2021)
3. **Load CENSUS** - Demographic data (2013, 2017, 2021)
4. **Load DVF** - Real estate transaction data (2014-2021)
5. **Load IRIS** - Geographic boundaries and type classifications
6. **Export & Quality Check** - Verify all files loaded with expected structure

---



## 1. Load FILOSOFI Datasets (2013, 2017, 2021)



FILOSOFI datasets contain income and living standards data at IRIS level.

In [None]:
# Setup: Create necessary directories
import pandas as pd
import geopandas as gpd
import os
import shutil
from pathlib import Path
import gdown
import xlrd

# Create datasets and raw_datasets folders if they don't exist
datasets_dir = Path('..') / 'datasets'
raw_datasets_dir = Path('..') / 'raw_datasets'
datasets_dir.mkdir(exist_ok=True)
raw_datasets_dir.mkdir(exist_ok=True)

print(f"Setup complete. Datasets directory ready at: {datasets_dir}")
print(f"Raw datasets directory ready at: {raw_datasets_dir}")

Setup complete. Datasets directory ready at: ../datasets
Raw datasets directory ready at: ../raw_datasets


In [2]:
# FILOSOFI 2013
print("Loading FILOSOFI 2013...")

# Download file
file_id = '1fRbArcfw_DHrycI11NsjbosnXCp6Nh26'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'filosofi_2013.xlsx'
gdown.download(url, str(raw_file), quiet=False)

filosofi_2013 = pd.read_excel(raw_file, header=5)

print(f"Available columns: {list(filosofi_2013.columns)}")

# Define columns to keep with their new names
filosofi_columns_to_keep = {
    'IRIS': 'code_iris',
    'LIBIRIS': 'libelle_iris',
    'DISP_MED13': 'median_uc',}

# Filter for Paris intra-muros (département 75)
iris_col = 'IRIS'
filosofi_2013_paris = filosofi_2013[filosofi_2013[iris_col].astype(str).str.startswith('75')].copy()

# Select only columns that exist in the dataframe
cols_to_keep = [col for col in filosofi_columns_to_keep.keys() if col in filosofi_2013_paris.columns]
filosofi_2013_paris = filosofi_2013_paris[cols_to_keep].copy()

# Rename columns
rename_mapping = {col: filosofi_columns_to_keep[col] for col in cols_to_keep}
filosofi_2013_paris.rename(columns=rename_mapping, inplace=True)

# Save to datasets folder
filosofi_2013_paris.to_parquet(datasets_dir / 'filosofi_2013_paris.parquet', index=False)
print(f"FILOSOFI 2013: {len(filosofi_2013_paris)} IRIS in Paris saved")
print(f"Final columns: {list(filosofi_2013_paris.columns)}")

Loading FILOSOFI 2013...


Downloading...
From: https://drive.google.com/uc?id=1fRbArcfw_DHrycI11NsjbosnXCp6Nh26
To: /workspaces/thesis/raw_datasets/filosofi_2013.xlsx
100%|██████████| 5.04M/5.04M [00:00<00:00, 29.3MB/s]



Available columns: ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'DISP_TP6013', 'DISP_Q113', 'DISP_MED13', 'DISP_Q313', 'DISP_EQ13', 'DISP_D113', 'DISP_D213', 'DISP_D313', 'DISP_D413', 'DISP_D613', 'DISP_D713', 'DISP_D813', 'DISP_D913', 'DISP_RD13', 'DISP_S80S2013', 'DISP_GI13', 'DISP_PTSAC13', 'DISP_PBEN13', 'DISP_PPEN13', 'DISP_PPAT13', 'DISP_PPSOC13', 'DISP_PPFAM13', 'DISP_PPMINI13', 'DISP_PPLOGT13', 'DISP_PIMPOT13']
FILOSOFI 2013: 853 IRIS in Paris saved
Final columns: ['code_iris', 'libelle_iris', 'median_uc']
FILOSOFI 2013: 853 IRIS in Paris saved
Final columns: ['code_iris', 'libelle_iris', 'median_uc']


In [3]:
# Verify FILOSOFI 2013 data
print("=" * 60)
print("VERIFICATION - FILOSOFI 2013")
print("=" * 60)
loaded_filosofi_2013 = pd.read_parquet(datasets_dir / 'filosofi_2013_paris.parquet')
print(f"Number of IRIS: {len(loaded_filosofi_2013)}")
print(f"Number of rows: {len(loaded_filosofi_2013)}")
print(f"Number of columns: {len(loaded_filosofi_2013.columns)}")
print(f"Columns: {list(loaded_filosofi_2013.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_filosofi_2013['code_iris'].head(10).tolist())
print(f"\nData types:")
print(loaded_filosofi_2013.dtypes)
print(f"\nMissing values:")
print(loaded_filosofi_2013.isnull().sum())
print("=" * 60)

VERIFICATION - FILOSOFI 2013
Number of IRIS: 853
Number of rows: 853
Number of columns: 3
Columns: ['code_iris', 'libelle_iris', 'median_uc']

Sample IRIS codes:
['751010201', '751010202', '751010203', '751010204', '751010301', '751010401', '751020601', '751020701', '751020702', '751020703']

Data types:
code_iris        object
libelle_iris     object
median_uc       float64
dtype: object

Missing values:
code_iris       0
libelle_iris    0
median_uc       0
dtype: int64
Number of IRIS: 853
Number of rows: 853
Number of columns: 3
Columns: ['code_iris', 'libelle_iris', 'median_uc']

Sample IRIS codes:
['751010201', '751010202', '751010203', '751010204', '751010301', '751010401', '751020601', '751020701', '751020702', '751020703']

Data types:
code_iris        object
libelle_iris     object
median_uc       float64
dtype: object

Missing values:
code_iris       0
libelle_iris    0
median_uc       0
dtype: int64


In [4]:
# FILOSOFI 2017
print("Loading FILOSOFI 2017...")

# Download file
file_id = '1IhvzpWGInGlDj7Xpi4kHP87msylR7hiQ'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'filosofi_2017.xlsx'
gdown.download(url, str(raw_file), quiet=False)

filosofi_2017 = pd.read_excel(raw_file, header=5)

print(f"Available columns: {list(filosofi_2017.columns)}")

# Define columns to keep with their new names
filosofi_columns_to_keep = {
    'IRIS': 'code_iris',
    'LIBIRIS': 'libelle_iris',
    'DISP_MED17': 'median_uc',
}

# Filter for Paris intra-muros
iris_col = 'IRIS'
filosofi_2017_paris = filosofi_2017[filosofi_2017[iris_col].astype(str).str.startswith('75')].copy()

# Select only columns that exist in the dataframe
cols_to_keep = [col for col in filosofi_columns_to_keep.keys() if col in filosofi_2017_paris.columns]
filosofi_2017_paris = filosofi_2017_paris[cols_to_keep].copy()

# Rename columns
rename_mapping = {col: filosofi_columns_to_keep[col] for col in cols_to_keep}
filosofi_2017_paris.rename(columns=rename_mapping, inplace=True)

# Save to datasets folder
filosofi_2017_paris.to_parquet(datasets_dir / 'filosofi_2017_paris.parquet', index=False)
print(f"FILOSOFI 2017: {len(filosofi_2017_paris)} IRIS in Paris saved")
print(f"Final columns: {list(filosofi_2017_paris.columns)}")

Loading FILOSOFI 2017...


Downloading...
From: https://drive.google.com/uc?id=1IhvzpWGInGlDj7Xpi4kHP87msylR7hiQ
To: /workspaces/thesis/raw_datasets/filosofi_2017.xlsx
100%|██████████| 2.76M/2.76M [00:00<00:00, 108MB/s]



Available columns: ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'DISP_TP6017', 'DISP_Q117', 'DISP_MED17', 'DISP_Q317', 'DISP_EQ17', 'DISP_D117', 'DISP_D217', 'DISP_D317', 'DISP_D417', 'DISP_D617', 'DISP_D717', 'DISP_D817', 'DISP_D917', 'DISP_RD17', 'DISP_S80S2017', 'DISP_GI17', 'DISP_PACT17', 'DISP_PTSA17', 'DISP_PCHO17', 'DISP_PBEN17', 'DISP_PPEN17', 'DISP_PPAT17', 'DISP_PPSOC17', 'DISP_PPFAM17', 'DISP_PPMINI17', 'DISP_PPLOGT17', 'DISP_PIMPOT17', 'DISP_NOTE17']
FILOSOFI 2017: 871 IRIS in Paris saved
Final columns: ['code_iris', 'libelle_iris', 'median_uc']


In [5]:
# Verify FILOSOFI 2017 data
print("=" * 60)
print("VERIFICATION - FILOSOFI 2017")
print("=" * 60)
loaded_filosofi_2017 = pd.read_parquet(datasets_dir / 'filosofi_2017_paris.parquet')
print(f"Number of IRIS: {len(loaded_filosofi_2017)}")
print(f"Number of rows: {len(loaded_filosofi_2017)}")
print(f"Number of columns: {len(loaded_filosofi_2017.columns)}")
print(f"Columns: {list(loaded_filosofi_2017.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_filosofi_2017['code_iris'].head(10).tolist())
print(f"\nData types:")
print(loaded_filosofi_2017.dtypes)
print(f"\nMissing values:")
print(loaded_filosofi_2017.isnull().sum())
print("=" * 60)

VERIFICATION - FILOSOFI 2017
Number of IRIS: 871
Number of rows: 871
Number of columns: 3
Columns: ['code_iris', 'libelle_iris', 'median_uc']

Sample IRIS codes:
['751010201', '751010202', '751010203', '751010204', '751010301', '751010401', '751010402', '751020601', '751020602', '751020701']

Data types:
code_iris        object
libelle_iris     object
median_uc       float64
dtype: object

Missing values:
code_iris       0
libelle_iris    0
median_uc       1
dtype: int64


In [6]:
# FILOSOFI 2021
print("Loading FILOSOFI 2021...")

# Download file
file_id = '1Har2wCg63dQZSTWYxmyXQ0DcQ0dHylX8'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'filosofi_2021.xlsx'
gdown.download(url, str(raw_file), quiet=False)

filosofi_2021 = pd.read_excel(raw_file, header=5)

print(f"Available columns: {list(filosofi_2021.columns)}")

# Define columns to keep with their new names
filosofi_columns_to_keep = {
    'IRIS': 'code_iris',
    'LIBIRIS': 'libelle_iris',
    'DISP_MED21': 'median_uc',
}

# Filter for Paris intra-muros
iris_col = 'IRIS'
filosofi_2021_paris = filosofi_2021[filosofi_2021[iris_col].astype(str).str.startswith('75')].copy()

# Select only columns that exist in the dataframe
cols_to_keep = [col for col in filosofi_columns_to_keep.keys() if col in filosofi_2021_paris.columns]
filosofi_2021_paris = filosofi_2021_paris[cols_to_keep].copy()

# Rename columns
rename_mapping = {col: filosofi_columns_to_keep[col] for col in cols_to_keep}
filosofi_2021_paris.rename(columns=rename_mapping, inplace=True)

# Save to datasets folder
filosofi_2021_paris.to_parquet(datasets_dir / 'filosofi_2021_paris.parquet', index=False)
print(f"FILOSOFI 2021: {len(filosofi_2021_paris)} IRIS in Paris saved")
print(f"Final columns: {list(filosofi_2021_paris.columns)}")

Loading FILOSOFI 2021...


Downloading...
From: https://drive.google.com/uc?id=1Har2wCg63dQZSTWYxmyXQ0DcQ0dHylX8
To: /workspaces/thesis/raw_datasets/filosofi_2021.xlsx
100%|██████████| 2.82M/2.82M [00:00<00:00, 111MB/s]



Available columns: ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'DISP_TP6021', 'DISP_INCERT21', 'DISP_Q121', 'DISP_MED21', 'DISP_Q321', 'DISP_EQ21', 'DISP_D121', 'DISP_D221', 'DISP_D321', 'DISP_D421', 'DISP_D621', 'DISP_D721', 'DISP_D821', 'DISP_D921', 'DISP_RD21', 'DISP_S80S2021', 'DISP_GI21', 'DISP_PACT21', 'DISP_PTSA21', 'DISP_PCHO21', 'DISP_PBEN21', 'DISP_PPEN21', 'DISP_PPAT21', 'DISP_PPSOC21', 'DISP_PPFAM21', 'DISP_PPMINI21', 'DISP_PPLOGT21', 'DISP_PIMPOT21', 'DISP_NOTE21']
FILOSOFI 2021: 992 IRIS in Paris saved
Final columns: ['code_iris', 'libelle_iris', 'median_uc']


In [7]:
# Verify FILOSOFI 2021 data
print("=" * 60)
print("VERIFICATION - FILOSOFI 2021")
print("=" * 60)
loaded_filosofi_2021 = pd.read_parquet(datasets_dir / 'filosofi_2021_paris.parquet')
print(f"Number of IRIS: {len(loaded_filosofi_2021)}")
print(f"Number of rows: {len(loaded_filosofi_2021)}")
print(f"Number of columns: {len(loaded_filosofi_2021.columns)}")
print(f"Columns: {list(loaded_filosofi_2021.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_filosofi_2021['code_iris'].head(10).tolist())
print(f"\nData types:")
print(loaded_filosofi_2021.dtypes)
print(f"\nMissing values:")
print(loaded_filosofi_2021.isnull().sum())
print("=" * 60)

VERIFICATION - FILOSOFI 2021
Number of IRIS: 992
Number of rows: 992
Number of columns: 3
Columns: ['code_iris', 'libelle_iris', 'median_uc']

Sample IRIS codes:
['751010101', '751010102', '751010103', '751010104', '751010105', '751010199', '751010201', '751010202', '751010203', '751010204']

Data types:
code_iris       object
libelle_iris    object
median_uc       object
dtype: object

Missing values:
code_iris       0
libelle_iris    0
median_uc       0
dtype: int64


## 2. Load EDUCATION Datasets (2013, 2017, 2021)


Education datasets contain higher education attainment data at IRIS level.

**2013**: 
- `P13_NSCOL15P_SUP` → `pop_bac_sup` (all higher education, not subdivided)

**2017/2021**: 
- `P17/P21_NSCOL15P_SUP2` → `pop_bac2` (Bac+2)
- `P17/P21_NSCOL15P_SUP34` → `pop_bac34` (Bac+3/4)
- `P17/P21_NSCOL15P_SUP5` → `pop_bac5_plus` (Bac+5+)
- **Aggregated**: `pop_bac_sup` = sum of the 3 above (to match 2013)

In [8]:
# EDUCATION 2013
print("Loading EDUCATION 2013...")

# Download file
file_id = '1JmDo7waeztZukDAskj1PUGvzawussYWC'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'education_2013.xlsx'
gdown.download(url, str(raw_file), quiet=False)

education_2013 = pd.read_excel(raw_file, header=5)

# Filter for Paris intra-muros
iris_col = [col for col in education_2013.columns if 'IRIS' in col.upper()][0]
education_2013_paris = education_2013[education_2013[iris_col].astype(str).str.startswith('75')].copy()

# Keep only total higher education (not subdivided in 2013)
# P13_NSCOL15P_SUP = All higher education (Bac+2, Bac+3/4, Bac+5+ combined)
education_columns_mapping = {
    'IRIS': 'code_iris',
    'P13_NSCOL15P_SUP': 'pop_bac_sup',  # All higher education (to match aggregated 2017/2021)
}

cols_to_keep = [col for col in education_columns_mapping.keys() if col in education_2013_paris.columns]
final_names = [education_columns_mapping[col] for col in cols_to_keep]

education_2013_paris = education_2013_paris[cols_to_keep].copy()
education_2013_paris.columns = final_names

# Save to datasets folder
education_2013_paris.to_parquet(datasets_dir / 'education_2013_paris.parquet', index=False)
print(f"EDUCATION 2013: {len(education_2013_paris)} IRIS in Paris saved")
print(f"Columns: {list(education_2013_paris.columns)}")

Loading EDUCATION 2013...


Downloading...
From: https://drive.google.com/uc?id=1JmDo7waeztZukDAskj1PUGvzawussYWC
To: /workspaces/thesis/raw_datasets/education_2013.xlsx
100%|██████████| 37.1M/37.1M [00:01<00:00, 34.7MB/s]



EDUCATION 2013: 992 IRIS in Paris saved
Columns: ['code_iris', 'pop_bac_sup']


In [9]:
# Verify EDUCATION 2013 data
print("=" * 60)
print("VERIFICATION - EDUCATION 2013")
print("=" * 60)
loaded_education_2013 = pd.read_parquet(datasets_dir / 'education_2013_paris.parquet')
print(f"Number of IRIS: {len(loaded_education_2013)}")
print(f"Number of rows: {len(loaded_education_2013)}")
print(f"Number of columns: {len(loaded_education_2013.columns)}")
print(f"Columns: {list(loaded_education_2013.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_education_2013['code_iris'].head(10).tolist())
print(f"\nEducation Statistics:")
print(f"  Total Higher Ed (all): {loaded_education_2013['pop_bac_sup'].sum():,.0f}")
print(f"  Avg Higher Ed (all):   {loaded_education_2013['pop_bac_sup'].mean():,.1f}")
print(f"\nNote: 2013 data is not subdivided (includes all Bac+2, Bac+3/4, Bac+5+)")
print(f"\nData types:")
print(loaded_education_2013.dtypes)
print(f"\nMissing values:")
print(loaded_education_2013.isnull().sum())
print("=" * 60)

VERIFICATION - EDUCATION 2013
Number of IRIS: 992
Number of rows: 992
Number of columns: 2
Columns: ['code_iris', 'pop_bac_sup']

Sample IRIS codes:
['751010101', '751010102', '751010103', '751010104', '751010105', '751010199', '751010201', '751010202', '751010203', '751010204']

Education Statistics:
  Total Higher Ed (all): 914,174
  Avg Higher Ed (all):   921.5

Note: 2013 data is not subdivided (includes all Bac+2, Bac+3/4, Bac+5+)

Data types:
code_iris       object
pop_bac_sup    float64
dtype: object

Missing values:
code_iris      0
pop_bac_sup    0
dtype: int64


In [10]:
# EDUCATION 2017
print("Loading EDUCATION 2017...")

# Download file
file_id = '1ZyxFLSPaGfnVi29HjdxNoMlLh9XoeWye'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'education_2017.xlsx'
gdown.download(url, str(raw_file), quiet=False)

education_2017 = pd.read_excel(raw_file, header=5)

# Filter for Paris intra-muros
iris_col = [col for col in education_2017.columns if 'IRIS' in col.upper()][0]
education_2017_paris = education_2017[education_2017[iris_col].astype(str).str.startswith('75')].copy()

# Keep all 3 higher education categories - 2017 uses P17_ prefix
education_columns_mapping = {
    'IRIS': 'code_iris',
    'P17_NSCOL15P_SUP2': 'pop_bac2',        # Bac+2
    'P17_NSCOL15P_SUP34': 'pop_bac34',      # Bac+3/4
    'P17_NSCOL15P_SUP5': 'pop_bac5_plus',   # Bac+5+
}

cols_to_keep = [col for col in education_columns_mapping.keys() if col in education_2017_paris.columns]
final_names = [education_columns_mapping[col] for col in cols_to_keep]

education_2017_paris = education_2017_paris[cols_to_keep].copy()
education_2017_paris.columns = final_names

# Create aggregated column to match 2013 (all higher education)
education_2017_paris['pop_bac_sup'] = (
    education_2017_paris['pop_bac2'].fillna(0) + 
    education_2017_paris['pop_bac34'].fillna(0) + 
    education_2017_paris['pop_bac5_plus'].fillna(0)
)

# Save to datasets folder
education_2017_paris.to_parquet(datasets_dir / 'education_2017_paris.parquet', index=False)
print(f"EDUCATION 2017: {len(education_2017_paris)} IRIS in Paris saved")
print(f"Columns: {list(education_2017_paris.columns)}")

Loading EDUCATION 2017...


Downloading...
From: https://drive.google.com/uc?id=1ZyxFLSPaGfnVi29HjdxNoMlLh9XoeWye
To: /workspaces/thesis/raw_datasets/education_2017.xlsx
100%|██████████| 26.5M/26.5M [00:00<00:00, 106MB/s] 



EDUCATION 2017: 992 IRIS in Paris saved
Columns: ['code_iris', 'pop_bac2', 'pop_bac34', 'pop_bac5_plus', 'pop_bac_sup']


In [11]:
# Verify EDUCATION 2017 data
print("=" * 60)
print("VERIFICATION - EDUCATION 2017")
print("=" * 60)
loaded_education_2017 = pd.read_parquet(datasets_dir / 'education_2017_paris.parquet')
print(f"Number of IRIS: {len(loaded_education_2017)}")
print(f"Number of rows: {len(loaded_education_2017)}")
print(f"Number of columns: {len(loaded_education_2017.columns)}")
print(f"Columns: {list(loaded_education_2017.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_education_2017['code_iris'].head(10).tolist())
print(f"\nEducation Statistics:")
print(f"  Total Bac+2:          {loaded_education_2017['pop_bac2'].sum():,.0f}")
print(f"  Total Bac+3/4:        {loaded_education_2017['pop_bac34'].sum():,.0f}")
print(f"  Total Bac+5+:         {loaded_education_2017['pop_bac5_plus'].sum():,.0f}")
print(f"  Total Higher Ed (all): {loaded_education_2017['pop_bac_sup'].sum():,.0f}")
print(f"\nAverages per IRIS:")
print(f"  Avg Bac+2:            {loaded_education_2017['pop_bac2'].mean():,.1f}")
print(f"  Avg Bac+3/4:          {loaded_education_2017['pop_bac34'].mean():,.1f}")
print(f"  Avg Bac+5+:           {loaded_education_2017['pop_bac5_plus'].mean():,.1f}")
print(f"  Avg Higher Ed (all):  {loaded_education_2017['pop_bac_sup'].mean():,.1f}")
print(f"\nData types:")
print(loaded_education_2017.dtypes)
print(f"\nMissing values:")
print(loaded_education_2017.isnull().sum())
print("=" * 60)

VERIFICATION - EDUCATION 2017
Number of IRIS: 992
Number of rows: 992
Number of columns: 5
Columns: ['code_iris', 'pop_bac2', 'pop_bac34', 'pop_bac5_plus', 'pop_bac_sup']

Sample IRIS codes:
['751010101', '751010102', '751010103', '751010104', '751010105', '751010199', '751010201', '751010202', '751010203', '751010204']

Education Statistics:
  Total Bac+2:          133,482
  Total Bac+3/4:        245,114
  Total Bac+5+:         606,156
  Total Higher Ed (all): 984,751

Averages per IRIS:
  Avg Bac+2:            134.6
  Avg Bac+3/4:          247.1
  Avg Bac+5+:           611.0
  Avg Higher Ed (all):  992.7

Data types:
code_iris         object
pop_bac2         float64
pop_bac34        float64
pop_bac5_plus    float64
pop_bac_sup      float64
dtype: object

Missing values:
code_iris        0
pop_bac2         0
pop_bac34        0
pop_bac5_plus    0
pop_bac_sup      0
dtype: int64


In [12]:
# EDUCATION 2021
print("Loading EDUCATION 2021...")

# Download file
file_id = '1gP0FNOwIM3KPq8nVbL_oj9U_IPVJdHQW'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'education_2021.xlsx'
gdown.download(url, str(raw_file), quiet=False)

education_2021 = pd.read_excel(raw_file, header=5)

# Filter for Paris intra-muros
iris_col = [col for col in education_2021.columns if 'IRIS' in col.upper()][0]
education_2021_paris = education_2021[education_2021[iris_col].astype(str).str.startswith('75')].copy()

# Keep all 3 higher education categories - 2021 uses P21_ prefix
education_columns_mapping = {
    'IRIS': 'code_iris',
    'P21_NSCOL15P_SUP2': 'pop_bac2',        # Bac+2
    'P21_NSCOL15P_SUP34': 'pop_bac34',      # Bac+3/4
    'P21_NSCOL15P_SUP5': 'pop_bac5_plus',   # Bac+5+
}

cols_to_keep = [col for col in education_columns_mapping.keys() if col in education_2021_paris.columns]
final_names = [education_columns_mapping[col] for col in cols_to_keep]

education_2021_paris = education_2021_paris[cols_to_keep].copy()
education_2021_paris.columns = final_names

# Create aggregated column to match 2013 (all higher education)
education_2021_paris['pop_bac_sup'] = (
    education_2021_paris['pop_bac2'].fillna(0) + 
    education_2021_paris['pop_bac34'].fillna(0) + 
    education_2021_paris['pop_bac5_plus'].fillna(0)
)

# Save to datasets folder
education_2021_paris.to_parquet(datasets_dir / 'education_2021_paris.parquet', index=False)
print(f"EDUCATION 2021: {len(education_2021_paris)} IRIS in Paris saved")
print(f"Columns: {list(education_2021_paris.columns)}")

Loading EDUCATION 2021...


Downloading...
From: https://drive.google.com/uc?id=1gP0FNOwIM3KPq8nVbL_oj9U_IPVJdHQW
To: /workspaces/thesis/raw_datasets/education_2021.xlsx
100%|██████████| 29.2M/29.2M [00:00<00:00, 54.4MB/s]



EDUCATION 2021: 992 IRIS in Paris saved
Columns: ['code_iris', 'pop_bac2', 'pop_bac34', 'pop_bac5_plus', 'pop_bac_sup']


In [13]:
# Verify EDUCATION 2021 data
print("=" * 60)
print("VERIFICATION - EDUCATION 2021")
print("=" * 60)
loaded_education_2021 = pd.read_parquet(datasets_dir / 'education_2021_paris.parquet')
print(f"Number of IRIS: {len(loaded_education_2021)}")
print(f"Number of rows: {len(loaded_education_2021)}")
print(f"Number of columns: {len(loaded_education_2021.columns)}")
print(f"Columns: {list(loaded_education_2021.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_education_2021['code_iris'].head(10).tolist())
print(f"\nEducation Statistics:")
print(f"  Total Bac+2:          {loaded_education_2021['pop_bac2'].sum():,.0f}")
print(f"  Total Bac+3/4:        {loaded_education_2021['pop_bac34'].sum():,.0f}")
print(f"  Total Bac+5+:         {loaded_education_2021['pop_bac5_plus'].sum():,.0f}")
print(f"  Total Higher Ed (all): {loaded_education_2021['pop_bac_sup'].sum():,.0f}")
print(f"\nAverages per IRIS:")
print(f"  Avg Bac+2:            {loaded_education_2021['pop_bac2'].mean():,.1f}")
print(f"  Avg Bac+3/4:          {loaded_education_2021['pop_bac34'].mean():,.1f}")
print(f"  Avg Bac+5+:           {loaded_education_2021['pop_bac5_plus'].mean():,.1f}")
print(f"  Avg Higher Ed (all):  {loaded_education_2021['pop_bac_sup'].mean():,.1f}")
print(f"\nData types:")
print(loaded_education_2021.dtypes)
print(f"\nMissing values:")
print(loaded_education_2021.isnull().sum())
print("=" * 60)

VERIFICATION - EDUCATION 2021
Number of IRIS: 992
Number of rows: 992
Number of columns: 5
Columns: ['code_iris', 'pop_bac2', 'pop_bac34', 'pop_bac5_plus', 'pop_bac_sup']

Sample IRIS codes:
['751010101', '751010102', '751010103', '751010104', '751010105', '751010199', '751010201', '751010202', '751010203', '751010204']

Education Statistics:
  Total Bac+2:          123,521
  Total Bac+3/4:        238,092
  Total Bac+5+:         642,412
  Total Higher Ed (all): 1,004,025

Averages per IRIS:
  Avg Bac+2:            124.5
  Avg Bac+3/4:          240.0
  Avg Bac+5+:           647.6
  Avg Higher Ed (all):  1,012.1

Data types:
code_iris         object
pop_bac2         float64
pop_bac34        float64
pop_bac5_plus    float64
pop_bac_sup      float64
dtype: object

Missing values:
code_iris        0
pop_bac2         0
pop_bac34        0
pop_bac5_plus    0
pop_bac_sup      0
dtype: int64


## 3. Load CENSUS Datasets (2013, 2017, 2021)

CENSUS datasets contain demographic and occupational data at IRIS level.

**Variables extracted (all years 2013, 2017, 2021):**
- **Population totals**: `pop_total`, `pop_15plus`
- **Age groups**: `pop_18_24`, `pop_25_39`, `pop_65plus`
- **Occupations**: `pop_cadres` (executives), `pop_prof_inter` (intermediate professions), `pop_employes` (employees), `pop_ouvriers` (workers)
- **Immigration**: `pop_immigres`, `pop_etrangers` (foreigners)
- **IRIS type**: `typ_iris` (H=Habitat, D=Divers, A=Activité)

**Year prefixes**: 
- 2013: `P13_` / `C13_`
- 2017: `P17_` / `C17_`
- 2021: `P21_` / `C21_`

In [14]:
# CENSUS 2013
print("Loading CENSUS 2013...")

# Download file
file_id = '1b2LTSza0fRFkuVnvni60cKWKi51g3BQh'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'census_2013.xlsx'
gdown.download(url, str(raw_file), quiet=False)

census_2013 = pd.read_excel(raw_file, header=5)

# Filter for Paris intra-muros
iris_col = [col for col in census_2013.columns if 'IRIS' in col.upper()][0]
census_2013_paris = census_2013[census_2013[iris_col].astype(str).str.startswith('75')].copy()

# Keep only selected variables
census_columns_mapping = {
    'IRIS': 'code_iris',
    'TYP_IRIS': 'typ_iris',
    'P13_POP': 'pop_total',
    'C13_POP15P': 'pop_15plus',
    'C13_POP15P_CS3': 'pop_cadres',
    'C13_POP15P_CS4': 'pop_prof_inter',
    'C13_POP15P_CS5': 'pop_employes',
    'C13_POP15P_CS6': 'pop_ouvriers',
    'P13_POP1824': 'pop_18_24',
    'P13_POP2539': 'pop_25_39',
    'P13_POP65P': 'pop_65plus',
    'P13_POP_IMM': 'pop_immigres',
    'P13_POP_ETR': 'pop_etrangers',
}

cols_to_keep = [col for col in census_columns_mapping.keys() if col in census_2013_paris.columns]
final_names = [census_columns_mapping[col] for col in cols_to_keep]

census_2013_paris = census_2013_paris[cols_to_keep].copy()
census_2013_paris.columns = final_names

# Save to datasets folder
census_2013_paris.to_parquet(datasets_dir / 'census_2013_paris.parquet', index=False)
print(f"CENSUS 2013: {len(census_2013_paris)} IRIS in Paris saved")
print(f"Columns: {list(census_2013_paris.columns)}")

Loading CENSUS 2013...


Downloading...
From: https://drive.google.com/uc?id=1b2LTSza0fRFkuVnvni60cKWKi51g3BQh
To: /workspaces/thesis/raw_datasets/census_2013.xlsx
100%|██████████| 70.7M/70.7M [00:00<00:00, 77.2MB/s]



CENSUS 2013: 992 IRIS in Paris saved
Columns: ['code_iris', 'typ_iris', 'pop_total', 'pop_15plus', 'pop_cadres', 'pop_prof_inter', 'pop_employes', 'pop_ouvriers', 'pop_18_24', 'pop_25_39', 'pop_65plus', 'pop_immigres', 'pop_etrangers']


In [15]:
# Verify CENSUS 2013 data
print("=" * 60)
print("VERIFICATION - CENSUS 2013")
print("=" * 60)
loaded_census_2013 = pd.read_parquet(datasets_dir / 'census_2013_paris.parquet')
print(f"Number of IRIS: {len(loaded_census_2013)}")
print(f"Number of rows: {len(loaded_census_2013)}")
print(f"Number of columns: {len(loaded_census_2013.columns)}")
print(f"Columns: {list(loaded_census_2013.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_census_2013['code_iris'].head(10).tolist())
print(f"\nTotal population across all IRIS: {loaded_census_2013['pop_total'].sum():,.0f}")
print(f"\nData types:")
print(loaded_census_2013.dtypes)
print(f"\nMissing values:")
print(loaded_census_2013.isnull().sum())
print("=" * 60)

VERIFICATION - CENSUS 2013
Number of IRIS: 992
Number of rows: 992
Number of columns: 13
Columns: ['code_iris', 'typ_iris', 'pop_total', 'pop_15plus', 'pop_cadres', 'pop_prof_inter', 'pop_employes', 'pop_ouvriers', 'pop_18_24', 'pop_25_39', 'pop_65plus', 'pop_immigres', 'pop_etrangers']

Sample IRIS codes:
['751010101', '751010102', '751010103', '751010104', '751010105', '751010199', '751010201', '751010202', '751010203', '751010204']

Total population across all IRIS: 2,229,621

Data types:
code_iris          object
typ_iris           object
pop_total         float64
pop_15plus        float64
pop_cadres        float64
pop_prof_inter    float64
pop_employes      float64
pop_ouvriers      float64
pop_18_24         float64
pop_25_39         float64
pop_65plus        float64
pop_immigres      float64
pop_etrangers     float64
dtype: object

Missing values:
code_iris         0
typ_iris          0
pop_total         0
pop_15plus        0
pop_cadres        0
pop_prof_inter    0
pop_employes  

In [None]:
# CENSUS 2017
print("Loading CENSUS 2017...")

# Download file
file_id = '1KHGwB0S8D-gj7f3d3LBCzRjqBWxaewzw'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'census_2017.xlsx'
gdown.download(url, str(raw_file), quiet=False)

census_2017 = pd.read_excel(raw_file, header=5)

# Filter for Paris intra-muros
iris_col = [col for col in census_2017.columns if 'IRIS' in col.upper()][0]
census_2017_paris = census_2017[census_2017[iris_col].astype(str).str.startswith('75')].copy()

# Keep only selected variables (2017 uses P17_ prefix)
census_columns_mapping = {
    'IRIS': 'code_iris',
    'TYP_IRIS': 'typ_iris',
    'P17_POP': 'pop_total',
    'C17_POP15P': 'pop_15plus',
    'C17_POP15P_CS3': 'pop_cadres',
    'C17_POP15P_CS4': 'pop_prof_inter',
    'C17_POP15P_CS5': 'pop_employes',
    'C17_POP15P_CS6': 'pop_ouvriers',
    'P17_POP1824': 'pop_18_24',
    'P17_POP2539': 'pop_25_39',
    'P17_POP65P': 'pop_65plus',
    'P17_POP_IMM': 'pop_immigres',
    'P17_POP_ETR': 'pop_etrangers',
}

cols_to_keep = [col for col in census_columns_mapping.keys() if col in census_2017_paris.columns]
final_names = [census_columns_mapping[col] for col in cols_to_keep]

census_2017_paris = census_2017_paris[cols_to_keep].copy()
census_2017_paris.columns = final_names

# Save to datasets folder
census_2017_paris.to_parquet(datasets_dir / 'census_2017_paris.parquet', index=False)
print(f"CENSUS 2017: {len(census_2017_paris)} IRIS in Paris saved")
print(f"Columns: {list(census_2017_paris.columns)}")

Loading CENSUS 2017...


Downloading...
From: https://drive.google.com/uc?id=1KHGwB0S8D-gj7f3d3LBCzRjqBWxaewzw
To: /workspaces/thesis/raw_datasets/census_2017.xlsx
100%|██████████| 44.7M/44.7M [00:00<00:00, 61.5MB/s]



CENSUS 2017: 992 IRIS in Paris saved
Columns: ['code_iris', 'typ_iris', 'pop_total', 'pop_15plus', 'pop_cadres', 'pop_prof_inter', 'pop_employes', 'pop_ouvriers', 'pop_18_24', 'pop_25_39', 'pop_65plus', 'pop_immigres', 'pop_etrangers']


In [17]:
# Verify CENSUS 2017 data
print("=" * 60)
print("VERIFICATION - CENSUS 2017")
print("=" * 60)
loaded_census_2017 = pd.read_parquet(datasets_dir / 'census_2017_paris.parquet')
print(f"Number of IRIS: {len(loaded_census_2017)}")
print(f"Number of rows: {len(loaded_census_2017)}")
print(f"Number of columns: {len(loaded_census_2017.columns)}")
print(f"Columns: {list(loaded_census_2017.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_census_2017['code_iris'].head(10).tolist())
print(f"\nTotal population across all IRIS: {loaded_census_2017['pop_total'].sum():,.0f}")
print(f"\nData types:")
print(loaded_census_2017.dtypes)
print(f"\nMissing values:")
print(loaded_census_2017.isnull().sum())
print("=" * 60)

VERIFICATION - CENSUS 2017
Number of IRIS: 992
Number of rows: 992
Number of columns: 13
Columns: ['code_iris', 'typ_iris', 'pop_total', 'pop_15plus', 'pop_cadres', 'pop_prof_inter', 'pop_employes', 'pop_ouvriers', 'pop_18_24', 'pop_25_39', 'pop_65plus', 'pop_immigres', 'pop_etrangers']

Sample IRIS codes:
['751010101', '751010102', '751010103', '751010104', '751010105', '751010199', '751010201', '751010202', '751010203', '751010204']

Total population across all IRIS: 2,187,526

Data types:
code_iris          object
typ_iris           object
pop_total         float64
pop_15plus        float64
pop_cadres        float64
pop_prof_inter    float64
pop_employes      float64
pop_ouvriers      float64
pop_18_24         float64
pop_25_39         float64
pop_65plus        float64
pop_immigres      float64
pop_etrangers     float64
dtype: object

Missing values:
code_iris         0
typ_iris          0
pop_total         0
pop_15plus        0
pop_cadres        0
pop_prof_inter    0
pop_employes  

In [18]:
# CENSUS 2021
print("Loading CENSUS 2021...")

# Download file
file_id = '1yfZ3EYbtnDaRhDWvP_u8ovC6HwQNuq9s'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'census_2021.xlsx'
gdown.download(url, str(raw_file), quiet=False)

census_2021 = pd.read_excel(raw_file, header=5)

# Filter for Paris intra-muros
iris_col = [col for col in census_2021.columns if 'IRIS' in col.upper()][0]
census_2021_paris = census_2021[census_2021[iris_col].astype(str).str.startswith('75')].copy()

# Keep only selected variables (2021 uses P21_ prefix)
census_columns_mapping = {
    'IRIS': 'code_iris',
    'TYP_IRIS': 'typ_iris',
    'P21_POP': 'pop_total',
    'C21_POP15P': 'pop_15plus',
    'C21_POP15P_CS3': 'pop_cadres',
    'C21_POP15P_CS4': 'pop_prof_inter',
    'C21_POP15P_CS5': 'pop_employes',
    'C21_POP15P_CS6': 'pop_ouvriers',
    'P21_POP1824': 'pop_18_24',
    'P21_POP2539': 'pop_25_39',
    'P21_POP65P': 'pop_65plus',
    'P21_POP_IMM': 'pop_immigres',
    'P21_POP_ETR': 'pop_etrangers',
}

cols_to_keep = [col for col in census_columns_mapping.keys() if col in census_2021_paris.columns]
final_names = [census_columns_mapping[col] for col in cols_to_keep]

census_2021_paris = census_2021_paris[cols_to_keep].copy()
census_2021_paris.columns = final_names

# Save to data folder
census_2021_paris.to_parquet(datasets_dir / 'census_2021_paris.parquet', index=False)
print(f"CENSUS 2021: {len(census_2021_paris)} IRIS in Paris saved")
print(f"Columns: {list(census_2021_paris.columns)}")

Loading CENSUS 2021...


Downloading...
From: https://drive.google.com/uc?id=1yfZ3EYbtnDaRhDWvP_u8ovC6HwQNuq9s
To: /workspaces/thesis/raw_datasets/census_2021.xlsx
100%|██████████| 49.7M/49.7M [00:00<00:00, 59.7MB/s]



CENSUS 2021: 992 IRIS in Paris saved
Columns: ['code_iris', 'typ_iris', 'pop_total', 'pop_15plus', 'pop_cadres', 'pop_prof_inter', 'pop_employes', 'pop_ouvriers', 'pop_18_24', 'pop_25_39', 'pop_65plus', 'pop_immigres', 'pop_etrangers']


In [19]:
# Verify CENSUS 2021 data
print("=" * 60)
print("VERIFICATION - CENSUS 2021")
print("=" * 60)
loaded_census_2021 = pd.read_parquet(datasets_dir / 'census_2021_paris.parquet')
print(f"Number of IRIS: {len(loaded_census_2021)}")
print(f"Number of rows: {len(loaded_census_2021)}")
print(f"Number of columns: {len(loaded_census_2021.columns)}")
print(f"Columns: {list(loaded_census_2021.columns)}")
print(f"\nSample IRIS codes:")
print(loaded_census_2021['code_iris'].head(10).tolist())
print(f"\nTotal population across all IRIS: {loaded_census_2021['pop_total'].sum():,.0f}")
print(f"\nData types:")
print(loaded_census_2021.dtypes)
print(f"\nMissing values:")
print(loaded_census_2021.isnull().sum())
print("=" * 60)

VERIFICATION - CENSUS 2021
Number of IRIS: 992
Number of rows: 992
Number of columns: 13
Columns: ['code_iris', 'typ_iris', 'pop_total', 'pop_15plus', 'pop_cadres', 'pop_prof_inter', 'pop_employes', 'pop_ouvriers', 'pop_18_24', 'pop_25_39', 'pop_65plus', 'pop_immigres', 'pop_etrangers']

Sample IRIS codes:
['751010101', '751010102', '751010103', '751010104', '751010105', '751010199', '751010201', '751010202', '751010203', '751010204']

Total population across all IRIS: 2,133,111

Data types:
code_iris          object
typ_iris           object
pop_total         float64
pop_15plus        float64
pop_cadres        float64
pop_prof_inter    float64
pop_employes      float64
pop_ouvriers      float64
pop_18_24         float64
pop_25_39         float64
pop_65plus        float64
pop_immigres      float64
pop_etrangers     float64
dtype: object

Missing values:
code_iris         0
typ_iris          0
pop_total         0
pop_15plus        0
pop_cadres        0
pop_prof_inter    0
pop_employes  

## 4. Load DVF Mutations Dataset


Real estate transaction data (Demandes de Valeurs Foncières).

In [20]:
# DVF Mutations - Download from Google Drive using gdown
# File ID: 1tPQNJNFTpt0Hf_H5U9Ikk6L7c4y7d7nN
print("Loading DVF Mutations dataset...")

import geopandas as gpd

# Download file
file_id = '1tPQNJNFTpt0Hf_H5U9Ikk6L7c4y7d7nN'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'dvf_mutations.gpkg'
gdown.download(url, str(raw_file), quiet=False)

# Read and filter (keep as GeoDataFrame to preserve geometry)
dvf_mutations = gpd.read_file(raw_file)
dvf_mutations_paris = dvf_mutations[dvf_mutations['coddep'] == '75'].copy()

# Keep only selected columns + geometry
columns_to_keep = [
    'datemut', 'anneemut', 'moismut',  # temporal
    'coddep', 'l_codinsee',  # spatial
    'valeurfonc',  # transaction value
    'libtypbien', 'codtypbien',  # property type
    'sbati',  # built surface
    'geometry',  # geographic data (parcels)
]

# Only keep columns that exist
existing_columns = [col for col in columns_to_keep if col in dvf_mutations_paris.columns]
dvf_mutations_paris = dvf_mutations_paris[existing_columns].copy()

# Save as GeoParquet to preserve geometry
dvf_mutations_paris.to_parquet(datasets_dir / 'dvf_mutations_paris.parquet', index=False)
print(f"DVF Mutations: {len(dvf_mutations_paris)} transactions in Paris saved")
print(f"Columns: {list(dvf_mutations_paris.columns)}")
print(f"Geometry preserved: {dvf_mutations_paris.geometry.notna().sum()} parcels with geometry ({dvf_mutations_paris.geometry.notna().sum()/len(dvf_mutations_paris)*100:.1f}%)")

Loading DVF Mutations dataset...


Downloading...
From (original): https://drive.google.com/uc?id=1tPQNJNFTpt0Hf_H5U9Ikk6L7c4y7d7nN
From (redirected): https://drive.google.com/uc?id=1tPQNJNFTpt0Hf_H5U9Ikk6L7c4y7d7nN&confirm=t&uuid=cbce9ace-6da0-494b-baac-104d7e240711
To: /workspaces/thesis/raw_datasets/dvf_mutations.gpkg
100%|██████████| 358M/358M [00:03<00:00, 102MB/s]  
100%|██████████| 358M/358M [00:03<00:00, 102MB/s]
  result = read_func(
  result = read_func(


DVF Mutations: 457097 transactions in Paris saved
Columns: ['datemut', 'anneemut', 'moismut', 'coddep', 'l_codinsee', 'valeurfonc', 'libtypbien', 'codtypbien', 'sbati', 'geometry']
Geometry preserved: 456962 parcels with geometry (100.0%)


In [21]:
# Verify DVF Mutations data
print("=" * 60)
print("VERIFICATION - DVF MUTATIONS")
print("=" * 60)
loaded_dvf = gpd.read_parquet(datasets_dir / 'dvf_mutations_paris.parquet')
print(f"Number of transactions: {len(loaded_dvf)}")
print(f"Number of rows: {len(loaded_dvf)}")
print(f"Number of columns: {len(loaded_dvf.columns)}")
print(f"Columns: {list(loaded_dvf.columns)}")
print(f"\nDate range: {loaded_dvf['datemut'].min()} to {loaded_dvf['datemut'].max()}")
print(f"Years covered: {sorted(loaded_dvf['anneemut'].unique())}")
print(f"\nTotal transaction value: {loaded_dvf['valeurfonc'].sum():,.0f} EUR")
print(f"Average transaction value: {loaded_dvf['valeurfonc'].mean():,.2f} EUR")
print(f"\nProperty types:")
print(loaded_dvf['libtypbien'].value_counts())
print(f"\nGeometry information:")
print(f"  Has geometry column: {'geometry' in loaded_dvf.columns}")
if 'geometry' in loaded_dvf.columns:
    print(f"  Non-null geometries: {loaded_dvf.geometry.notna().sum()}")
    print(f"  Geometry types: {loaded_dvf[loaded_dvf.geometry.notna()].geometry.geom_type.unique()}")
print(f"\nData types:")
print(loaded_dvf.dtypes)
print(f"\nMissing values:")
print(loaded_dvf.isnull().sum())
print("=" * 60)

VERIFICATION - DVF MUTATIONS
Number of transactions: 457097
Number of rows: 457097
Number of columns: 10
Columns: ['datemut', 'anneemut', 'moismut', 'coddep', 'l_codinsee', 'valeurfonc', 'libtypbien', 'codtypbien', 'sbati', 'geometry']

Date range: 2014-01-02 to 2024-12-31
Years covered: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]

Total transaction value: 337,968,281,467 EUR
Average transaction value: 739,820.02 EUR

Property types:
libtypbien
UN APPARTEMENT                               330601
UNE DEPENDANCE                                51776
ACTIVITE                                      29326
DEUX APPARTEMENTS                             16431
BATI MIXTE - LOGEMENT/ACTIVITE                 7705
DES DEPENDANCES                                7327
APPARTEMENT INDETERMINE                        6346
BATI - INDETERMINE : Vefa sans descriptif      4184
UN

## 5. Load IRIS GeoJSON


Geographic boundaries for IRIS zones in France.

In [22]:
# IRIS GeoJSON - Geographic boundaries
print("Loading IRIS GeoJSON...")

# Download file
file_id = '1yWwsp5LcykD5UtvVPj_S695ALSKsCskP'
url = f'https://drive.google.com/uc?id={file_id}'
raw_file = raw_datasets_dir / 'iris.geojson'
gdown.download(url, str(raw_file), quiet=False)

iris_geo = gpd.read_file(raw_file)

# Filter for Paris intra-muros using the code_iris column (full 9-digit code)
iris_geo_paris = iris_geo[iris_geo['code_iris'].astype(str).str.startswith('75')].copy()

# Save to datasets folder
iris_geo_paris.to_file(datasets_dir / 'iris_paris.geojson', driver='GeoJSON')
print(f"IRIS GeoJSON: {len(iris_geo_paris)} IRIS zones in Paris saved")
print(f"Columns: {list(iris_geo_paris.columns)}")

Loading IRIS GeoJSON...


Downloading...
From: https://drive.google.com/uc?id=1yWwsp5LcykD5UtvVPj_S695ALSKsCskP
To: /workspaces/thesis/raw_datasets/iris.geojson
100%|██████████| 11.6M/11.6M [00:00<00:00, 52.2MB/s]



IRIS GeoJSON: 992 IRIS zones in Paris saved
Columns: ['dep', 'insee_com', 'nom_com', 'iris', 'code_iris', 'nom_iris', 'typ_iris', 'geo_point_2d', 'id', 'geometry']


In [23]:
# Verify IRIS GeoJSON data
print("=" * 60)
print("VERIFICATION - IRIS GEOJSON")
print("=" * 60)
loaded_iris_geo = gpd.read_file(datasets_dir / 'iris_paris.geojson')
print(f"Number of IRIS zones: {len(loaded_iris_geo)}")
print(f"Number of rows: {len(loaded_iris_geo)}")
print(f"Number of columns: {len(loaded_iris_geo.columns)}")
print(f"Columns: {list(loaded_iris_geo.columns)}")
print(f"\nCRS (Coordinate Reference System): {loaded_iris_geo.crs}")
print(f"Geometry type: {loaded_iris_geo.geometry.geom_type.unique()}")
print(f"\nSample IRIS codes:")
print(loaded_iris_geo['code_iris'].head(10).tolist())
print(f"\nIRIS type distribution:")
print(loaded_iris_geo['typ_iris'].value_counts())
print(f"\nData types:")
print(loaded_iris_geo.dtypes)
print(f"\nMissing values:")
print(loaded_iris_geo.isnull().sum())
print("=" * 60)

VERIFICATION - IRIS GEOJSON
Number of IRIS zones: 992
Number of rows: 992
Number of columns: 10
Columns: ['dep', 'insee_com', 'nom_com', 'iris', 'code_iris', 'nom_iris', 'typ_iris', 'geo_point_2d', 'id', 'geometry']

CRS (Coordinate Reference System): EPSG:4326
Geometry type: ['Polygon' 'MultiPolygon']

Sample IRIS codes:
['751072601', '751072603', '751093605', '751114108', '751114404', '751186903', '751208022', '751156099', '751072705', '751072804']

IRIS type distribution:
typ_iris
H    861
A     88
D     43
Name: count, dtype: int64

Data types:
dep               object
insee_com          int32
nom_com           object
iris              object
code_iris         object
nom_iris          object
typ_iris          object
geo_point_2d      object
id                object
geometry        geometry
dtype: object

Missing values:
dep             0
insee_com       0
nom_com         0
iris            0
code_iris       0
nom_iris        0
typ_iris        0
geo_point_2d    0
id              0
ge

## 6. Export & Quality Check


All datasets have been filtered for Paris intra-muros and saved to the `datasets/` folder.

In [24]:
# Summary of all loaded datasets
import os

print("=" * 80)
print("DATASETS LOADED - PARIS INTRA-MUROS ONLY")
print("=" * 80)

print("\nRAW DATASETS:")
print("-" * 40)
raw_files = sorted(os.listdir(raw_datasets_dir))
for file in raw_files:
    file_path = raw_datasets_dir / file
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    print(f"{file}: {size_mb:.2f} MB")

print("\nPROCESSED DATASETS:")
print("-" * 40)
datasets_files = sorted(os.listdir(datasets_dir))
for file in datasets_files:
    file_path = datasets_dir / file
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    print(f"{file}: {size_mb:.2f} MB")

print("\n" + "=" * 80)
print("DATA LOADING COMPLETE!")
print("=" * 80)
print("\nDatasets summary:")
print("- 3 FILOSOFI datasets (2013, 2017, 2021) - Income data")
print("- 3 EDUCATION datasets (2013, 2017, 2021) - Higher education (Bac+3/4)")
print("- 3 CENSUS datasets (2013, 2017, 2021) - Population data")
print("- DVF Mutations - Real estate transactions")
print("- IRIS GeoJSON - Geographic boundaries for Paris IRIS zones")
print(f"\nRaw data saved in '{raw_datasets_dir}/' folder")
print(f"Processed data saved in '{datasets_dir}/' folder - all filtered for Paris only.")

DATASETS LOADED - PARIS INTRA-MUROS ONLY

RAW DATASETS:
----------------------------------------
census_2013.xlsx: 67.43 MB
census_2017.xlsx: 42.63 MB
census_2021.xlsx: 47.39 MB
dvf_mutations.gpkg: 341.20 MB
education_2013.xlsx: 35.34 MB
education_2017.xlsx: 25.23 MB
education_2021.xlsx: 27.81 MB
filosofi_2013.xlsx: 4.81 MB
filosofi_2017.xlsx: 2.63 MB
filosofi_2021.xlsx: 2.69 MB
iris.geojson: 11.11 MB

PROCESSED DATASETS:
----------------------------------------
census_2013_paris.parquet: 0.11 MB
census_2013_paris_clean.parquet: 0.12 MB
census_2017_paris.parquet: 0.11 MB
census_2017_paris_clean.parquet: 0.12 MB
census_2021_paris.parquet: 0.11 MB
census_2021_paris_clean.parquet: 0.12 MB
dvf_mutations_paris.parquet: 109.86 MB
dvf_mutations_paris_clean.parquet: 3.46 MB
education_2013_paris.parquet: 0.02 MB
education_2013_paris_clean.parquet: 0.02 MB
education_2017_paris.parquet: 0.04 MB
education_2017_paris_clean.parquet: 0.04 MB
education_2021_paris.parquet: 0.04 MB
education_2021_paris_