## Loading and preprocessing ISTAT data

Processing pipeline to collect ISTAT CPA 2011 data for selected cities from the downloaded data dump.

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
from os import path, listdir

In [None]:
## TODO: find way to put this into some global settings
import os
import sys

# Walk up two directory levels from the current working directory and make
# that folder importable (presumably where the `references` package lives).
nb_dir = os.getcwd()
nb_dir = os.path.dirname(nb_dir)  # one level up
nb_dir = os.path.dirname(nb_dir)  # two levels up
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from references import commonCfg

# Name of the column holding the sezione identifier, taken from shared config.
sezNameCol = commonCfg.sezioneColName

In [None]:
## Loading ISTAT data
# Read each file returned by the shared config and keep, for every city in
# commonCfg.cityList, the rows whose COMUNE matches it (indexed by SEZ2011).
fileList = commonCfg.get_istat_filelist()
dfList = []  # NOTE(review): never filled or read in this cell
dataDict = {}
for filename in fileList:
    regionPath = path.join(commonCfg.cpaPath, filename)
    regionData = pd.read_csv(regionPath, sep=';', encoding='latin')
    regionData = regionData.set_index('SEZ2011')
    # extract the councils we are interested in
    for city in commonCfg.cityList:
        isCity = regionData['COMUNE'] == city
        cityData = regionData.loc[isCity]
        # only record cities that actually appear in this file
        if not cityData.empty:
            dataDict[city] = cityData

# export as csv to final folder
# (currently disabled: the loop body is a no-op, the export line is commented out)
for city, cityData in dataDict.items():
    pass
    #cityData.to_csv('../../data/processed/istat_cpa_2011/' + city + '_cpa_2011.csv', sep=';')

### Process ISTAT data for Milano to assign a quartiere label to each sezione

In [None]:
# Join sez geofile for Milano - bespoke processing
shapeDataMilano = gpd.read_file('../../data/raw/Milano_specific/Milano_sezioniShapefile')
cpaMilano = dataDict['Milano']

# Coverage check in both directions: fraction of shapefile sezioni present in
# the CPA index, then fraction of CPA sezioni present in the shapefile.
# These aren't exactly equal to 1, need to check mismatches in joining
shapeInCpa = shapeDataMilano[sezNameCol].isin(cpaMilano.index)
print(shapeInCpa.mean())
cpaInShape = cpaMilano.index.isin(shapeDataMilano[sezNameCol])
print(cpaInShape.mean())

# Inner join: only sezioni present in both sources survive.
joinedData = pd.merge(
    shapeDataMilano,
    cpaMilano,
    how='inner',
    left_on=sezNameCol,
    right_index=True,
)
# Cast as int
joinedData[sezNameCol] = joinedData[sezNameCol].astype(int)

In [None]:
# collect quartiere label
quartiereLabels = pd.read_csv('../../data/raw/Milano_specific/Milano_sezToQuartieri.csv')

# fix typo
# Assign the result back to the column instead of calling replace(inplace=True)
# on `quartiereLabels.NIL`: an in-place call on a column pulled out of the frame
# does not update the frame under pandas Copy-on-Write, so the fix would be
# silently lost and the assert below would fail.
quartiereLabels['NIL'] = quartiereLabels['NIL'].replace(
    to_replace='MAGENTA - S.VITTORE', value='MAGENTA - S. VITTORE')

# load name-id table for quartieri
quartieriData = gpd.read_file('../../data/raw/Milano_specific/Milano_quartieri.geojson')

# join: look up ID_NIL for each row through its NIL name
nilToId = quartieriData[['NIL', 'ID_NIL']].set_index('NIL')
quartiereLabels = quartiereLabels.join(nilToId, on='NIL')
quartiereLabels = quartiereLabels.set_index(sezNameCol)

# A missing ID_NIL means a NIL name found no match in the name-id table.
assert not quartiereLabels['ID_NIL'].isnull().any(), 'Typos in NIL field'

In [None]:
# Attach the quartiere columns to each sezione, matching on the sezione id.
joinedData = joinedData.join(quartiereLabels, on=sezNameCol)
# Keep only sezioni that received a quartiere id, then store the id as int.
hasQuartiere = joinedData['ID_NIL'].notnull()
joinedData = joinedData[hasQuartiere]
joinedData['ID_NIL'] = joinedData['ID_NIL'].astype(int)
#joinedData.to_csv('test.csv')

In [None]:
# Notebook display of the quartiere label table for visual inspection.
quartiereLabels

In [None]:
# rename columns to match standard config
standardNames = {
    'NIL': commonCfg.quartiereDescColName,
    'ID_NIL': commonCfg.IdQuartiereColName,
}
joinedData.rename(columns=standardNames, inplace=True)


In [None]:
# Write the joined Milano table to disk as GeoJSON.
milanoSezioniPath = '../../data/processed/Milano_sezioni.geojson'
joinedData.to_file(milanoSezioniPath, driver='GeoJSON')

In [None]:
# check loading
# Read back from data/processed, the location the GeoJSON is written to;
# the previous '../../final/' path did not match the export location.
print(gpd.read_file('../../data/processed/Milano_sezioni.geojson'))