In [174]:
# Add your imports here
import pandas as pd
import numpy as np
import scipy as sp
import findspark
import os
findspark.init(os.environ['SPARK_HOME'])
from pyspark.sql import *
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

In [175]:
departements = (["{:0^3}".format(i) for i in range(1, 20)] +
               ["{:0^3}".format(i) for i in range(21, 96)] +
               # Corsica
               ["2A0", "2B0"] +
               # DOM-TOM
               ["{}".format(i) for i in range(971, 975)] + ["976"])

In [176]:
# Read all the departments-related sheets of the excel files
# The headers are split across rows 19 and 20.
ircom_communes = pd.read_excel("../data/raw/ircom_2017_revenus_2016.xlsx", header=[19, 20],
                               sheet_name=departements
                              )

In [177]:
def format_sheet(sheet, sheet_name):
    ircom_communes_processed = sheet
    ircom_communes_processed.columns = ['_'.join([column for column in col if not "Unnamed" in column])
                                            .strip().strip('_') 
                                        for col in ircom_communes_processed.columns.values
                                        ]
    return ircom_communes_processed


ircom_communes_aggregated = None
for sheet_name, sheet in ircom_communes.items():
    if ircom_communes_aggregated is not None:
        df = format_sheet(sheet, sheet_name)
        ircom_communes_aggregated = pd.concat([ircom_communes_aggregated, df])
    else:
        ircom_communes_aggregated = format_sheet(sheet, sheet_name)

ircom_communes_aggregated.head()

Unnamed: 0,Dép.,Commune,Libellé de la commune,Revenu fiscal de référence par tranche (en euros),Nombre de foyers fiscaux,Revenu fiscal de référence des foyers fiscaux,Impôt net (total)*,Nombre de foyers fiscaux imposés,Revenu fiscal de référence des foyers fiscaux imposés,Traitements et salaires_Nombre de foyers concernés,Traitements et salaires_Montant,Retraites et pensions_Nombre de foyers concernés,Retraites et pensions_Montant
,10,1,L'Abergement-Clémenciat,Total,413,12210.0,692.389,219,8917.02,280,8769.07,155,3802.81
,10,2,L'Abergement-de-Varey,Total,127,3866.62,196.849,67,2776.44,93,2825.99,42,1189.4
,10,4,Ambérieu-en-Bugey,0 à 10 000,1754,7411.82,-15.782,n.c.,n.c.,849,5426.35,467,3989.3
,10,4,Ambérieu-en-Bugey,10 001 à 12 000,497,5476.77,-9.969,n.c.,n.c.,318,4075.28,191,2516.25
,10,4,Ambérieu-en-Bugey,12 001 à 15 000,894,12154.1,-23.811,58,861.66,693,10835.4,227,3352.28


In [178]:
ircom_communes_aggregated = ircom_communes_aggregated[
    ircom_communes_aggregated['Revenu fiscal de référence par tranche (en euros)'] == 'Total'
]
ircom_communes_aggregated = ircom_communes_aggregated.drop(columns=['Impôt net (total)*', 'Traitements et salaires_Nombre de foyers concernés',
       'Traitements et salaires_Montant',
       'Retraites et pensions_Nombre de foyers concernés',
       'Revenu fiscal de référence des foyers fiscaux imposés',
       'Revenu fiscal de référence par tranche (en euros)',
       'Retraites et pensions_Montant'])
ircom_communes_aggregated.head()

Unnamed: 0,Dép.,Commune,Libellé de la commune,Nombre de foyers fiscaux,Revenu fiscal de référence des foyers fiscaux,Nombre de foyers fiscaux imposés
,10,1,L'Abergement-Clémenciat,413,12210.0,219
,10,2,L'Abergement-de-Varey,127,3866.62,67
,10,4,Ambérieu-en-Bugey,8186,194722.0,3505
,10,5,Ambérieux-en-Dombes,862,25561.9,442
,10,6,Ambléon,59,1755.41,32


In [179]:
ircom_communes_aggregated = ircom_communes_aggregated[ircom_communes_aggregated['Nombre de foyers fiscaux'] != "n.c."]

In [180]:
ircom_communes_aggregated = ircom_communes_aggregated[ircom_communes_aggregated['Revenu fiscal de référence des foyers fiscaux'] != "n.c."]
ircom_communes_aggregated = ircom_communes_aggregated[ircom_communes_aggregated['Nombre de foyers fiscaux imposés'] != "n.c."]

In [181]:
ircom_communes_aggregated['Nombre de foyers fiscaux'] = ircom_communes_aggregated['Nombre de foyers fiscaux'].astype('int')
ircom_communes_aggregated['Revenu fiscal de référence des foyers fiscaux'] = ircom_communes_aggregated['Revenu fiscal de référence des foyers fiscaux'].astype('float')
ircom_communes_aggregated['Nombre de foyers fiscaux imposés'] = ircom_communes_aggregated['Nombre de foyers fiscaux imposés'].astype('int')

In [182]:
ircom_communes_aggregated['Mean reference fiscal income (in k€)'] = ircom_communes_aggregated["Revenu fiscal de référence des foyers fiscaux"] / ircom_communes_aggregated['Nombre de foyers fiscaux']

In [183]:
ircom_communes_aggregated.head()

Unnamed: 0,Dép.,Commune,Libellé de la commune,Nombre de foyers fiscaux,Revenu fiscal de référence des foyers fiscaux,Nombre de foyers fiscaux imposés,Mean reference fiscal income (in k€)
,10,1,L'Abergement-Clémenciat,413,12209.992,219,29.564145
,10,2,L'Abergement-de-Varey,127,3866.621,67,30.445835
,10,4,Ambérieu-en-Bugey,8186,194722.047,3505,23.787203
,10,5,Ambérieux-en-Dombes,862,25561.917,442,29.654196
,10,6,Ambléon,59,1755.412,32,29.752746


In [184]:
# Helper to remove the accents
import unicodedata as ud

def remove_accents(input_str):
    nkfd_form = ud.normalize('NFKD', str(input_str))
    return u"".join([c for c in nkfd_form if not ud.combining(c)])

In [185]:
global_arrond = pd.read_csv("../data/processed/clean_food_cities_arrond.csv")
global_arrond.head()

Unnamed: 0.1,Unnamed: 0,food_item_index,nutrition-score-fr_100g,nutrition-score-uk_100g,nutrition_grade_numeric,serving_size,energy_100g,fat_100g,saturated-fat_100g,proteins_100g,carbohydrates_100g,sugars_100g,fiber_100g,Median revenue euros,Total poverty rate (%),Poverty rate (-30) (%),Poverty rate (30-39) (%),Poverty rate (40-49) (%),Poverty rate (50-59) (%),Poverty rate (60-74) (%),Poverty rate (75+) (%),Poverty rate (house owners) (%),Poverty rate (tenants) (%),Share of activity revenue (%),Share of retreat pension revenue (%),Share of heritage revenue and other (%),Share of social benefits revenue (%),Share of taxes (%),City name,Department,Region,custom_arrondissement_code,city_name,city_tag_from_food_item
0,0,362362,15.0,20.0,4,,1402.0,30.0,19.0,15.0,1.3,1.3,,18563.0,18.618075,,22.133096,22.284813,15.726548,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,Belley,1,84,1001,belley,belley-ain-france
1,1,362364,15.0,20.0,4,,1435.0,31.0,21.0,15.0,1.0,1.0,,18563.0,18.618075,,22.133096,22.284813,15.726548,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,Belley,1,84,1001,belley,belley-ain-france
2,2,362386,15.0,20.0,4,,1435.0,31.0,21.0,15.0,1.0,1.0,0.0,18563.0,18.618075,,22.133096,22.284813,15.726548,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,Belley,1,84,1001,belley,belley-ain-france
3,3,362366,14.0,19.0,4,,1238.0,24.0,18.0,19.0,1.0,1.0,,18563.0,18.618075,,22.133096,22.284813,15.726548,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,Belley,1,84,1001,belley,belley-ain-france
4,4,362389,14.0,19.0,4,150 g,1243.0,24.0,17.0,19.0,1.3,1.3,0.0,18563.0,18.618075,,22.133096,22.284813,15.726548,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,Belley,1,84,1001,belley,belley-ain-france


In [186]:
global_arrond = global_arrond.drop(columns=['Unnamed: 0'])
global_arrond = global_arrond.drop_duplicates()

In [187]:
global_arrond['Department'] = ["{:0^3}".format(d) for d in global_arrond['Department']]

In [188]:
ircom_communes_aggregated['city_tag'] = ircom_communes_aggregated['Libellé de la commune'].apply(
    lambda x: remove_accents(x.lower().replace("'", "-"))
)
ircom_communes_aggregated.head()

Unnamed: 0,Dép.,Commune,Libellé de la commune,Nombre de foyers fiscaux,Revenu fiscal de référence des foyers fiscaux,Nombre de foyers fiscaux imposés,Mean reference fiscal income (in k€),city_tag
,10,1,L'Abergement-Clémenciat,413,12209.992,219,29.564145,l-abergement-clemenciat
,10,2,L'Abergement-de-Varey,127,3866.621,67,30.445835,l-abergement-de-varey
,10,4,Ambérieu-en-Bugey,8186,194722.047,3505,23.787203,amberieu-en-bugey
,10,5,Ambérieux-en-Dombes,862,25561.917,442,29.654196,amberieux-en-dombes
,10,6,Ambléon,59,1755.412,32,29.752746,ambleon


In [189]:
global_arrond = pd.merge(ircom_communes_aggregated, global_arrond, left_on=['city_tag', 'Dép.'], right_on=['city_name', 'Department'])

In [190]:
pd.set_option('display.max_columns', 50)
global_arrond.head()

Unnamed: 0,Dép.,Commune,Libellé de la commune,Nombre de foyers fiscaux,Revenu fiscal de référence des foyers fiscaux,Nombre de foyers fiscaux imposés,Mean reference fiscal income (in k€),city_tag,food_item_index,nutrition-score-fr_100g,nutrition-score-uk_100g,nutrition_grade_numeric,serving_size,energy_100g,fat_100g,saturated-fat_100g,proteins_100g,carbohydrates_100g,sugars_100g,fiber_100g,Median revenue euros,Total poverty rate (%),Poverty rate (-30) (%),Poverty rate (30-39) (%),Poverty rate (40-49) (%),Poverty rate (50-59) (%),Poverty rate (60-74) (%),Poverty rate (75+) (%),Poverty rate (house owners) (%),Poverty rate (tenants) (%),Share of activity revenue (%),Share of retreat pension revenue (%),Share of heritage revenue and other (%),Share of social benefits revenue (%),Share of taxes (%),City name,Department,Region,custom_arrondissement_code,city_name,city_tag_from_food_item
0,2B0,58,Canari,184,3017.682,49,16.400446,canari,307386,25.0,25.0,5,31.2 g,2159.0,27.0,18.0,5.2,64.0,23.0,0.0,14962.0,,,,,,,,,,,,,,,Canari,2B0,94,2B002,canari,canari-corse-france
1,2B0,88,Chiatra,145,3047.807,46,21.019359,chiatra,485735,4.0,-3.0,3,,192.0,0.0,0.0,0.0,11.0,10.0,,19395.555556,,,,,,,,,,,,,,,Chiatra,2B0,94,2B003,chiatra,chiatra-corse-france
2,2B0,88,Chiatra,145,3047.807,46,21.019359,chiatra,485732,3.0,-3.0,3,,180.0,0.0,0.0,0.0,10.0,10.0,0.0,19395.555556,,,,,,,,,,,,,,,Chiatra,2B0,94,2B003,chiatra,chiatra-corse-france
3,2B0,123,Ghisonaccia,2190,47584.009,718,21.727858,ghisonaccia,335427,13.0,18.0,4,,1833.0,38.5,8.4,24.0,0.0,0.0,0.0,16595.5,28.895753,,,39.115411,,,,13.776435,44.707741,67.3,30.1,12.8,6.9,-17.1,Ghisonaccia,2B0,94,2B003,ghisonaccia,ghisonaccia-corse-france
4,2B0,167,Montegrosso,268,5397.784,101,20.140985,montegrosso,370573,14.0,14.0,4,,2000.0,19.4,1.69,6.9,67.5,28.8,,18541.333333,,,,,,,,,,,,,,,Montegrosso,2B0,94,2B005,montegrosso,montegrosso-corse-france


In [191]:
global_arrond.drop(columns=["Dép.", "Commune", "Libellé de la commune", "Revenu fiscal de référence des foyers fiscaux"],
                   inplace=True)
global_arrond.rename(columns={"Nombre de foyers fiscaux": "Fiscal household number", 
                              "Nombre de foyers fiscaux imposés": "Taxed households number",
                             })

Unnamed: 0,Fiscal household number,Taxed households number,Mean reference fiscal income (in k€),city_tag,food_item_index,nutrition-score-fr_100g,nutrition-score-uk_100g,nutrition_grade_numeric,serving_size,energy_100g,fat_100g,saturated-fat_100g,proteins_100g,carbohydrates_100g,sugars_100g,fiber_100g,Median revenue euros,Total poverty rate (%),Poverty rate (-30) (%),Poverty rate (30-39) (%),Poverty rate (40-49) (%),Poverty rate (50-59) (%),Poverty rate (60-74) (%),Poverty rate (75+) (%),Poverty rate (house owners) (%),Poverty rate (tenants) (%),Share of activity revenue (%),Share of retreat pension revenue (%),Share of heritage revenue and other (%),Share of social benefits revenue (%),Share of taxes (%),City name,Department,Region,custom_arrondissement_code,city_name,city_tag_from_food_item
0,184,49,16.400446,canari,307386,25.0,25.0,5,31.2 g,2159.0,27.0,18.0,5.2,64.0,23.0,0.0,14962.0,,,,,,,,,,,,,,,Canari,2B0,94,2B002,canari,canari-corse-france
1,145,46,21.019359,chiatra,485735,4.0,-3.0,3,,192.0,0.0,0.0,0.0,11.0,10.0,,19395.555556,,,,,,,,,,,,,,,Chiatra,2B0,94,2B003,chiatra,chiatra-corse-france
2,145,46,21.019359,chiatra,485732,3.0,-3.0,3,,180.0,0.0,0.0,0.0,10.0,10.0,0.0,19395.555556,,,,,,,,,,,,,,,Chiatra,2B0,94,2B003,chiatra,chiatra-corse-france
3,2190,718,21.727858,ghisonaccia,335427,13.0,18.0,4,,1833.0,38.5,8.4,24.0,0.0,0.0,0.0,16595.5,28.895753,,,39.115411,,,,13.776435,44.707741,67.3,30.1,12.8,6.9,-17.1,Ghisonaccia,2B0,94,2B003,ghisonaccia,ghisonaccia-corse-france
4,268,101,20.140985,montegrosso,370573,14.0,14.0,4,,2000.0,19.4,1.69,6.9,67.5,28.8,,18541.333333,,,,,,,,,,,,,,,Montegrosso,2B0,94,2B005,montegrosso,montegrosso-corse-france
5,117,58,26.911607,urtaca,393685,21.0,21.0,5,,1218.0,20.1,8.2,26.0,1.1,0.5,0.0,20603.846154,,,,,,,,,,,,,,,Urtaca,2B0,94,2B005,urtaca,urtaca-corse-france
6,117,58,26.911607,urtaca,335610,15.0,15.0,4,,933.0,9.4,3.8,33.7,0.8,0.4,,20603.846154,,,,,,,,,,,,,,,Urtaca,2B0,94,2B005,urtaca,urtaca-corse-france
7,117,58,26.911607,urtaca,420298,24.0,24.0,5,,1615.0,31.0,13.0,27.0,1.0,0.0,,20603.846154,,,,,,,,,,,,,,,Urtaca,2B0,94,2B005,urtaca,urtaca-corse-france
8,117,58,26.911607,urtaca,335609,21.0,21.0,5,,1211.0,20.1,8.2,26.3,1.1,0.5,,20603.846154,,,,,,,,,,,,,,,Urtaca,2B0,94,2B005,urtaca,urtaca-corse-france


In [193]:
global_arrond.to_csv("../data/processed/clean_food_cities_arrond_with_tax.csv")