In [1]:
# Add your imports here
import pandas as pd
import numpy as np
import scipy as sp
import findspark
import os
findspark.init(os.environ['SPARK_HOME'])
from pyspark.sql import *
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

%matplotlib inline

spark = SparkSession.builder.getOrCreate()


In [2]:
food_facts = spark.read.csv('../data/raw/en.openfoodfacts.org.products.csv', header=True, sep="\t")
food_facts.registerTempTable('food_facts')

In [3]:
food_facts.printSchema()

root
 |-- code: string (nullable = true)
 |-- url: string (nullable = true)
 |-- creator: string (nullable = true)
 |-- created_t: string (nullable = true)
 |-- created_datetime: string (nullable = true)
 |-- last_modified_t: string (nullable = true)
 |-- last_modified_datetime: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- generic_name: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- packaging: string (nullable = true)
 |-- packaging_tags: string (nullable = true)
 |-- brands: string (nullable = true)
 |-- brands_tags: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- categories_tags: string (nullable = true)
 |-- categories_en: string (nullable = true)
 |-- origins: string (nullable = true)
 |-- origins_tags: string (nullable = true)
 |-- manufacturing_places: string (nullable = true)
 |-- manufacturing_places_tags: string (nullable = true)
 |-- labels: string (nullable = true)
 |-- labels_tags: string (nullable 

# First exploration of the data

In [4]:
number_of_products = food_facts.count()
print(number_of_products)

689473


In [5]:
print("Number of distinct city tags without france: {}".format(
    food_facts.select('cities_tags').filter('cities_tags NOT LIKE "%france%"')
    .distinct().count())
)
print("Number of distinct city tags : {}".format(
    food_facts.select('cities_tags').distinct().count())
)

Number of distinct city tags without france: 3
Number of distinct city tags : 3322


In [6]:
print("Number of distinct purchase places without france in the name: {}".format(
    food_facts.select('purchase_places').filter('LOWER(purchase_places) NOT LIKE "%france%"')
    .distinct().count()))
print("Number of distinct purchase places : {}".format(
    food_facts.select('purchase_places').distinct().count()))

Number of distinct purchase places without france in the name: 3412
Number of distinct purchase places : 7733


In [7]:
print("Number of manufacturing places without france in the name: {}".format(
    food_facts.select('manufacturing_places_tags')
              .filter('LOWER(manufacturing_places_tags) NOT LIKE "%france%"')
              .distinct().count())
)
print("Number of manufacturing places : {}".format(
    food_facts.select('manufacturing_places_tags').distinct().count())
)

Number of manufacturing places without france in the name: 7501
Number of manufacturing places : 11866


It seems more interesting to make an analysis focused on the French situation rather than the situation in the US.

In [8]:
food_facts.select('generic_name').filter('generic_name IS NOT NULL') \
    .distinct().toPandas().head()

Unnamed: 0,generic_name
0,Banana & Berries - Banane et baies
1,Boisson au jus de cranberry
2,Bebida de almendras UHT con calcio y vitaminas
3,Cereal de maiz inflado endulzado
4,Base en pâte brisée avec du bœuf d'origine bri...


We'll probably have to filter out the names that are not present in French.

In [9]:
nutrition_grades = food_facts.select(
    'nutrition_grade_fr', 'nutrition_grade_uk', 'code', 'nutrition-score-fr_100g', 'nutrition-score-uk_100g'
    ).filter('nutrition_grade_fr IS NOT NULL OR nutrition_grade_uk IS NOT NULL OR `nutrition-score-uk_100g` IS NOT NULL OR `nutrition-score-fr_100g` IS NOT NULL').toPandas()
print("Total number of products for which we have nutrition informations: {}"
      .format(nutrition_grades.count())
)
print("Proportion of products for which we have nutrition informations: {ratio:.3%}"
      .format(ratio=(nutrition_grades['code'].count()/number_of_products))
)
print("Description of the table: ")
nutrition_grades.describe()

Total number of products for which we have nutrition informations: nutrition_grade_fr         138913
nutrition_grade_uk              0
code                       138913
nutrition-score-fr_100g    138913
nutrition-score-uk_100g    138913
dtype: int64
Proportion of products for which we have nutrition informations: 20.148%
Description of the table: 


Unnamed: 0,nutrition_grade_fr,nutrition_grade_uk,code,nutrition-score-fr_100g,nutrition-score-uk_100g
count,138913,0.0,138913,138913,138913
unique,5,0.0,138909,55,55
top,d,,3350033435445,0,0
freq,40532,,2,7222,10015


`nutrition_grade_uk` is useless.

## Data cleaning

## Food

In [10]:
# select only the columns we plan to use
food_dataframe = food_facts.select('generic_name',
                                   'code',
                                   'nutrition_grade_fr',
                                   'nutrition-score-fr_100g',
                                   'nutrition-score-uk_100g',
                                   'serving_size',
                                   'energy_100g',
                                   'energy-from-fat_100g', 
                                   'trans-fat_100g',
                                   'fat_100g',
                                   'saturated-fat_100g',
                                   'monounsaturated-fat_100g',
                                   'polyunsaturated-fat_100g',
                                   'cholesterol_100g',
                                   'proteins_100g',
                                   'carbohydrates_100g', 
                                   'sugars_100g', 
                                   'fiber_100g',
                                   'fruits-vegetables-nuts_100g',
                                   'fruits-vegetables-nuts-estimate_100g',
                                   'glycemic-index_100g',
                                   'cities',
                                   'cities_tags',
                                   'purchase_places',
                                   'stores',
                                   'countries',
                                   'countries_tags')
# move to pandas for the rest of the analysis
food_dataframe = food_dataframe.toPandas()

We first remove all products that are not sold in France.

In [11]:
food_dataframe = food_dataframe[np.logical_not(food_dataframe['countries_tags'].apply(
                                        lambda x: x is None or "france" not in x.lower())
               & food_dataframe['countries'].apply(lambda x: x is None or "france" not in x.lower())
               & food_dataframe['cities_tags'].apply(lambda x: x is None or "france" not in x.lower())
               & food_dataframe['cities'].apply(lambda x: x is None or "france" not in x.lower())
               & food_dataframe['purchase_places'].apply(lambda x: x is None or "france" not in x.lower())
               & food_dataframe['stores'].apply(lambda x: x is None or "france" not in x.lower()))]

Now, we want to check how many of our features have a significant amount of entries.

In [12]:
total_rows = len(food_dataframe)
for col in list(food_dataframe.columns):
    none_values = np.count_nonzero(food_dataframe[col].apply(lambda x: x is None))
    not_none_percentage = (total_rows - none_values) / total_rows
    print('Rows that are **not** None in {col}: {p:.3%}'.format(col=col, p=not_none_percentage))

Rows that are **not** None in generic_name: 12.576%
Rows that are **not** None in code: 100.000%
Rows that are **not** None in nutrition_grade_fr: 26.282%
Rows that are **not** None in nutrition-score-fr_100g: 26.282%
Rows that are **not** None in nutrition-score-uk_100g: 26.282%
Rows that are **not** None in serving_size: 11.084%
Rows that are **not** None in energy_100g: 86.363%
Rows that are **not** None in energy-from-fat_100g: 0.024%
Rows that are **not** None in trans-fat_100g: 0.308%
Rows that are **not** None in fat_100g: 85.263%
Rows that are **not** None in saturated-fat_100g: 86.017%
Rows that are **not** None in monounsaturated-fat_100g: 0.433%
Rows that are **not** None in polyunsaturated-fat_100g: 0.440%
Rows that are **not** None in cholesterol_100g: 0.300%
Rows that are **not** None in proteins_100g: 86.294%
Rows that are **not** None in carbohydrates_100g: 85.196%
Rows that are **not** None in sugars_100g: 86.043%
Rows that are **not** None in fiber_100g: 25.606%
Rows 

**Based on this, we will have to remove some rows:**

We remove the rows for which we have less than 10% of not None values.

- energy-from-fat_100g
- monounsaturated-fat_100g
- polyunsaturated-fat_100g
- cholesterol_100g
- trans-fat_100g
- fruits-vegetables-nuts_100g
- fruits-vegetables-nuts-estimate_100g
- glycemic-index_100g
- cities
- generic_name

Since we have the code of the product, we drop the generic_name attribute too.

We also need to drop all rows for which we have no information whatsoever (nutrition_grade, fat, saturated fat, sugars, proteins, fiber, carbohydrates, energy).

Finally, we need to drop all rows for which we have no information on the city where it is sold.

In [13]:
# drop the columns with too few values
food_dataframe = food_dataframe.drop(columns=['energy-from-fat_100g', 
                             'monounsaturated-fat_100g', 
                             'polyunsaturated-fat_100g',
                             'cholesterol_100g',
                             'trans-fat_100g',
                             'fruits-vegetables-nuts_100g',
                             'fruits-vegetables-nuts-estimate_100g',
                             'glycemic-index_100g',
                             'glycemic-index_100g',
                             'cities',
                             'generic_name'])

# drop the rows with too little nutritional information
food_dataframe = food_dataframe[np.logical_not(food_dataframe['nutrition_grade_fr'].apply(lambda x: x is None)
                                               # we do not check the values of nutrition-score-fr_100g or
                                               # nutrition-score-uk_100g, because we know that they exists for exactly
                                               # the same rows as nutrition_grade_fr
                                               & food_dataframe['energy_100g'].apply(lambda x: x is None) 
                                               & food_dataframe['fat_100g'].apply(lambda x: x is None) 
                                               & food_dataframe['saturated-fat_100g'].apply(lambda x: x is None) 
                                               & food_dataframe['carbohydrates_100g'].apply(lambda x: x is None) 
                                               & food_dataframe['sugars_100g'].apply(lambda x: x is None) 
                                               & food_dataframe['fiber_100g'].apply(lambda x: x is None) 
                                               & food_dataframe['proteins_100g'].apply(lambda x: x is None)
                                              )
                               ]

# drop the rows with no purchase place
food_dataframe = food_dataframe[food_dataframe['purchase_places'].apply(lambda x: x is not None)]

In [14]:
food_dataframe.head()

Unnamed: 0,code,nutrition_grade_fr,nutrition-score-fr_100g,nutrition-score-uk_100g,serving_size,energy_100g,fat_100g,saturated-fat_100g,proteins_100g,carbohydrates_100g,sugars_100g,fiber_100g,cities_tags,purchase_places,stores,countries,countries_tags
106,20114,,,,16 ml,0,0.0,0.0,0.0,0.0,0.0,,,France,,en:france,en:france
251,274722,b,0.0,0.0,285 g,450,2.2,0.9,6.8,15.3,0.5,0.5,,France,Comme J'aime,France,en:france
252,274739,b,1.0,1.0,300 g,455,4.2,2.1,4.4,12.5,1.4,1.8,,France,,France,en:france
289,7020254,e,12.0,4.0,33 cl,213,0.0,0.0,0.0,14.0,0.0,0.0,,"France,États-Unis",,"France,États-Unis","en:france,en:united-states"
298,9125124,b,1.0,1.0,40 g + 100 ml d'eau,1577,0.5,0.1,1.5,93.0,66.0,,,France,,France,en:france


In [15]:
total_rows = len(food_dataframe)
for col in list(food_dataframe.columns):
    none_values = np.count_nonzero(food_dataframe[col].apply(lambda x: x is None))
    not_none_percentage = (total_rows - none_values) / total_rows
    print('Rows that are **not** None in {col}: {p:.3%}'.format(col=col, p=not_none_percentage))

Rows that are **not** None in code: 100.000%
Rows that are **not** None in nutrition_grade_fr: 89.740%
Rows that are **not** None in nutrition-score-fr_100g: 89.740%
Rows that are **not** None in nutrition-score-uk_100g: 89.740%
Rows that are **not** None in serving_size: 49.844%
Rows that are **not** None in energy_100g: 99.521%
Rows that are **not** None in fat_100g: 99.089%
Rows that are **not** None in saturated-fat_100g: 97.997%
Rows that are **not** None in proteins_100g: 99.252%
Rows that are **not** None in carbohydrates_100g: 98.723%
Rows that are **not** None in sugars_100g: 98.078%
Rows that are **not** None in fiber_100g: 54.712%
Rows that are **not** None in cities_tags: 30.553%
Rows that are **not** None in purchase_places: 100.000%
Rows that are **not** None in stores: 75.713%
Rows that are **not** None in countries: 99.981%
Rows that are **not** None in countries_tags: 99.981%


We still have rows for which the city tag is none and the purchase place is very vague, like France or the US. We need to take care of those. We will do it by creating a matching between the Open Food Facts dataset and a dataset including all cities of France.

## Cities

In [16]:
# The dataframe: 
# https://www.insee.fr/fr/statistiques/3126432

revenue_df = pd.read_excel("../data/raw/base-cc-filosofi-2014.xls", skiprows=[0,1,2,3])

# Drop rows that are not data and reset index
revenue_df.drop(0, inplace=True)
revenue_df.reset_index(inplace=True, drop=True)


# Drop useless columns
revenue_df = revenue_df.drop(columns=['Nombre de ménages fiscaux','Code géographique',
                                      'dont part des salaires, traitements ou chômage (%)',
                                      "dont part des revenus d'activités non salariées (%)",
                                      'dont part des prestations familiales (%)',
                                      'dont part des minima sociaux (%)',
                                      'dont part des prestations logement (%)',
                                      '1er décile du niveau de vie (€)',
                                      '9e décile du niveau de vie (€)',
                                      'Rapport inter-décile 9e décile/1er decile',
                                      'Part des ménages fiscaux imposés (%)'
                                     ]
                            )

# Translate the remaining columns to English
revenue_df = revenue_df.rename(columns={"ANNEE" : "Year",
                                        'Nombre de personnes dans les ménages fiscaux': "Household inhabitants", 
                                        'Médiane du niveau vie (€)': "Median revenue euros", 
                                        'Taux de pauvreté-Ensemble (%)' : "Total poverty rate (%)", 
                                        'Taux de pauvreté-moins de 30 ans (%)' : "Poverty rate (-30) (%)",
                                        'Taux de pauvreté-30 à 39 ans  (%)' : "Poverty rate (30-39) (%)",
                                        'Taux de pauvreté-40 à 49 ans (%)': "Poverty rate (40-49) (%)", 
                                        "Taux de pauvreté-50 à 59 ans (%)" :  "Poverty rate (50-59) (%)", 
                                        "Taux de pauvreté-60 à 74 ans (%)" :  "Poverty rate (60-74) (%)", 
                                        "Taux de pauvreté-75 ans ou plus (%)":  "Poverty rate (75+) (%)", 
                                        "Taux de pauvreté-propriétaires (%)" :  "Poverty rate (house owners) (%)", 
                                        "Taux de pauvreté-locataires (%)" :  "Poverty rate (tenants) (%)",
                                        "Part des revenus d'activité (%)" :  "Share of activity revenue (%)", 
                                        'Part des pensions, retraites et rentes (%)' :  "Share of retreat pension revenue (%)", 
                                        'Part des revenus du patrimoine et autres revenus (%)' :  "Share of heritage revenue and other (%)",  
                                        "Part de l'ensemble des prestations sociales (%)" :  "Share of social benefits revenue (%)", 
                                        'Part des impôts (%)' :  "Share of taxes (%)",
                                        'Libellé géographique' : "City name"
                                       }
                              )

In [17]:
revenue_df.head()

Unnamed: 0,City name,Household inhabitants,Median revenue euros,Total poverty rate (%),Poverty rate (-30) (%),Poverty rate (30-39) (%),Poverty rate (40-49) (%),Poverty rate (50-59) (%),Poverty rate (60-74) (%),Poverty rate (75+) (%),Poverty rate (house owners) (%),Poverty rate (tenants) (%),Share of activity revenue (%),Share of retreat pension revenue (%),Share of heritage revenue and other (%),Share of social benefits revenue (%),Share of taxes (%)
0,L'Abergement-Clémenciat,799.5,21576.7,,,,,,,,,,,,,,
1,L'Abergement-de-Varey,235.5,21672.9,,,,,,,,,,,,,,
2,Ambérieu-en-Bugey,13660.5,19756.1,15.7534,19.4181,19.5204,19.1982,14.7159,,,5.40116,24.796,71.8,27.3,10.1,6.5,-15.7
3,Ambérieux-en-Dombes,1661.5,23204.8,,,,,,,,,,,,,,
4,Ambléon,102.0,22157.5,,,,,,,,,,,,,,


In [18]:
print("Total number of rows:", len(revenue_df))

Total number of rows: 36621


In [19]:
print("Total number of cities: ", len(list(set(revenue_df["City name"]))))

Total number of cities:  34099


In [20]:
len(food_dataframe)

68305

# Mapping cities - food products

In [21]:
# Helper to remove the accents
import unicodedata as ud

def remove_accents(input_str):
    nkfd_form = ud.normalize('NFKD', str(input_str))
    return u"".join([c for c in nkfd_form if not ud.combining(c)])

In [22]:
# Try to find the intersection between the 2 datasets
# First, let us remove the accents
food_dataframe['purchase_places'] = food_dataframe['purchase_places'].apply(lambda x: remove_accents(x)) 
revenue_df['City name'] = revenue_df['City name'].apply(lambda x: remove_accents(x)) 


In [23]:
revenue_df.head()

Unnamed: 0,City name,Household inhabitants,Median revenue euros,Total poverty rate (%),Poverty rate (-30) (%),Poverty rate (30-39) (%),Poverty rate (40-49) (%),Poverty rate (50-59) (%),Poverty rate (60-74) (%),Poverty rate (75+) (%),Poverty rate (house owners) (%),Poverty rate (tenants) (%),Share of activity revenue (%),Share of retreat pension revenue (%),Share of heritage revenue and other (%),Share of social benefits revenue (%),Share of taxes (%)
0,L'Abergement-Clemenciat,799.5,21576.7,,,,,,,,,,,,,,
1,L'Abergement-de-Varey,235.5,21672.9,,,,,,,,,,,,,,
2,Amberieu-en-Bugey,13660.5,19756.1,15.7534,19.4181,19.5204,19.1982,14.7159,,,5.40116,24.796,71.8,27.3,10.1,6.5,-15.7
3,Amberieux-en-Dombes,1661.5,23204.8,,,,,,,,,,,,,,
4,Ambleon,102.0,22157.5,,,,,,,,,,,,,,


In [24]:
# get the tags of the cities per food item:
# until now, the tags were a comma separated list of tags
# we first split them
cities_for_food = (food_dataframe['cities_tags'].str.split(',', expand=True)
                    # then we create one entry per couple (food item, city tag)
                    .stack()
                    # we remove the index, as we will need to keep the column containing the id of the food item
                    .reset_index()
                    # and we drop duplicates and unnecessary columns
                    .drop(columns='level_1').drop_duplicates())

In [25]:
# in the income dataframe, we still have apostrophes which are replaced in the tags by a caret
# thus, we replace them in the cities name as well.
city_names = revenue_df["City name"].str.lower().str.replace("'", "-").reset_index()

In [26]:
# we move back to spark, because pandas is too weak
sdf_city_names = spark.createDataFrame(city_names)
sdf_cities_for_food = spark.createDataFrame(cities_for_food)
sdf_cities_for_food.createTempView('cities_for_food')
sdf_city_names.createTempView('city_names')

In [27]:
sdf_cities_for_food.show()

+-------+--------------------+
|level_0|                   0|
+-------+--------------------+
|  18598|saint-didier-au-m...|
|  21780|donzere-drome-france|
|  38067|clecy-calvados-fr...|
|  71039|saint-alban-les-e...|
|  96951| vergeze-gard-france|
| 119969|chantonnay-vendee...|
| 122774|sainte-livrade-su...|
| 122781|foucarmont-seine-...|
| 123192|mesnil-en-vallee-...|
| 123199|saint-pee-sur-niv...|
| 123362| ducey-manche-france|
| 123636|plelo-cotes-d-arm...|
| 123849|isigny-sur-mer-ca...|
| 123932|saint-martin-des-...|
| 124526|annot-alpes-de-ha...|
| 125034|avoudrey-doubs-fr...|
| 125442|larceveau-arros-c...|
| 125629|villaz-haute-savo...|
| 125658|brignais-rhone-fr...|
| 125660|brignais-rhone-fr...|
+-------+--------------------+
only showing top 20 rows



In [28]:
# we join our two dataframes, explanation below
sdf_joined = spark.sql("""
SELECT city_names.index AS city_index, city_names.`City name` AS city_name, 
       aux.food_item_index, aux.city_tag_from_food_item
FROM city_names
JOIN (
    SELECT MAX(LENGTH(city_names.`City name`)) AS length_city_name, 
           cities_for_food.level_0 AS food_item_index, cities_for_food.`0` AS city_tag_from_food_item
    FROM cities_for_food
    JOIN city_names
    ON cities_for_food.`0` LIKE CONCAT(city_names.`City name`, '%')
    GROUP BY cities_for_food.level_0, cities_for_food.`0`
) AS aux
ON aux.city_tag_from_food_item LIKE CONCAT(city_names.`City name`, '%')
WHERE aux.length_city_name == LENGTH(city_names.`City name`)
""")

A city's tag seems to usually be 'city-s-name-department-name-country-name'.
A city's name is now 'city-s-name'.

Thus we can join on the condition: `city_s_tag LIKE city_s_name + '%'`.

Unfortunately, the city's name "Saint Alban", formatted as "saint-alban", will also match the tag "saint-alban-les-eaux", though these two cities maybe totally different and far away from one another.

Thus, we want to kep only the longest city's name matching the tag, hence:
```sql
SELECT MAX(LENGTH(city_names.`City name`)) AS length_city_name, 
           cities_for_food.level_0 AS food_item_index, cities_for_food.`0` AS city_tag_from_food_item
    FROM cities_for_food
    JOIN city_names
    ON cities_for_food.`0` LIKE CONCAT(city_names.`City name`, '%')
    GROUP BY cities_for_food.level_0, cities_for_food.`0`
```
returning the size of the longest matching city name.

We then have to re-do the same join and filter using the computed condition.

In [29]:
sdf_joined.show()

+----------+--------------------+---------------+-----------------------+
|city_index|           city_name|food_item_index|city_tag_from_food_item|
+----------+--------------------+---------------+-----------------------+
|      4315|sainte-genevieve-...|         469071|   sainte-genevieve-...|
|      4315|sainte-genevieve-...|         661077|   sainte-genevieve-...|
|      4315|sainte-genevieve-...|         469062|   sainte-genevieve-...|
|      4315|sainte-genevieve-...|         435033|   sainte-genevieve-...|
|      4315|sainte-genevieve-...|         469068|   sainte-genevieve-...|
|      8649|saint-barthelemy-...|         395433|   saint-barthelemy-...|
|      8649|saint-barthelemy-...|         395422|   saint-barthelemy-...|
|      8649|saint-barthelemy-...|         289381|   saint-barthelemy-...|
|      8649|saint-barthelemy-...|         395428|   saint-barthelemy-...|
|      8649|saint-barthelemy-...|         395431|   saint-barthelemy-...|
|      8649|saint-barthelemy-...|     

In [30]:
sdf_joined.write.mode('overwrite').parquet("../data/interim/sdf_joined_city_names.parquet")

In [31]:
sdf_joined.orderBy('food_item_index').show()

+----------+--------------------+---------------+-----------------------+
|city_index|           city_name|food_item_index|city_tag_from_food_item|
+----------+--------------------+---------------+-----------------------+
|     28581|saint-didier-au-m...|          18598|   saint-didier-au-m...|
|      9550|             donzere|          21780|   donzere-drome-france|
|      4680|               clecy|          38067|   clecy-calvados-fr...|
|     16579|saint-alban-les-eaux|          71039|   saint-alban-les-e...|
|     11863|             vergeze|          96951|    vergeze-gard-france|
|     34272|          chantonnay|         119969|   chantonnay-vendee...|
|     18116|sainte-livrade-su...|         122774|   sainte-livrade-su...|
|     31068|          foucarmont|         122781|   foucarmont-seine-...|
|     26727|saint-pee-sur-niv...|         123199|   saint-pee-sur-niv...|
|     18885|               ducey|         123362|    ducey-manche-france|
|      7828|               plelo|     

In [32]:
sdf_joined = spark.read.parquet("../data/interim/sdf_joined_city_names.parquet")

In [33]:
# now we still have to join our food items and our cities, using the mapping we managed to get above
# we switch back to pandas
pdf_joined = sdf_joined.toPandas()
pdf_joined.head()

Unnamed: 0,city_index,city_name,food_item_index,city_tag_from_food_item
0,30,belley,359180,belley-ain-france
1,30,belley,359153,belley-ain-france
2,30,belley,359177,belley-ain-france
3,30,belley,359157,belley-ain-france
4,30,belley,359155,belley-ain-france


In [34]:
food_dataframe.head()

Unnamed: 0,code,nutrition_grade_fr,nutrition-score-fr_100g,nutrition-score-uk_100g,serving_size,energy_100g,fat_100g,saturated-fat_100g,proteins_100g,carbohydrates_100g,sugars_100g,fiber_100g,cities_tags,purchase_places,stores,countries,countries_tags
106,20114,,,,16 ml,0,0.0,0.0,0.0,0.0,0.0,,,France,,en:france,en:france
251,274722,b,0.0,0.0,285 g,450,2.2,0.9,6.8,15.3,0.5,0.5,,France,Comme J'aime,France,en:france
252,274739,b,1.0,1.0,300 g,455,4.2,2.1,4.4,12.5,1.4,1.8,,France,,France,en:france
289,7020254,e,12.0,4.0,33 cl,213,0.0,0.0,0.0,14.0,0.0,0.0,,"France,Etats-Unis",,"France,États-Unis","en:france,en:united-states"
298,9125124,b,1.0,1.0,40 g + 100 ml d'eau,1577,0.5,0.1,1.5,93.0,66.0,,,France,,France,en:france


In [35]:
# we join our mapping with the food dataframe
food_df_for_join = pdf_joined.join(food_dataframe, on="food_item_index")
food_df_for_join.head()

Unnamed: 0,city_index,city_name,food_item_index,city_tag_from_food_item,code,nutrition_grade_fr,nutrition-score-fr_100g,nutrition-score-uk_100g,serving_size,energy_100g,...,saturated-fat_100g,proteins_100g,carbohydrates_100g,sugars_100g,fiber_100g,cities_tags,purchase_places,stores,countries,countries_tags
0,30,belley,359180,belley-ain-france,3307906000064,d,14,19,150 g,1243,...,17,19,1.3,1.3,0.0,belley-ain-france,"Miribel,France",Carrefour market,Francia,en:france
1,30,belley,359153,belley-ain-france,3307902060086,d,15,20,,1402,...,19,15,1.3,1.3,,belley-ain-france,"Saint-Loup,France",Leclerc,en:france,en:france
2,30,belley,359177,belley-ain-france,3307905810084,d,15,20,,1435,...,21,15,1.0,1.0,0.0,belley-ain-france,"Paris,France","Franprix, Magasins U",en:france,en:france
3,30,belley,359157,belley-ain-france,3307904400125,d,14,19,,1238,...,18,19,1.0,1.0,,belley-ain-france,France,Leclerc,"France,Switzerland","en:france,en:switzerland"
4,30,belley,359155,belley-ain-france,3307903690121,d,15,20,,1435,...,21,15,1.0,1.0,,belley-ain-france,France,Auchan,France,en:france


In [36]:
# we join again on the income dataframe
global_df = food_df_for_join.join(revenue_df, on="city_index")
# we can now drop the intermediary columns: city_name, and cities_tags
global_df.drop(columns=['city_name', 'cities_tags'], inplace=True)
global_df.head()

Unnamed: 0,city_index,food_item_index,city_tag_from_food_item,code,nutrition_grade_fr,nutrition-score-fr_100g,nutrition-score-uk_100g,serving_size,energy_100g,fat_100g,...,Poverty rate (50-59) (%),Poverty rate (60-74) (%),Poverty rate (75+) (%),Poverty rate (house owners) (%),Poverty rate (tenants) (%),Share of activity revenue (%),Share of retreat pension revenue (%),Share of heritage revenue and other (%),Share of social benefits revenue (%),Share of taxes (%)
0,30,359180,belley-ain-france,3307906000064,d,14,19,150 g,1243,24,...,15.7265,16.9492,,6.20557,29.6095,65.9,31.2,11.1,7.1,-15.3
1,30,359153,belley-ain-france,3307902060086,d,15,20,,1402,30,...,15.7265,16.9492,,6.20557,29.6095,65.9,31.2,11.1,7.1,-15.3
2,30,359177,belley-ain-france,3307905810084,d,15,20,,1435,31,...,15.7265,16.9492,,6.20557,29.6095,65.9,31.2,11.1,7.1,-15.3
3,30,359157,belley-ain-france,3307904400125,d,14,19,,1238,24,...,15.7265,16.9492,,6.20557,29.6095,65.9,31.2,11.1,7.1,-15.3
4,30,359155,belley-ain-france,3307903690121,d,15,20,,1435,31,...,15.7265,16.9492,,6.20557,29.6095,65.9,31.2,11.1,7.1,-15.3


In [37]:
# we cannot use the nutrition_grade as is. Thus we turn it into a numerical value
def grade_to_number(x):
    if x == "a":
        return 1
    if x == "b":
        return 2
    if x == "c":
        return 3
    if x == "d":
        return 4
    if x == "e":
        return 5
    return 0

In [38]:
global_df['nutrition_grade_numeric'] = global_df['nutrition_grade_fr'].apply(grade_to_number).astype('int')

In [39]:
global_df.to_csv("../data/processed/clean_food_and_cities.csv")

# Stores

In [40]:
global_df = pd.read_csv('../data/processed/clean_food_and_cities.csv')

In [41]:
global_df.drop(columns="Unnamed: 0", inplace=True)
global_df.head()

Unnamed: 0,city_index,food_item_index,city_tag_from_food_item,code,nutrition_grade_fr,nutrition-score-fr_100g,nutrition-score-uk_100g,serving_size,energy_100g,fat_100g,...,Poverty rate (60-74) (%),Poverty rate (75+) (%),Poverty rate (house owners) (%),Poverty rate (tenants) (%),Share of activity revenue (%),Share of retreat pension revenue (%),Share of heritage revenue and other (%),Share of social benefits revenue (%),Share of taxes (%),nutrition_grade_numeric
0,30,359180,belley-ain-france,3307906000064,d,14.0,19.0,150 g,1243.0,24.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4
1,30,359153,belley-ain-france,3307902060086,d,15.0,20.0,,1402.0,30.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4
2,30,359177,belley-ain-france,3307905810084,d,15.0,20.0,,1435.0,31.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4
3,30,359157,belley-ain-france,3307904400125,d,14.0,19.0,,1238.0,24.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4
4,30,359155,belley-ain-france,3307903690121,d,15.0,20.0,,1435.0,31.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4


In [42]:
global_df['stores'] = global_df['stores'].astype('str')

In [43]:
stores_series = global_df['stores'].apply(lambda x: remove_accents(x.lower()) if x is not None else None)
# get the name of the store per food item:
# until now, the stores were a comma separated list of stores
# we first split them
stores_for_food = (stores_series.str.split(',', expand=True)
                    # then we create one entry per couple (food item, store)
                    .stack()
                    # we remove the index, as we will need to keep the column containing the id of the food item
                    .reset_index()
                    # and we drop duplicates and unnecessary columns
                    .drop(columns='level_1'))
stores_for_food[0] = stores_for_food[0].str.strip()
stores_for_food.drop_duplicates(inplace=True)

In [44]:
food_stores_formatted = stores_for_food.join(global_df, on='level_0').rename(
    columns={0: "store_name", "level_0": "index"}).set_index("index")
food_stores_formatted.head()

Unnamed: 0_level_0,store_name,city_index,food_item_index,city_tag_from_food_item,code,nutrition_grade_fr,nutrition-score-fr_100g,nutrition-score-uk_100g,serving_size,energy_100g,...,Poverty rate (60-74) (%),Poverty rate (75+) (%),Poverty rate (house owners) (%),Poverty rate (tenants) (%),Share of activity revenue (%),Share of retreat pension revenue (%),Share of heritage revenue and other (%),Share of social benefits revenue (%),Share of taxes (%),nutrition_grade_numeric
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,carrefour market,30,359180,belley-ain-france,3307906000064,d,14.0,19.0,150 g,1243.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4
1,leclerc,30,359153,belley-ain-france,3307902060086,d,15.0,20.0,,1402.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4
2,franprix,30,359177,belley-ain-france,3307905810084,d,15.0,20.0,,1435.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4
2,magasins u,30,359177,belley-ain-france,3307905810084,d,15.0,20.0,,1435.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4
3,leclerc,30,359157,belley-ain-france,3307904400125,d,14.0,19.0,,1238.0,...,16.949153,,6.205567,29.609475,65.9,31.2,11.1,7.1,-15.3,4


In [45]:
food_stores_formatted['nutrition_grade_numeric'] = food_stores_formatted['nutrition_grade_fr'].apply(grade_to_number).astype('int')
food_stores_formatted.to_csv("../data/processed/clean_food_and_stores.csv")

In [46]:
global_df['city_tag_from_food_item'].drop_duplicates()

0                                        belley-ain-france
5                                        nantua-ain-france
9                                        servas-ain-france
24                                       viriat-ain-france
69                                     braine-aisne-france
81                                      fleury-oise-france
82                                    abrest-allier-france
83                   angles-alpes-de-haute-provence-france
84                              neffes-hautes-alpes-france
88                           carros-alpes-maritimes-france
107                                  arcens-ardeche-france
109                                  meyras-ardeche-france
110                                 mouzon-charente-france
111                                tailly-cote-d-or-france
112                                   moulis-ariege-france
113                                    verdun-meuse-france
147                                     troyes-aube-fran

# Arrondissements

In order to visualize data, we need to make the correspondences between the cities that we have and their respective arrondissement. The arrondissement is the territorial subdivision of the French territory that comes after the Département. There are around 340 arrondissements in France. We thought this subdivision was small enough to show the differences across the French territory but large enough for it to be visible on a map.

In [47]:
global_df = pd.read_csv('../data/processed/clean_food_and_cities.csv')

In [48]:
# The new dataframe:
# https://www.insee.fr/fr/information/2028028
# Table d'appartenance géographique des communes au 1ᵉʳ janvier 2017

city_arr = pd.read_excel('../data/raw/table-appartenance-geo-communes-17.xls', skiprows=[0,1,2,3])
city_arr.drop(0, inplace=True)
city_arr.reset_index(inplace=True, drop=True)

# Remove accents
city_arr['Libellé géographique'] = city_arr['Libellé géographique'].apply(lambda x: remove_accents(x)) 
# Replace ' ' by '-'
city_arr['Libellé géographique'] = city_arr['Libellé géographique'].apply(lambda x: x.replace(' ', '-'))

# Drop useless columns
city_arr = city_arr.drop(columns=['Intercommunalité - Métropole', "Nature d'EPCI", "Zone d'emploi 2010", 
                       "Unité urbaine 2010", "Tranche d'unité urbaine 2014", 
                       "Tranche détaillée d'unité urbaine 2014", "Aire urbaine 2010", 
                       "Tranche d'aire urbaine 2014", "Bassin de vie 2012"]
             )
city_arr.rename(columns={'Libellé géographique' : 'City name'}, inplace=True)
city_arr.head()

Unnamed: 0,Code géographique,City name,Département,Région,Arrondissement,Canton ville,Catégorie commune dans aires urbaines
0,1001,L'Abergement-Clemenciat,1,84,12,108,120
1,1002,L'Abergement-de-Varey,1,84,11,101,112
2,1004,Amberieu-en-Bugey,1,84,11,101,112
3,1005,Amberieux-en-Dombes,1,84,12,122,112
4,1006,Ambleon,1,84,11,104,300


In [49]:
city_arr["custom_arrondissement_code"] = city_arr["Département"] + "00" + city_arr['Arrondissement'].astype('str').apply(lambda x: x[-1])
city_arr.head()

Unnamed: 0,Code géographique,City name,Département,Région,Arrondissement,Canton ville,Catégorie commune dans aires urbaines,custom_arrondissement_code
0,1001,L'Abergement-Clemenciat,1,84,12,108,120,1002
1,1002,L'Abergement-de-Varey,1,84,11,101,112,1001
2,1004,Amberieu-en-Bugey,1,84,11,101,112,1001
3,1005,Amberieux-en-Dombes,1,84,12,122,112,1002
4,1006,Ambleon,1,84,11,104,300,1001


In [50]:
city_arr.to_csv("../data/processed/city_region_arrondissement.csv")

In [51]:
global_arrond = global_df.merge(city_arr, how='inner')

In [None]:
global_arrond = global_arrond.rename(columns={"Département" : "Department", "Région" : "Region"})
global_arrond = global_arrond[['food_item_index', 'nutrition-score-fr_100g', 'nutrition-score-uk_100g',
                       'nutrition_grade_numeric', 'serving_size', 'energy_100g', 'fat_100g', 
                       'saturated-fat_100g', 'proteins_100g', 'carbohydrates_100g', 'sugars_100g', 
                       'fiber_100g', 'Median revenue euros', 'Total poverty rate (%)', 'Poverty rate (-30) (%)',
                       'Poverty rate (30-39) (%)', 'Poverty rate (40-49) (%)',
                       'Poverty rate (50-59) (%)', 'Poverty rate (60-74) (%)',
                       'Poverty rate (75+) (%)', 'Poverty rate (house owners) (%)',
                       'Poverty rate (tenants) (%)', 'Share of activity revenue (%)',
                       'Share of retreat pension revenue (%)',
                       'Share of heritage revenue and other (%)',
                       'Share of social benefits revenue (%)', 'Share of taxes (%)', 'City name', 
                       'Department', 'Region', 'custom_arrondissement_code']]

In [52]:
global_arrond.to_csv("../data/processed/clean_food_cities_arrond.csv")