In [1]:
import pandas as pd
import numpy as np
import scipy as scp
import geopy # pip install geopy if necessary

In [2]:
# Load the tab-separated products table; low_memory=False parses the file in
# one pass so column dtypes are inferred from the whole column.
data = pd.read_csv(
    '../openfoodfacts.csv',
    sep='\t',
    low_memory=False,
)

In [3]:
# Report the size of the raw table as (rows, columns).
print(tuple(data.shape))

(665697, 174)


# Data for distance computation

In [4]:
# Keep the products that carry at least one location hint: an origins tag or a
# manufacturing-places tag. 'first_packaging_code_geo' is retained as a column
# below but is not used as a filter criterion.
b = data["origins_tags"].notna()
c = data["manufacturing_places_tags"].notna()
sele = b | c  # row counts noted by the original author: 57688 (88001)

# Columns needed downstream for the distance computation.
sel = [
    'categories_en',
    'main_category_en',
    'labels_en',
    'origins_tags',
    'manufacturing_places_tags',
    'first_packaging_code_geo',
    'countries_en',
]

# Row filter and column projection in a single .loc call.
data_loc = data.loc[sele, sel]
data_loc

Unnamed: 0,categories_en,main_category_en,labels_en,origins_tags,manufacturing_places_tags,first_packaging_code_geo,countries_en
96,Dietary supplements,Dietary supplements,Made in France,,france,,France
238,"Meals,Meat-based products,Meals with meat,Micr...",Meals,"fr:Viande Française,Made in France",france,france,,France
241,"Plant-based foods and beverages,Plant-based fo...",Plant-based foods and beverages,,quebec,brossard-quebec,,Canada
254,"Sugary snacks,Biscuits and cakes,Pastries",Sugary snacks,,quebec,brossard-quebec,,Canada
259,"Plant-based foods and beverages,Plant-based fo...",Plant-based foods and beverages,,france,united-kingdom,,United Kingdom
264,"Sugary snacks,Biscuits and cakes,Pastries",Sugary snacks,,quebec,brossard-quebec,,Canada
276,fr:Boulange,fr:Boulange,,quebec,"brossard,quebec",,Canada
280,"Beverages,Carbonated drinks,Sodas,Sugared beve...",Beverages,"Kosher,Contains GMOs",,etats-unis,,"France,United States"
284,"Sugary snacks,Biscuits and cakes,Biscuits,fr:S...",Sugary snacks,Green Dot,,"france,avranches",,France
288,"Meals,Pizzas pies and quiches,Quiches,Lorraine...",Meals,,quebec,"brossard,quebec",,Canada


## Pre-processing the countries_en column

Sometimes the countries_en field lists several countries. In that case the product is duplicated, one row per country, so that each product–country pair can be considered independently.

In [None]:
# Split the comma-separated 'countries_en' field into one column per listed
# country. expand=True builds the wide table directly instead of constructing
# one pd.Series per row via .apply(pd.Series), which is very slow on large
# frames.
z = data_loc["countries_en"].str.split(',', expand=True)

# unstack() turns the wide table into a long Series indexed by
# (position in the list, original row label); dropna() removes the padding
# cells of products that list fewer countries than the widest row.
z1 = z.unstack().dropna()
z1 = z1.to_frame()

In [None]:
# Give the single value column an explicit name.
z1.columns = pd.Index(['destination'])

In [None]:
# Destinations containing special characters, mapped to a plain name.
# Manually checked on Wikipedia and Google Maps.
SPECIAL_CHAR_DESTINATIONS = {
    'Other-日本': 'Japon',
    '中国': 'Chine',
    'fr:香港': 'Hong Kong',
    'fr:日本': 'Japon',
    'Ελλάδα': 'Grèce',
    'ar:صنعاء': 'Sanaa',
    'fr:الجزائر': 'Algérie',
}

# Replace only inside the 'destination' column. Assigning through a boolean
# row mask (z1[mask] = value) would overwrite every column of the matching
# rows, which is only harmless while z1 has a single column.
z1['destination'] = z1['destination'].replace(SPECIAL_CHAR_DESTINATIONS)


In [None]:
# remove fr: or others xx: in destination
def split_xx(l):
    """Strip a leading ``xx:`` prefix (e.g. ``fr:``) from a destination tag.

    Parameters
    ----------
    l : str
        Destination tag such as ``'fr:Suisse'`` or ``'France'``.

    Returns
    -------
    The text after the first ``':'``. ``l`` is returned unchanged when it
    contains no ``':'`` or is not a string (e.g. a NaN cell).
    """
    if not isinstance(l, str):
        return l

    # Split on the first colon only, so a value that itself contains ':' is
    # kept whole instead of being truncated at the second colon.
    _prefix, sep, rest = l.partition(':')
    return rest if sep else l
        
# Run split_xx over every destination value.
z1['destination'] = z1['destination'].apply(split_xx)

In [None]:
# Values that appear in 'destination' but are excluded from the analysis
# (they are not usable as a destination country).
NON_COUNTRY_TAGS = [
    'Photos uploaded',
    'Photos validated',
    'Product name completed',
    'Photos to be uploaded',
    'Photos to be validated',
    'Quantity completed',
    'Nutrition facts completed',
    'Ingredients completed',
    'Brands completed',
    'Packaging completed',
    'Characteristics completed',
    'Categories completed',
    'Packaging-code-completed',
    'Expiration date completed',
    'Expiration date to be completed',
    'En',
    'To be completed',
    'To be checked',
    'Worldwide',
    'World',
    'Sucre',
    'Complete',
    'Global-market',
    'Dat',
    'France-others',
]

print(z1.shape)  # size before filtering

# A single membership test replaces one full-frame comparison per tag.
z1 = z1[~z1['destination'].isin(NON_COUNTRY_TAGS)]
z1 = z1.dropna()

print(z1.shape)  # size after filtering

In [None]:
# Manual corrections: retailer names, misspellings and free-text entries are
# mapped to the destination they stand for.
MANUAL_DESTINATION_FIXES = {
    'Leclerc-bois-d-arcy-france': 'France',
    'Francecontient-des-sulfites': 'France',
    'Leclerc': 'France',
    'Auchan': 'France',
    'Carrefour': 'France',
    'Estadps-unidos': 'United States',
    'Polyensie-francaise': 'French Polynesia',
}

# Replace only inside the 'destination' column. Assigning through a boolean
# row mask (z1[mask] = value) would overwrite every column of the matching
# rows, which is only harmless while z1 has a single column.
z1['destination'] = z1['destination'].replace(MANUAL_DESTINATION_FIXES)

In [None]:
# Swap the two index levels, then drop level 1 so that only one level remains
# as the row index.
# NOTE: not re-runnable as is; once the index is flat, swaplevel() fails.
z1 = z1.swaplevel().reset_index(level=1, drop=True)

In [None]:
# Preview the first five rows of the destination table.
z1.head(n=5)

Apply the dictionary to translate all country names into English.

## Select French products

In [None]:
# Lookup table with (at least) a 'destination' and a 'country' column.
countries = pd.read_csv(
    'data/Destination-Coordinate-MAN.csv',
    sep=',',
    low_memory=False,
)

# destination -> country; for a repeated destination the last row wins.
dico_countries = {
    dest: country
    for dest, country in zip(countries['destination'], countries['country'])
}

In [None]:
# Display the full destination -> country lookup dictionary.
dico_countries

In [None]:
# Translate each destination through dico_countries into the 'Arrival' column.
# All unmapped destinations are collected first, so a gap in the lookup file
# is reported in one go instead of as a KeyError on the first offending value.
unmapped_destinations = sorted(
    set(z1['destination']) - set(dico_countries), key=str
)
if unmapped_destinations:
    raise KeyError(
        'destinations missing from dico_countries: {}'.format(unmapped_destinations)
    )

# Every key is known to be present, so the vectorised dict lookup is safe.
z1['Arrival'] = z1['destination'].map(dico_countries)
z1.head()

In [None]:
# Keep the rows whose translated arrival country is France.
is_france = z1['Arrival'].eq('France')
selection = z1.loc[is_france]

In [None]:
# Size of the France selection as (rows, columns).
selection.shape

In [None]:
# Preview the first five rows of the France selection.
selection.head(n=5)

In [None]:
# Row labels of the France selection; they are used to look rows up in data_loc.
selection.index

In [None]:
# Pull the selected products out of data_loc.
# The same row label can occur several times in selection.index (one
# occurrence per country entry that resolved to 'France'), and .loc would then
# return that product once per occurrence. unique() keeps the first occurrence
# of each label and preserves order, so every product is picked exactly once.
french_product = data_loc.loc[selection.index.unique()]
french_product.head()

In [None]:
# Persist the French-product subset (row labels are written as the first column).
french_product.to_csv(path_or_buf='data/FrenchProduct.csv')

In [None]:
# check
country_count = french_product["countries_en"].value_counts()[0:20]
country_count