In [None]:
import sys

import numpy as np
import pandas as pd
import scipy as scp
import scipy.stats

In [None]:
# Directory prefix for the input CSV and the exported subsets. Paths are built
# by plain string concatenation, so the trailing slash is significant.
DATAPATH='data/'

In [None]:
# Load the tab-separated openfoodfacts.csv file into one DataFrame.
data = pd.read_csv(
    DATAPATH + 'openfoodfacts.csv',
    sep='\t',
    low_memory=True,
)

In [None]:
# Size check: overall shape, then the number of distinct non-null values in a
# few columns (the trailing numbers are the counts noted from an earlier run).
print(data.shape) # 681'602
for column_name in (
    "code",                       # 681533
    "manufacturing_places_tags",  # 11731
    "manufacturing_places",       # 12884
    "categories",                 # 62746
    "origins_tags",               # 6606
):
    print(data[column_name].nunique())

data.head()

In [None]:
def correlation_columns(column1, column2, frame=None):
    """Spearman rank correlation between two columns.

    Rows where either column is missing are dropped before the correlation
    is computed. (Original author's note: faster than ``pd.Series.corr(...)``.)

    Parameters
    ----------
    column1, column2 : str
        Labels of the two columns to compare.
    frame : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``data`` frame is used, which keeps existing two-argument calls working.

    Returns
    -------
    The result object of ``scipy.stats.spearmanr`` (correlation, p-value).
    """
    source = data if frame is None else frame
    matrix = source[[column1, column2]].dropna().values
    return scp.stats.spearmanr(matrix[:, 0], matrix[:, 1])

In [None]:
# Example use of correlation_columns on a few pairs of nutrient columns.
example_pairs = [
    ('energy-from-fat_100g', 'fat_100g'),
    ('energy_100g', 'fat_100g'),
    ('energy_100g', 'energy-from-fat_100g'),
    ('energy_100g', 'salt_100g'),
    ('salt_100g', 'fat_100g'),
]
for first_column, second_column in example_pairs:
    print(correlation_columns(first_column, second_column))

In [None]:
# Rows with both energy and fat present, as a plain two-column NumPy array.
matrix = data.loc[:, ['energy_100g', 'fat_100g']].dropna().values

In [None]:
# Print NumPy arrays in full instead of summarising them with "...".
# `threshold=np.nan` is rejected by current NumPy (ValueError); sys.maxsize is
# the documented value for "never summarise".
np.set_printoptions(threshold=sys.maxsize)
# Display the column labels of `data` as a NumPy array.
data.columns.values

# Get rid of duplicates

In [None]:
# Rows without a `code` value get a unique placeholder code so that `code`
# can later serve as the index.
# NOTE(review): this assumes no real code falls inside the placeholder range
# starting at 10000000000000 -- confirm against the data.
missing_code = data['code'].isnull()
null_code = int(missing_code.sum())
print(null_code)
start_dummy_code = 10000000000000
end_dummy_code = start_dummy_code + null_code
# Exactly one placeholder per missing code (range end is exclusive).
dummy_code = list(range(start_dummy_code, end_dummy_code))

# One masked assignment instead of one `.loc` write per row; placeholders are
# handed out in row order, as before.
if null_code:
    data.loc[missing_code, 'code'] = dummy_code


In [None]:
# Display every row whose code occurs more than once (all occurrences shown).
data[data['code'].duplicated(keep=False)]

In [None]:
# Keep the first row for each code, make `code` the index, then display
# whether the resulting index is unique.
data = (
    data
    .drop_duplicates(subset=['code'], keep="first")
    .set_index('code')
)
data.index.is_unique

In [None]:
# Save the de-duplicated, code-indexed frame. DATAPATH already ends with a
# slash, so no extra separator is added (the original produced "data//...").
data.to_csv(DATAPATH + "cleaned_unique_index.csv")

In [None]:
# Keep the products that carry at least one piece of location information.
# Trailing numbers are row counts noted from an earlier run.
location_columns = [
    "first_packaging_code_geo",   # 27572 non-null
    "origins_tags",
    "manufacturing_places_tags",
]
sele = data[location_columns].notnull().any(axis=1)  # 88001

data_loc = data[sele] # 88001
print(data_loc.shape)
# Export the same three columns the selection is based on. The original call
# listed "origins_tags" twice and left out "first_packaging_code_geo".
data_loc[location_columns].to_csv(DATAPATH+'food_origins.csv')

set for palm oil

In [None]:
# Names of all columns that mention palm oil.
palm_oil = [column for column in data.columns if "palm_oil" in column]

# Restrict to those columns and drop the rows in which all of them are empty.
palm_oil_data = data.loc[:, palm_oil].dropna(how='all')
palm_oil_data.head()

In [None]:
# Write the palm-oil subset (all of its columns, index included) to disk.
palm_oil_data.to_csv(DATAPATH + 'palm_oil.csv')

set for labels

In [None]:
# Names of all columns that mention labels.
labels = [column for column in data.columns if "labels" in column]
print("Columns for label : " + str(labels))

# Restrict to those columns and drop the articles in which all of them are empty.
labels_data = data.loc[:, labels].dropna(how='all')
labels_data.head()


In [None]:
# Write the labels subset (all of its columns, index included) to disk.
labels_data.to_csv(DATAPATH + 'labels.csv')

set for vegan

In [None]:
# Only products that have a labels_tags value can be classified.
labelled = data.dropna(subset=['labels_tags'])
label_tags = labelled['labels_tags']

# Keep rows whose tags contain 'vegan' but not 'no-vegan'.
mentions_vegan = label_tags.str.contains('vegan').fillna(False)
mentions_no_vegan = label_tags.str.contains('no-vegan').fillna(False)
vegan_data = labelled[mentions_vegan & ~mentions_no_vegan]

# Drop the temporaries so only vegan_data is left in the namespace.
del labelled, label_tags, mentions_vegan, mentions_no_vegan

In [None]:
# Write the vegan subset to disk. The original line called a bare `to_csv`
# (NameError) and assigned the result back to vegan_data, which would have
# replaced the frame with None; call the DataFrame method instead.
vegan_data.to_csv(DATAPATH + 'vegan.csv', index=True)

# Palm Oil

## Exploring the number of aliments that contain palm oil

In [None]:
# Count articles (rows). `.size` is rows x columns, and the two frames have
# different numbers of columns, so it cannot be used to compare article counts.
total_count = data.shape[0]
palm_oil_count = palm_oil_data.shape[0]
percentage = palm_oil_count / total_count * 100

print("We have a total of %d articles entered" % total_count)
print("Out of those we have information about palm oil on %d articles " % palm_oil_count)
print("This represents %f %% " % percentage)

- While this proportion seems a bit low, it is still more than half of the articles. We remove from our study all articles for which we have no palm-oil information, as they could bias future analysis. The remaining sample of 387964 articles is still large enough to support statistical analysis.

In [None]:
# Preview the first rows of the palm-oil subset.
palm_oil_data.head()

In [None]:
# Products with at least one palm-oil ingredient, and products with at least
# one ingredient that may come from palm oil. The constant `count` column is
# used later for group counts; `.assign` adds it without writing into a
# filtered frame (which triggers pandas' SettingWithCopyWarning).
contains_palm = (
    palm_oil_data[palm_oil_data['ingredients_from_palm_oil_n'] > 0.0]
    .assign(count=1)
)
may_contain_palm = (
    palm_oil_data[palm_oil_data['ingredients_that_may_be_from_palm_oil_n'] > 0.0]
    .assign(count=1)
)

In [None]:
# Index labels present in contains_palm but absent from may_contain_palm.
contains_palm.index.difference(may_contain_palm.index)
# Author's note: useful to see how much the "contains" and "may contain" sets overlap.

In [None]:
## Number of products per count of palm-oil ingredients, and per count of
## ingredients that may come from palm oil, each as a log-scale bar chart.
# NOTE(review): `plt` is used here but no matplotlib import is visible in this
# notebook -- confirm `matplotlib.pyplot` is imported as `plt` before this cell.
count_palm = contains_palm.groupby("ingredients_from_palm_oil_n")['count'].count()
count_may = may_contain_palm.groupby("ingredients_that_may_be_from_palm_oil_n")['count'].count()
for counts, plot_title in (
    (count_palm, "Count of palm oil ingredient by number in product"),
    (count_may, "Count of palm oil related ingredient by number in product"),
):
    counts.plot.bar(logy=True, title=plot_title, color='b')
    plt.xlabel("Ingredient amount")
    plt.show()

- We can see that most of the products seem to have only one or two ingredients that are related to palm oil.
- However, the products that may contain palm oil can have a few more such ingredients.

In [None]:
## How many products are concerned, and how the two ingredient counts relate.
cnt_palm = len(contains_palm)
may_cnt_palm = len(may_contain_palm)
print("There is %d articles that contain palm oil."%cnt_palm)
print("There is %d articles that may contain palm oil "%may_cnt_palm)

# Pearson correlation between the two per-product ingredient counts.
palm_ingredient_n = data["ingredients_from_palm_oil_n"]
maybe_palm_ingredient_n = data["ingredients_that_may_be_from_palm_oil_n"]
correlation = palm_ingredient_n.corr(maybe_palm_ingredient_n, method="pearson")
print("The correlation between food that contains palm oil and that may contains palm oil is %f " % correlation)

data.plot(
    kind="scatter",
    x="ingredients_from_palm_oil_n",
    y="ingredients_that_may_be_from_palm_oil_n",
)
plt.show()

<p> We can't infer much as the scatter plots are not very helpful in visualize the data. Nut we can see that the correlation is low (less than 0.2) so there is not much relation between the number of ingredients and how many may contain palm oil.  </p>

# Vegan

In [None]:
# Split each product's comma-separated origins_tags into one column per tag,
# then unstack into a single long Series of tags (padded with NaN where a
# product has fewer tags than the longest list).
origin_exploded = (
    vegan_data['origins_tags']
    .dropna()
    .str.split(',')
    .apply(pd.Series)
    .unstack()
)

In [None]:
# Horizontal bar chart of the 15 most frequent origin tags among vegan products.
vegan_origin_counts = origin_exploded.value_counts()
vegan_origin_counts.head(15).plot(kind='barh')
plt.title('Number of vegan-labelled products')
plt.show()

In [None]:
# Same tag explosion as for the vegan subset, applied to the whole database.
total_origins = data['origins_tags'].dropna().str.split(',')
total_origins_exploded = (
    total_origins
    .apply(pd.Series)
    .unstack()
)


In [None]:
# Horizontal bar chart of the 15 most frequent origin tags over all products.
total_origins_exploded.value_counts().head(15).plot(kind='barh')
plt.title('total number products in DB')
plt.show()


In [None]:
# Turn both exploded tag Series into flat frames. `reset_index` moves the two
# index levels into columns; `level_0` (the outer level created by `unstack`)
# is dropped. The second statement was missing its closing parenthesis, which
# made the whole cell a SyntaxError.
vegan_origin_df = origin_exploded.to_frame().reset_index().drop(labels=['level_0'], axis=1)
total_origin_df = total_origins_exploded.to_frame().reset_index().drop(labels=['level_0'], axis=1)

In [None]:
# Number of products per origin tag over the whole database, largest first.
total_origin_df.columns = ['level_1', 'origin']
total_origin_df = total_origin_df.dropna()
total_count = (
    total_origin_df
    .groupby(by=['origin'])
    .size()
    .sort_values(ascending=False)
    .to_frame()
)

# Number of vegan-labelled products per origin tag.
vegan_origin_df.columns = ['level_1', 'vegan_loc']
vegan_count = (
    vegan_origin_df
    .groupby(by=['vegan_loc'])
    .size()
    .to_frame()
)

# Inner join on the tag. Both count columns are named 0, so the merged frame
# exposes them as '0_x' (all products) and '0_y' (vegan products).
comparison = total_count.merge(
    vegan_count,
    how='inner',
    left_index=True,
    right_index=True,
)


## Comment on data

The origins_tags series is messy.

First of all, entries for the same country appear in different languages. A first approach would be to cluster similar strings: this would put together italy, italia and italien, but it would not work for austria and osterreich, nor for Germany, Allemagne and Republica Federale Tedesca.

Another problem is the bias of the database: it is a French database, so it has more entries and more details for products from France and, to a lesser extent, from the rest of Europe. To overcome that problem, we want to compare countries by their relative vegan production (vegan-labelled production from one origin tag over total production from the same tag).

But this solution reveals a third problem: irrelevant tags. Some tags are too precise (regions or provinces) or even wrong ('soja' is not a country). Because these tags are very rare, they can reach a confidence (the ratio defined above) of 100%: for instance, there is only one product tagged 'zamora-provincia' and this product is also tagged vegan, so 'zamora-provincia' would be the "vegan champion"; but it is irrelevant to compare a single province with an entire country.

We addressed that last problem by thresholding the support (at 100 labelled products), so that we are sure to retrieve only countries that are frequent in the DB. We also manually removed labels that are too general, because those can have a high support as well ('eu').

In [None]:
relevent_index=['aceitunas', 'afrique', 'agadir', 'agypten', 'alava', 'albacete-provincia', 'albatera', 'alemania', 'alicante', 'alicante-provincia', 'allemagne', 'almendras', 'almeria-provincia', 'almunecar', 'alps', 'amandes-d-italie', 'amazonas', 'amazonie', 'america-del-sur', 'amerique', 'amerique-centrale', 'amerique-du-nord', 'amerique-du-sud', 'andalucia', 'andes', 'angleterre', 'angleterre-betterave', 'aragon', 'aranjuez', 'argelia', 'argentina', 'argentinien', 'arroz', 'asie', 'asie-pour-le-fruit-de-moines', 'asturias', 'atlantico-suroeste', 'australia', 'austria', 'autol', 'autriche', 'avoine-francaise', 'azucar', 'azur-quelle', 'bajo-aragon', 'bali', 'batata', 'belgica', 'belgien', 'belgium', 'bolivia', 'bolivie', 'bolivien', 'bosnien-herzegowina', 'brazil', 'bresil', 'bretagne', 'burkina-faso', 'cacahuetes', 'cacao', 'caceres-provincia', 'cadiz-provincia', 'caldes-de-malavella', 'calera-y-chozas', 'california', 'californie', 'camargue', 'cambrils', 'cana-de-azucar', 'canada', 'casas-de-haro', 'castilla-la-mancha', 'castilla-y-leon', 'cataluna', 'cevennes', 'chile', 'china', 'chine', 'chufa', 'ciego-montero', 'cienfuegos', 'ciruelas', 'ciudad-real-provincia', 'col', 'col-lombarda', 'colombia', 'columbia', 'commerce-equitable', 'comunidad-valenciana', 'conil-de-la-frontera', 'cordoba-provincia', 'corea', 'costa-de-galicia', 'costa-de-marfil', 'costa-rica', 'cuba', 'cuenca-provincia', 'desconocido', 'deutschland', 'dominikanische-republik', 'e-u-a', 'ecuador', 'egipto', 'egnil', 'egypte', 'el-bolson', 'el-mirador', 'equateur', 'esmeraldas', 'espagne', 'espana', 'estados-unidos', 'etats-unis','eutschland', 'extremadura', 'filderstadt', 'filipinas', 'fontanilles', 'fougerolles', 'france', 'francia', 'frankreich', 'fresas', 'fuera-de-espana', 'galicia', 'gard', 'garray', 'germany', 'gerona-provincia', 'gers', 'gers-en-france-pour-le-ble', 'ghana', 'girona', 'golfo-de-vizcaya', 'granada-provincia', 'grece', 'grecia', 'greece', 'griechenland', 
'guadalajara-provincia', 'haiti', 'haute-provence', 'herefordshire', 'holanda', 'hongrie','huelva-provincia', 'inde', 'india', 'indien', 'indien-landwirtschaft', 'indonesia', 'indonesie', 'indonesien', 'industria-argentina', 'ingrediente-s', 'iran', 'ireland', 'islas-canarias', 'israel', 'italia', 'italie', 'italien', 'italy', 'jaen', 'japon', 'jativa', 'kanada', 'kenia', 'kirgistan', 'kolumbien', 'la-coruna-provincia', 'la-mancha-comarca', 'la-manchuela-comarca', 'la-montiela-pedania', 'la-orotava', 'la-rioja', 'leimuiden', 'leinfelden-echterdingen', 'lerida-provincia', 'lombardia', 'lomellina', 'madagascar', 'madrid-comunidad-autonoma', 'malaga-provincia', 'malaui', 'manantial-amer-palatin', 'manantial-bezoya-de-trescasas', 'manantial-font-sacalm', 'manantial-fuente-primavera', 'manantial-fuentevera', 'manantial-siguenza', 'manantial-vichy-catalan', 'marinaleda', 'marruecos', 'mazarron', 'melocotones', 'mexico', 'mexiko', 'mexique', 'midlands-de-l-ouest', 'milano', 'minglanilla', 'minho-lima', 'modena', 'montricoux', 'montseny', 'moratilla-de-henares', 'motril', 'munchsteinach', 'murcia', 'murcia-comunidad-autonoma', 'mures-des-pays-bas', 'myrtilles-du-maroc', 'nao-ue', 'nardo', 'navarra', 'niederosterreich', 'niger', 'nijar','nueva-zelanda', 'osterreich', 'pais-vasco', 'pakistan', 'palmira', 'papua-neuguinea', 'paraguay', 'pavia-provincia', 'pays-bas', 'pepinillos', 'perou', 'peru', 'philippinen', 'philippines', 'pichincha-provincia', 'piemont', 'pioz', 'plano-tx', 'poland', 'polonia', 'portugal', 'quintanar-del-rey', 'regiao-do-norte', 'region-centro-norte', 'reino-unido', 'remolacha-y-zanahoria', 'republica-dominicana', 'republique-dominicaine', 'requena', 'requena-utiel-comarca', 'rheinland', 'ribera-del-jalon-comarca', 'rice', 'rio-negro', 'riz', 'romania', 'roumanie', 'royaume-uni', 'rugen', 'rumanien', 'salamanca-provincia', 'san-antonio-requena', 'sant-hilari-sacalm', 'santa-cruz-de-tenerife-provincia', 'santaella', 'santo-domingo', 
'santo-tome-y-principe', 'sao-tome', 'sao-tome-und-principe', 'schweden', 'segovia-provincia', 'selva-comarca', 'serbia', 'sevilla-provincia', 'sicile', 'sierra-de-gredos', 'siguenza', 'slowakei', 'soja', 'soja-bio', 'soja-de-france', 'soja-de-francia', 'soja-europeen', 'soja-origine-france', 'soja-sud-est', 'soria-provincia', 'south-africa', 'south-africa-local-and-imported', 'spain', 'spanien', 'sri-lanka', 'sud-est-de-la-france', 'sud-ouest', 'sud-ouest-de-la-france', 'sudafrica', 'sudafrika', 'sudamerica', 'sudamerika', 'suisse', 'switzerland', 'tailandia', 'tanzania', 'tanzanie', 'tarn-et-garonne', 'tarragona-provincia', 'thailand', 'thailande', 'toledo-provincia', 'tomates', 'tozeur', 'trentino-alto-adigio', 'trescasas', 'trinidad', 'trinidad-und-tobago', 'tunesien', 'tunez', 'tunisia', 'tunisie', 'turkei', 'turquia', 'turquie', 'uca', 'ucrania', 'ue', 'ue-non-ue', 'uganda', 'ukraine',  'united-kingdom', 'united-states', 'united-states-of-america', 'usa', 'usa-and-other-unspecified-countries', 'val-venosta', 'valencia', 'valencia-provincia', 'valladolid-provincia', 'vancouver-bc', 'various', 'venezie', 'venezuela', 'verin', 'viana-do-castelo', 'viana-do-castelo-distrito', 'vietnam', 'villa-del-prado', 'villalgordo-del-jucar', 'villamalea', 'villanueva-de-la-jara', 'waldviertel', 'west-indies', 'wiesbaum', 'yuca', 'zamora-provincia', 'zaragoza-provincia' ] 
# Keep only the hand-picked tags, and of those only the ones whose
# whole-database count ('0_x') is above 100.
relevent_comparison = (
    comparison
    .loc[relevent_index]
    .loc[lambda frame: frame['0_x'] > 100]
)
del relevent_index

In [None]:
# Share of vegan-labelled products per origin tag: vegan count ('0_y') over
# whole-database count ('0_x'). `.assign` builds the column on a new frame
# instead of writing into a filtered one (avoids SettingWithCopyWarning).
relevent_comparison = relevent_comparison.assign(
    ratio=relevent_comparison['0_y'] / relevent_comparison['0_x']
)
# Plot the 15 tags with the highest ratio.
relevent_comparison['ratio'].sort_values(ascending=False).head(15).plot.barh()
plt.title('Relative production of vegan-labelled products')
plt.show()

## Comment on observations

Spain is the country that produces the highest share of vegan products relative to its total production.
A solution is to check the categories

In [None]:
# NOTE(review): `vegan_set` is never defined in the visible notebook, so this
# cell raised NameError on a fresh kernel. Assuming it was meant to be the
# vegan-labelled frame `vegan_data` -- confirm. A copy is used so that
# vegan_data itself is not modified.
vegan_set = vegan_data.copy()
# Unparseable timestamps become NaT (errors="coerce") and are filtered out below.
vegan_set["created_datetime"] = pd.to_datetime(vegan_set["created_datetime"],errors="coerce")
dateparsed = vegan_set[vegan_set["created_datetime"].notnull()]
# Index by creation time so the rows can be resampled by date.
vegan_date=dateparsed.reset_index().set_index('created_datetime')

In [None]:
# Number of vegan-labelled products created in each 30-day window. After this
# cell `vegan_date` is the resampled Series, no longer a DataFrame.
vegan_date = vegan_date.assign(count=1)["count"].resample("30D").sum()
# The original title said "palm oil" although the data is the vegan subset.
vegan_date.plot.line(title="Trend of vegan-labelled articles over time")
plt.xlabel("Date")
plt.ylabel("Count")
plt.show()

In [None]:
# Running total of the per-window counts, drawn as a line chart.
vegan_date.cumsum().plot(kind="line")
plt.show()

The total amount of vegan products increases in a linear way, suggesting that the rate of vegan food production is constant.