In [26]:
import numpy as np 
from importlib import reload # for reloading modules after a script has been modified
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# import plotly.graph_objs as go
sns.set()

# from PIL import Image
# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import os
print(os.listdir("data"))


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD, FastICA
from sklearn.manifold import TSNE

from scipy.stats import boxcox

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


['fr.openfoodfacts.org.products.csv', 'world-food-facts.zip', 'en.openfoodfacts.org.products.tsv']


In [14]:
import re

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

import metric_learn

PUNCTUATION_REGEX = re.compile(r"""[:,;.&~"'|`_\\={}%()\[\]]+""")
DIGIT_REGEX = re.compile(r"[0-9]+")
MULTIPLE_SPACES_REGEX = re.compile(r" +")

def preprocess(text):
    text = PUNCTUATION_REGEX.sub(' ', text)
    text = DIGIT_REGEX.sub(' ', text)
    return MULTIPLE_SPACES_REGEX.sub(' ', text)

## Load data

In [3]:
df = pd.read_csv("data/fr.openfoodfacts.org.products.csv",
                       delimiter='\t',
                       encoding='utf-8',
                         nrows = 300000,
                      usecols=['product_name', 
                               'ingredients_text', 
                               'nutrition-score-fr_100g', 'carbon-footprint_100g', 
                             #  'energy_100g', 'fat_100g', 
                             #  'carbohydrates_100g', 'sugars_100g', 'proteins_100g', 'salt_100g', 'sodium_100g',
                              'categories_tags','countries_tags','categories'
                              ],
                      converters={'categories_tags': lambda x: x.split(',') if x else np.NaN}
                      )
df.head(n=5)

Unnamed: 0,product_name,categories,categories_tags,countries_tags,ingredients_text,carbon-footprint_100g,nutrition-score-fr_100g
0,Vitória crackers,,,en:france,,,
1,Cacao,,,en:france,,,
2,Sauce Sweety chili 0%,,,en:france,,,
3,Mini coco,,,en:france,,,
4,Pistou d'ail des ours,,,en:france,,,


In [4]:
df = df[pd.notnull(df['categories_tags'])]
df = df[pd.notnull(df['ingredients_text'])]
# df = df[pd.notnull(df['carbon-footprint_100g'])]


fr_df = df[df['countries_tags'] == 'en:france']

In [5]:
selected_categories = [
    "en:beverages",
    "en:sugary-snacks",
    "en:meals",
    "en:dairies",
    "en:meats",
    "en:desserts",
    "en:frozen-foods",
    "en:breakfasts",
    "en:cheeses",
    "en:biscuits",
    "en:groceries",
    "en:fats",
    "en:chocolates",
    "en:sauces",
]

selected_categories_set = set(selected_categories)

criterion = fr_df['categories_tags'].map(lambda x: bool(set(x).intersection(selected_categories_set)))
cat_df = fr_df[criterion]
cat_df['categories_tags_int'] = fr_df['categories_tags'].map(lambda categories: [selected_categories.index(x) for x in categories if x in selected_categories])
print(f"{len(fr_df)} elements in original dataframe, {len(cat_df)} after category filter")

21140 elements in original dataframe, 15615 after category filter


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [17]:
cat_df.head(5)

Unnamed: 0,product_name,categories,categories_tags,countries_tags,ingredients_text,carbon-footprint_100g,nutrition-score-fr_100g,categories_tags_int
260,Blanquette de Volaille et son Riz,"Plats préparés, Produits à la viande, Plats à ...","[en:meals, en:meat-based-products, en:meals-wi...",en:france,"Riz précuit 40,4 % (eau, riz, huile de colza, ...",,0.0,[2]
276,Entremets Crème Brulée,"Produits laitiers, Desserts, Produits déshydra...","[en:dairies, en:desserts, en:dried-products, e...",en:france,"Sucre, poudre de _lait_, poudre au beurre (_la...",,2.0,"[3, 5]"
307,Biscuits sablés fourrage au cacao,"Snacks sucrés, Biscuits et gâteaux, Biscuits, ...","[en:sugary-snacks, en:biscuits-and-cakes, en:b...",en:france,"Sucre, farine de _Blé_, graisse et huiles végé...",,,"[1, 9]"
309,A&w - Root Beer - 355ml,Boissons,"[en:beverages, en:non-alcoholic-beverages]",en:france,"Eau gazéifiée, sirop de mais riche en fructose...",,16.0,[0]
313,Compote de Pomme,"Aliments et boissons à base de végétaux, Alime...","[en:plant-based-foods-and-beverages, en:plant-...",en:france,"Flocons de pommes 76 % (pomme, amidon de maïs,...",,1.0,[5]


In [6]:
target = [np.array(c) for c in cat_df['categories_tags_int'].values] # several categories per sample
y = MultiLabelBinarizer().fit_transform(target) # one-hot of the targets
X = cat_df.ingredients_text.values

target = np.argmax(y,axis=1) # trivial reduce to 1 category per sample (loss of information)

idx_shuffle = np.random.permutation(X.shape[0])
X = X[idx_shuffle]
y = y[idx_shuffle,:]

idx_train = np.arange(1000)
idx_test = np.arange(1000,2000,1)
idx_both = np.concatenate([idx_train,idx_test],axis=0)
X_train = X[idx_train]
X_test = X[idx_test]
target_train = target[idx_train]
target_test = target[idx_test]
y_train = y[idx_train,:]
y_test = y[idx_test,:]


## Iterative HFS

In [71]:
decomposition_learner = Pipeline([
    ('vectorizer', CountVectorizer(strip_accents='unicode', min_df=5, preprocessor=preprocess)),
    ('tfidf', TfidfTransformer()),
    ('pca', TruncatedSVD(64))
])

In [72]:
X_proj = decomposition_learner.fit_transform(X)

In [73]:
import large_algos as lalg
from importlib import reload
reload(lalg)

pred,f = lalg.iterative_hfs(samples=X_proj[idx_both,:],
                            idx_lbl=idx_train,
                            labels_binary=y[idx_train,:], 
                              var=0.000001, eps=0, k=5, 
                          niter = 3, laplacian_regularization=0.1)

In [74]:
y[[idx_test,pred[idx_test]]].mean()

0.205

## Incremental centroids

In [212]:
from large_algos import incremental_k_centers

incremental_k_centers(labeled_samples=X_proj[:2000,:], labels)

SyntaxError: invalid syntax (<ipython-input-212-ac28494dfb51>, line 2)

## Baseline

In [203]:
classifier = Pipeline([
    ('vectorizer', CountVectorizer(strip_accents='unicode', min_df=5, preprocessor=preprocess)),
    ('tfidf', TfidfTransformer()),
    ('pca', TruncatedSVD(64)), # try with
    ('clf', OneVsRestClassifier(LinearSVC()))
])

classifier.fit(X[idx_train], y[idx_train,:])

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1),
        preprocessor=<function preprocess a..._class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))])

In [207]:
y_pred = classifier.predict(X[idx_test])
print(classification_report(y[idx_test,:], y_pred, target_names=selected_categories))

                  precision    recall  f1-score   support

    en:beverages       0.90      0.64      0.74       174
en:sugary-snacks       0.87      0.84      0.86       219
        en:meals       0.84      0.65      0.73       127
      en:dairies       0.97      0.85      0.90       170
        en:meats       0.96      0.66      0.78       138
     en:desserts       0.71      0.36      0.48        69
 en:frozen-foods       0.00      0.00      0.00        34
   en:breakfasts       0.97      0.50      0.66        70
      en:cheeses       0.97      0.77      0.86        90
     en:biscuits       0.86      0.54      0.67        57
    en:groceries       0.88      0.12      0.21        58
         en:fats       0.00      0.00      0.00        31
   en:chocolates       0.71      0.34      0.46        44
       en:sauces       0.60      0.09      0.16        32

       micro avg       0.90      0.61      0.72      1313
       macro avg       0.73      0.45      0.54      1313
    weighted

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## PCA + Metric_learn + clf (LinearSVC / kNN)

In [28]:
decomposition_learner = Pipeline([
    ('vectorizer', CountVectorizer(strip_accents='unicode', min_df=5, preprocessor=preprocess)),
    ('tfidf', TfidfTransformer()),
    ('pca', TruncatedSVD(64)), # need this to transform above obtained sparse matrix
    ('metric_decomp', metric_learn.LFDA(k=8,embedding_type='weighted')) # does not impact the results much
  #  ('metric_decomp', metric_learn.LMNN(k=5,learn_rate=1e-5,min_iter=1,max_iter=3))
  #  ('metric_decomp', metric_learn.NCA(max_iter=10))
])

X_proj = decomposition_learner.fit_transform(np.concatenate([X_train,X_test],axis=0),
                                            np.concatenate([target_train,target_test],axis=0))

TypeError: All intermediate steps should be transformers and implement fit and transform. 'TSNE(angle=0.5, early_exaggeration=12.0, init='random', learning_rate=200.0,
   method='barnes_hut', metric='euclidean', min_grad_norm=1e-07,
   n_components=64, n_iter=1000, n_iter_without_progress=300,
   perplexity=30.0, random_state=None, verbose=0)' (type <class 'sklearn.manifold.t_sne.TSNE'>) doesn't

In [22]:
classifier = Pipeline([
  #  ('clf', OneVsRestClassifier(LinearSVC())),
    ('clf', KNeighborsClassifier(n_neighbors=5))
])

classifier.fit(X_proj[idx_train],y_train)
pred = classifier.predict(X_proj[idx_test])

In [23]:
print(classification_report(y_test, pred, target_names=selected_categories))

                  precision    recall  f1-score   support

    en:beverages       0.70      0.58      0.64       183
en:sugary-snacks       0.89      0.73      0.80       231
        en:meals       0.64      0.64      0.64       123
      en:dairies       0.91      0.77      0.84       182
        en:meats       0.86      0.77      0.81       124
     en:desserts       0.69      0.29      0.41        86
 en:frozen-foods       0.00      0.00      0.00        33
   en:breakfasts       0.76      0.40      0.53        62
      en:cheeses       0.82      0.76      0.79        93
     en:biscuits       0.66      0.42      0.51        60
    en:groceries       0.71      0.33      0.45        72
         en:fats       0.50      0.07      0.12        29
   en:chocolates       0.81      0.63      0.71        46
       en:sauces       0.73      0.31      0.43        36

       micro avg       0.79      0.59      0.68      1360
       macro avg       0.69      0.48      0.55      1360
    weighted

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Baseline : tfidf + LinearSVC

- baseline
```
       micro avg       0.90      0.68      0.77      1313
       macro avg       0.85      0.55      0.64      1313
    weighted avg       0.88      0.68      0.75      1313
     samples avg       0.72      0.68      0.69      1313
```
- tfidf + pca + LinearSVC
```
       micro avg       0.90      0.60      0.72      1313
       macro avg       0.79      0.45      0.53      1313
    weighted avg       0.86      0.60      0.68      1313
     samples avg       0.66      0.61      0.62      1313
```
- tfidf + pca + kNN
```
       micro avg       0.84      0.64      0.73      1360
       macro avg       0.83      0.55      0.63      1360
    weighted avg       0.84      0.64      0.72      1360
     samples avg       0.69      0.65      0.66      1360
```
### Metric learning : tfidf + TruncSVD(64) + metric + kNN
- LMNN(k=5,learn_rate=1e-5,min_iter=1,max_iter=3)
```
       micro avg       0.81      0.65      0.72      1360
       macro avg       0.71      0.54      0.60      1360
    weighted avg       0.78      0.65      0.70      1360
     samples avg       0.68      0.66      0.66      1360
```
- LFDA(k=8,embedding_type='weighted')
```
       micro avg       0.83      0.59      0.69      1360
       macro avg       0.84      0.50      0.59      1360
    weighted avg       0.84      0.59      0.67      1360
     samples avg       0.64      0.60      0.61      1360
```
- NCA(max_iter=10)
```
       micro avg       0.79      0.59      0.68      1360
       macro avg       0.69      0.48      0.55      1360
    weighted avg       0.76      0.59      0.65      1360
     samples avg       0.63      0.60      0.60      1360
```
### Metric learning : tfidf + TruncSVD(64) + metric + LinearSVC

- LMNN(k=5,learn_rate=1e-5,min_iter=1,max_iter=3)
```
       micro avg       0.88      0.67      0.76      1313
       macro avg       0.83      0.54      0.63      1313
    weighted avg       0.87      0.67      0.74      1313
     samples avg       0.69      0.67      0.67      1313
```
- LFDA(k=8,embedding_type='weighted')
```
       micro avg       0.92      0.62      0.74      1313
       macro avg       0.83      0.48      0.57      1313
    weighted avg       0.89      0.62      0.71      1313
     samples avg       0.68      0.63      0.64      1313
```
- NCA(max_iter=3)
```
       micro avg       0.86      0.68      0.76      1313
       macro avg       0.76      0.56      0.62      1313
    weighted avg       0.83      0.68      0.74      1313
     samples avg       0.70      0.68      0.68      1313
```
- NCA(max_iter=10)
```
       micro avg       0.73      0.70      0.72      1313
       macro avg       0.69      0.61      0.62      1313
    weighted avg       0.78      0.70      0.72      1313
     samples avg       0.66      0.71      0.66      1313
```
- NCA(max_iter=10) LinearSVC(maxiter=30000)
```
       micro avg       0.78      0.69      0.73      1313
       macro avg       0.67      0.58      0.62      1313
    weighted avg       0.77      0.69      0.73      1313
     samples avg       0.69      0.70      0.67      1313
```
- 
```
```
- 
```
```

TODO

carbon footprint :
- chercher un algo regression qui puisse utiliser entrées sparse
- comparer avec PCA + LMNN + (log ?) regression


NOTES

PCA appears to be a bad decomposer (LinearSVC loses drastic performances) yet it is an simple algorithm to extract information from large scale sparse matrices for algorithms that cannot work on such large matrices

We try and find the best decomposer based on the categorization problem to then use it for the carbon footprint regression problem.

We find that PCA+LMNN is a good decomposer, giving results very close to LinearSVC for categorization.