# Create classification model(s)
- start with the "food.csv"
- tfidf vectorizer & classification models
- combining text fields except for "food_group" and/or "food_subgroup"

In [1]:
import pprint
import importlib
import numpy as np
import pandas as pd

# import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error

import modeling

In [None]:
importlib.reload(modeling);

## pull pre-cleaned & flagged food data
- this parquet file is the merged "food" and "content" tables
- it has an "all_text" field
- it has columns with binary values to capture the presence of specific flavor descriptors in that text: `flavor_cols = ["salty", "sour", "bitter", "sweet", "fatty", "umami", "acid"]`

In [2]:
food = pd.read_parquet("data/big_food.parquet")
food.shape

(4074596, 22)

In [3]:
food.columns

Index(['food_id', 'orig_food_id', 'orig_food_common_name',
       'orig_food_scientific_name', 'orig_food_part', 'orig_source_id',
       'orig_source_name', 'id', 'name', 'name_scientific', 'description',
       'wikipedia_id', 'food_group', 'food_subgroup', 'all_text', 'salty',
       'sour', 'bitter', 'sweet', 'fatty', 'umami', 'acid'],
      dtype='object')

In [4]:
flavor_cols = ["salty", "sour", "bitter", "sweet", "fatty", "umami", "acid"]

In [21]:
label_encoder = LabelEncoder()
label_enc_y = label_encoder.fit_transform(food["food_group"])

## build pipelines
### references for choosing these hyperparameter options:
- [svc](https://www.baeldung.com/cs/svm-multiclass-classification) kernel options

In [23]:
# now that I know my label_encoder works, defining params for the different models I want to try
pipe_params = {
    "tfidf__max_features": [None, 250, 300],
    "tfidf__min_df": [1, 0.01, 0.05],
    "tfidf__max_df": [1.0, 0.98, 0.95, 0.9],
    "tfidf__ngram_range": [(1,1), (1,2), (1,3)],
    "lr__solver": ["lbfgs", "liblinear"],
    "lr__C": [1.0, 0.1, 10],
    "multi__alpha": [1.0, 0.5],
    "rfc__n_estimators": [100, 50, 150],
    "rfc__max_depth": [None, 5, 10],
    "rfc__min_samples_split": [2, 5, 7],
    "rfc__min_samples_leaf": [1, 2, 3],
    "rfc__bootstrap": [True, False],
    "knn__n_neighbors": range(1,11),
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2],
    "svc__C": [1.0, 0.1, 10],
    "svc__kernel": ["poly", "rbf"],   
}

In [24]:
# and the components
vectorizer = ("tfidf", TfidfVectorizer(stop_words="english"))
lr_tuple = ("lr", LogisticRegression(max_iter=1000))
multi_tuple = ("multi", MultinomialNB())
rand_forest_tuple = ("rfc", RandomForestClassifier(n_jobs=-1))
knn_tuple = ("knn", KNeighborsClassifier())
svc_tuple = ("svc", SVC(random_state=42))

model_options = [lr_tuple, multi_tuple, rand_forest_tuple, knn_tuple, svc_tuple]

### build models
- the `modeling.train_save_best_model` function builds a Pipeline and a GridSearch from the `pipe_params` for a given model and then saves the best estimator as a pickle file
- commenting this cell out once I have those pickle files so I can (if necessary) "run all" cells in the notebook

In [None]:
# this raises a warning on the "food" data:
# UserWarning: y_pred contains classes not in y_true - warnings.warn("y_pred contains classes not in y_true")
#  which makes sense because there are a couple of "food_group" labels with only 1 or 2 instances in that df

# estimators = []
# for model in model_options:
#     pipe_components = [vectorizer, model]
#     print(f"starting {model[0]} grid search")
#     est = modeling.train_save_best_model(pipe=pipe_components, pipe_params=pipe_params, X_train=food["all_text"],
#                                    y_train=label_enc_y, file_path=f"models/{model[0]}.pkl")
#     estimators.append(est)

In [25]:
X = food["all_text"]
y = label_enc_y

pipeline_names = ["knn", "lr", "multi", "rfc", "svc"]

In [26]:
# use `fetch_fitted_pipeline` to fetch previously-fitted pipeline
scores = {}
for model_type in pipeline_names:
    gs = modeling.fetch_fitted_pipeline(f"models/{model_type}.pkl")
    gs.fit(X, y)
    scores[model_type] = gs.score(X, y)
    
scores

{'knn': 1.0,
 'lr': 0.9968186638388123,
 'multi': 0.8494167550371156,
 'rfc': 1.0,
 'svc': 1.0}

## combine content & food data

In [27]:
content = pd.read_csv("data/foodb_2020_04_07_csv/Content.csv")
content.columns

Index(['id', 'source_id', 'source_type', 'food_id', 'orig_food_id',
       'orig_food_common_name', 'orig_food_scientific_name', 'orig_food_part',
       'orig_source_id', 'orig_source_name', 'orig_content', 'orig_min',
       'orig_max', 'orig_unit', 'orig_citation', 'citation', 'citation_type',
       'creator_id', 'updater_id', 'created_at', 'updated_at', 'orig_method',
       'orig_unit_expression', 'standard_content', 'preparation_type',
       'export'],
      dtype='object')

In [64]:
content.head(10)

Unnamed: 0,food_id,orig_food_id,orig_food_common_name,orig_food_scientific_name,orig_food_part,orig_source_id,orig_source_name
0,4,29,Kiwi,Actinidia chinensis PLANCHON [Actinidiaceae],Fruit,FAT,FAT
1,6,53,Onion,Allium cepa L. [Liliaceae],Bulb,FAT,FAT
2,6,53,Onion,Allium cepa L. [Liliaceae],Leaf,FAT,FAT
3,9,55,Chives,Allium schoenoprasum L. [Liliaceae],Leaf,FAT,FAT
4,11,70,Cashew,Anacardium occidentale L. [Anacardiaceae],Fruit,FAT,FAT
5,11,70,Cashew,Anacardium occidentale L. [Anacardiaceae],Leaf,FAT,FAT
6,11,70,Cashew,Anacardium occidentale L. [Anacardiaceae],Seed,FAT,FAT
7,12,74,Pineapple,Ananas comosus (L.) MERR. [Bromeliaceae],Fruit,FAT,FAT
8,13,83,Dill,Anethum graveolens L. [Apiaceae],Plant,FAT,FAT
9,13,83,Dill,Anethum graveolens L. [Apiaceae],Seed,FAT,FAT


In [28]:
content_drop_cols = ["id", "source_id", "source_type", "orig_content", "orig_min",
       "orig_max", "orig_unit", "orig_citation", "citation", "citation_type",
       "creator_id", "updater_id", "created_at", "updated_at", "orig_method",
       "orig_unit_expression", "standard_content", "preparation_type",
       "export"]

print(f"keeping columns: {set(content.columns).difference(set(content_drop_cols))}")

keeping columns: {'orig_source_id', 'orig_food_scientific_name', 'orig_food_common_name', 'orig_food_part', 'orig_source_name', 'food_id', 'orig_food_id'}


## find flavor descriptors
- add flags where the descriptors are present

In [66]:
# including some basic synonyms here, there's some overlap
salty = {"salt", "salty", "sharp", "tangy", "pungent"}
sour = {"sour", "sourness", "tart", "vinegar", "vinegary"}
bitter = {"bitter", "bitterness"}
sweet = {"sweet", "sweetness", "syrupy", "syrupiness", "treacle", "treacly", "sugary", "sugariness"}
fatty = {"fat", "fats", "fatty", "fattiness", "creamy", "luscious"}
umami = {"umami", "meaty", "meatiness", "savory", "savoriness"}
acid = {"acid", "acidic", "vinegar", "vinegary", "acidy"}