# Run classification models in colab
- this is mostly the same work as in `classifying_food.ipynb`, but run in Colab to accommodate the size of the data
- check out that notebook for additional EDA on these tables

In [30]:
import importlib
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

# import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error

In [2]:
# from https://medium.com/analytics-vidhya/importing-your-own-python-module-or-python-file-into-colab-3e365f0a35ec
# there's NO WAY colab needs all of those permissions
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import sys
# Put the capstone folder on Drive at the front of the module search path.
# This has to run before `import modeling` (presumably modeling.py lives in that folder — confirm).
sys.path.insert(0,"/content/drive/My Drive/GeneralAssembly/capstone")
import modeling

#### pull data & basic cleanup
- get rid of columns that I won't use
- remove the 47 rows that are missing "description" data

In [4]:
# Read the raw Food table and preview its first three rows.
food = pd.read_csv(filepath_or_buffer="/content/sample_data/Food.csv")
food.head(n=3)

Unnamed: 0,id,name,name_scientific,description,itis_id,wikipedia_id,picture_file_name,picture_content_type,picture_file_size,picture_updated_at,...,food_type,created_at,updated_at,creator_id,updater_id,export_to_afcdb,category,ncbi_taxonomy_id,export_to_foodb,public_id
0,1,Angelica,Angelica keiskei,Angelica is a genus of about 60 species of tal...,,Angelica,1.jpg,image/jpeg,111325.0,2012-04-20 09:29:57 UTC,...,Type 1,2011-02-09 00:37:14 UTC,2019-05-14 18:04:13 UTC,,2.0,False,specific,357850.0,True,FOOD00001
1,2,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,,Savoy cabbage,2.jpg,image/jpeg,155178.0,2012-04-20 09:39:54 UTC,...,Type 1,2011-02-09 00:37:15 UTC,2019-05-14 18:04:13 UTC,,,False,specific,1216010.0,True,FOOD00002
2,3,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,845789.0,Tilia tomentosa,3.jpg,image/jpeg,56367.0,2012-04-20 09:41:25 UTC,...,Type 1,2011-02-09 00:37:15 UTC,2019-05-17 16:19:45 UTC,,,False,specific,,True,FOOD00003


In [5]:
# Columns from Food.csv that are not used in this notebook.
drop_columns = [
    "itis_id",
    "picture_file_name",
    "picture_content_type",
    "picture_file_size",
    "picture_updated_at",
    "legacy_id",
    "created_at",
    "updated_at",
    "creator_id",
    "updater_id",
    "export_to_afcdb",
    "category",
    "ncbi_taxonomy_id",
    "export_to_foodb",
    "public_id",
    "food_type",
]

# Show which columns remain once the list above is removed.
print(f"keeping columns: {set(food.columns) - set(drop_columns)}")

keeping columns: {'wikipedia_id', 'name', 'id', 'food_group', 'name_scientific', 'food_subgroup', 'description'}


In [6]:
# Remove the unused columns and rebind `food` to the trimmed frame.
food = food.drop(columns=drop_columns)
food.head(n=3)

Unnamed: 0,id,name,name_scientific,description,wikipedia_id,food_group,food_subgroup
0,1,Angelica,Angelica keiskei,Angelica is a genus of about 60 species of tal...,Angelica,Herbs and Spices,Herbs
1,2,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,Savoy cabbage,Vegetables,Cabbages
2,3,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,Tilia tomentosa,Herbs and Spices,Herbs


In [7]:
# Keep only rows that have a description.
food = food.dropna(subset=["description"])
food.shape

(945, 7)

---
## clean up "food_group"

In [8]:
# Report the number of missing values in each label column.
for label_col in ("food_subgroup", "food_group"):
    print(f"nulls in '{label_col}': {food[label_col].isna().sum()}")

nulls in 'food_subgroup': 2
nulls in 'food_group': 2


In [9]:
# Discard rows missing either label column.
food = food.dropna(subset=["food_group", "food_subgroup"])
food.shape

(943, 7)

In [10]:
# Merge the two capitalizations of the herbs/spices label (plain substring replacement),
# then list the distinct groups.
food["food_group"] = food["food_group"].str.replace(
    "Herbs and spices", "Herbs and Spices", regex=False
)
food["food_group"].unique()

array(['Herbs and Spices', 'Vegetables', 'Fruits', 'Nuts',
       'Cereals and cereal products', 'Pulses', 'Teas', 'Gourds',
       'Coffee and coffee products', 'Soy', 'Cocoa and cocoa products',
       'Beverages', 'Aquatic foods', 'Animal foods',
       'Milk and milk products', 'Eggs', 'Confectioneries',
       'Baking goods', 'Dishes', 'Snack foods', 'Baby foods',
       'Fats and oils'], dtype=object)

In [11]:
# Number of distinct food groups (missing values, if any, counted as one value).
food["food_group"].nunique(dropna=False)

22

In [12]:
# Number of distinct food subgroups (missing values, if any, counted as one value).
food["food_subgroup"].nunique(dropna=False)

112

---
## prepare data for modeling
- combine text columns, except for "food_group"

In [13]:
# Every column except the target ("food_group") and the numeric key ("id") is treated as text.
text_cols = [col for col in food.columns if col not in ("food_group", "id")]
text_cols

['name', 'name_scientific', 'description', 'wikipedia_id', 'food_subgroup']

In [14]:
# Blank out remaining missing values so every text cell is a string.
food = food.fillna("")

# Concatenate the text columns of each row into one space-separated string.
food["all_text"] = food[text_cols].apply(lambda row: " ".join(row), axis=1)
food["all_text"].head(n=3)

0    Angelica Angelica keiskei Angelica is a genus ...
1    Savoy cabbage Brassica oleracea var. sabauda S...
2    Silver linden Tilia argentea Tilia tomentosa (...
Name: all_text, dtype: object

## load & clean up content data

In [15]:
# Read the raw Content table and list its columns.
content = pd.read_csv(filepath_or_buffer="/content/sample_data/Content.csv")
content.columns

  content = pd.read_csv("/content/sample_data/Content.csv")



Index(['id', 'source_id', 'source_type', 'food_id', 'orig_food_id',
       'orig_food_common_name', 'orig_food_scientific_name', 'orig_food_part',
       'orig_source_id', 'orig_source_name', 'orig_content', 'orig_min',
       'orig_max', 'orig_unit', 'orig_citation', 'citation', 'citation_type',
       'creator_id', 'updater_id', 'created_at', 'updated_at', 'orig_method',
       'orig_unit_expression', 'standard_content', 'preparation_type',
       'export'],
      dtype='object')

In [16]:
# Columns from Content.csv that are not used in this notebook.
content_drop_cols = [
    "id", "source_id", "source_type",
    "orig_content", "orig_min", "orig_max", "orig_unit",
    "orig_citation", "citation", "citation_type",
    "creator_id", "updater_id", "created_at", "updated_at",
    "orig_method", "orig_unit_expression",
    "standard_content", "preparation_type", "export",
]

# Show which columns remain once the list above is removed.
print(f"keeping columns: {set(content.columns) - set(content_drop_cols)}")

keeping columns: {'orig_source_id', 'orig_source_name', 'orig_food_common_name', 'orig_food_id', 'orig_food_scientific_name', 'orig_food_part', 'food_id'}


In [17]:
# Remove the unused columns, then count missing values in what is left.
content = content.drop(columns=content_drop_cols)
content.isna().sum()

food_id                            0
orig_food_id                 4308143
orig_food_common_name         971656
orig_food_scientific_name    5099435
orig_food_part               5104956
orig_source_id               4448551
orig_source_name             4445227
dtype: int64

In [18]:
# "orig_food_common_name" is the column that matters most here, so rows without it are dropped.
content = content.dropna(subset=["orig_food_common_name"])

## combine "food" & "content" data

In [19]:
# Inner-join content rows to their food record (content.food_id == food.id).
big_food = pd.merge(content, food, how="inner", left_on="food_id", right_on="id")
big_food.shape

(4074596, 15)

In [20]:
# List the merged frame's columns (content columns followed by food columns).
big_food.columns

Index(['food_id', 'orig_food_id', 'orig_food_common_name',
       'orig_food_scientific_name', 'orig_food_part', 'orig_source_id',
       'orig_source_name', 'id', 'name', 'name_scientific', 'description',
       'wikipedia_id', 'food_group', 'food_subgroup', 'all_text'],
      dtype='object')

In [21]:
# seems like there might be good text in these columns to add to the text I already have from food.csv
# check out the "classifying_food.ipynb" for more details about these columns
more_text_columns = ["orig_food_common_name", "orig_food_part", "orig_food_scientific_name",
                     "orig_source_name", "all_text"]

# Replace missing values with empty strings so the columns can be joined as text.
# The filled columns are assigned back to the frame instead of calling
# fillna(inplace=True) on `big_food[col]`: that chained in-place call works on an
# intermediate Series and is not guaranteed to update `big_food` (it has no
# effect when pandas Copy-on-Write is enabled).
big_food[more_text_columns] = big_food[more_text_columns].fillna("")

In [22]:
# Join the text columns of each row with single spaces. Series.str.cat does this
# column-wise, avoiding one Python-level " ".join call per row through
# DataFrame.apply(axis=1) on this multi-million-row frame. na_rep="" makes any
# missing value contribute an empty string instead of turning the result into NaN.
first_text_col, *other_text_cols = more_text_columns
big_food["all_text"] = big_food[first_text_col].str.cat(
    big_food[other_text_cols], sep=" ", na_rep=""
)

In [23]:
# Preview the combined text for the first three rows.
big_food["all_text"].head(n=3)

0    Kiwi Fruit Actinidia chinensis PLANCHON [Actin...
1    Kiwi fruit, raw   Protein, total Kiwi Actinidi...
2    Kiwi fruit, raw   Protein, total-N Kiwi Actini...
Name: all_text, dtype: object

---
## set X & y, train/test split
- there are 22 labels in the "food_group" column
- using label encoder: just assigns a numerical value to each label
- this notebook runs the classification models on the combined "content" and "food" data, so there's less of an issue with class representation; the usual train/test split is used here

In [24]:
# Build TF-IDF features from the combined text, ignoring English stop words.
# NOTE(review): the vectorizer is fit on every row before the train/test split in the
# next cell, so vocabulary/IDF statistics are learned from rows that later land in the
# test set — consider fitting on the training portion only.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_x = tfidf.fit_transform(big_food["all_text"])

In [25]:
# Encode the "food_group" labels as integers.
label_encoder = LabelEncoder()
label_enc_y = label_encoder.fit_transform(big_food["food_group"])
y = label_enc_y

# Stratified split (default test size) of the TF-IDF matrix and encoded labels.
# NOTE(review): every content row merged onto the same food carries that food's full
# `all_text`, so a row-level split can put near-identical rows in both train and test;
# this likely contributes to the perfect test score reported further down — consider
# splitting by `food_id` instead.
X_train, X_test, y_train, y_test = train_test_split(tfidf_x, y, random_state=42, stratify=y)

# Short names used later to locate saved pipelines, one per model family.
pipeline_names = ["knn", "lr", "multi", "rfc", "svc"]

In [26]:
# Baseline logistic regression; max_iter is raised to 1000 from the library default.
lr = LogisticRegression(max_iter=1000)


In [27]:
# Fit the baseline model on the TF-IDF training split.
lr.fit(X_train, y_train)

In [28]:
# Score the fitted logistic regression on both splits and keep its test predictions.
lr_score_train, lr_score_test = lr.score(X_train, y_train), lr.score(X_test, y_test)
lr_preds = lr.predict(X_test)

# Two-row frame: one row per split, tagged with the model name.
lr_df = pd.DataFrame(
    [
        {"model": "LogisticRegression", "score": split_score, "score_type": split_label}
        for split_score, split_label in (
            (lr_score_train, "training score"),
            (lr_score_test, "test score"),
        )
    ]
)

In [29]:
# Print the logistic regression scores, one split per line.
print(f"training score: {lr_score_train}\ntest score: {lr_score_test}")

training score: 1.0
test score: 1.0


In [31]:
# Pickle the fitted logistic regression so it can be reloaded without retraining.
# The target directory is created first because opening a file for writing does
# not create missing parent directories (a fresh runtime has no /content/models).
lr_model_path = Path("/content/models/lr.pkl")
lr_model_path.parent.mkdir(parents=True, exist_ok=True)
with lr_model_path.open("wb") as f:
    pickle.dump(lr, f)

In [32]:
# starting out with the defaults; fit() returns the estimator itself
knn = KNeighborsClassifier().fit(X_train, y_train)
knn

In [None]:
# Evaluate the KNN classifier on both splits.
knn_score_train = knn.score(X_train, y_train)

# Predict the test split once and derive accuracy from those predictions.
# A classifier's score() is the mean accuracy of predict(), so this gives the
# same number without a second nearest-neighbour pass over X_test.
knn_preds = knn.predict(X_test)
knn_score_test = float(np.mean(knn_preds == y_test))

# Two-row score table in the same layout as `lr_df`, labelled with this model's
# own name so its rows are not confused with the LogisticRegression rows.
knn_df = pd.DataFrame.from_dict([{
    "model": "KNeighborsClassifier",
    "score": knn_score_train,
    "score_type": "training score"
}, {
    "model": "KNeighborsClassifier",
    "score": knn_score_test,
    "score_type": "test score"
}])

## build pipelines
### references for choosing these hyperparameter options:
- [svc](https://www.baeldung.com/cs/svm-multiclass-classification) kernel options

In [None]:
# Hyperparameter candidates for every pipeline step, keyed as "<step>__<parameter>".
pipe_params = dict(
    # TF-IDF vectorizer
    tfidf__max_features=[None, 250, 300],
    tfidf__min_df=[1, 0.01, 0.05],
    tfidf__max_df=[1.0, 0.98, 0.95, 0.9],
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    # logistic regression
    lr__solver=["lbfgs", "liblinear"],
    lr__C=[1.0, 0.1, 10],
    # multinomial naive Bayes
    multi__alpha=[1.0, 0.5],
    # random forest
    rfc__n_estimators=[100, 50, 150],
    rfc__max_depth=[None, 5, 10],
    rfc__min_samples_split=[2, 5, 7],
    rfc__min_samples_leaf=[1, 2, 3],
    rfc__bootstrap=[True, False],
    # k-nearest neighbours
    knn__n_neighbors=range(1, 11),
    knn__weights=["uniform", "distance"],
    knn__p=[1, 2],
    # support vector classifier
    svc__C=[1.0, 0.1, 10],
    svc__kernel=["poly", "rbf"],
)

In [None]:
# and the components
# Each tuple is a (step name, estimator) pair; the step names match the prefixes
# used in the `pipe_params` keys.
vectorizer = ("tfidf", TfidfVectorizer(stop_words="english"))
lr_tuple = ("lr", LogisticRegression(max_iter=1000))
multi_tuple = ("multi", MultinomialNB())
# random_state is fixed (as it is for SVC below) so the forest's bootstrap and
# feature sampling are reproducible from run to run.
rand_forest_tuple = ("rfc", RandomForestClassifier(n_jobs=-1, random_state=42))
knn_tuple = ("knn", KNeighborsClassifier())
svc_tuple = ("svc", SVC(random_state=42))

# One entry per model family; each is paired with `vectorizer` to form a pipeline.
model_options = [lr_tuple, multi_tuple, rand_forest_tuple, knn_tuple, svc_tuple]

In [None]:
# Every pipeline built below starts with a TfidfVectorizer step, so it has to be
# fit on raw text. `X_train` is already a TF-IDF matrix and cannot be vectorized
# a second time. Split the raw text with the same random_state / stratify
# arguments as the vectorized split so the training rows line up with
# X_train / y_train.
text_train, text_test, text_y_train, text_y_test = train_test_split(
    big_food["all_text"], y, random_state=42, stratify=y
)

# Run the search for each model family and collect what the helper returns.
# `modeling.train_save_best_model` is defined outside this notebook; presumably it
# grid-searches the pipeline and pickles the result to `file_path` — confirm.
big_estimators = []
for model in model_options:
    pipe_components = [vectorizer, model]
    print(f"starting {model[0]} grid search")
    est = modeling.train_save_best_model(pipe=pipe_components, pipe_params=pipe_params, X_train=text_train,
                                   y_train=text_y_train, file_path=f"/content/models/big_{model[0]}.pkl")
    big_estimators.append(est)

starting lr grid search
again!
Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [None]:
# use `fetch_fitted_pipeline` to fetch previously-fitted pipeline
# NOTE(review): `X` is not assigned in any cell visible in this notebook (the feature
# objects here are `tfidf_x`, `X_train` and `X_test`), so on a fresh kernel this cell
# presumably fails with a NameError — confirm which data it is meant to use.
# NOTE(review): each fetched object is fit again and then scored on the same X, y it
# was just fit on, so `scores` holds training-set scores rather than held-out scores.
# NOTE(review): files are read from the relative "models/" directory with no "big_"
# prefix, while the cells above write to "/content/models/" — verify these are the
# intended artifacts.
scores = {}
for model_type in pipeline_names:
    gs = modeling.fetch_fitted_pipeline(f"models/{model_type}.pkl")
    gs.fit(X, y)
    scores[model_type] = gs.score(X, y)

# Display the per-model scores.
scores

{'knn': 1.0,
 'lr': 0.9968186638388123,
 'multi': 0.8494167550371156,
 'rfc': 1.0,
 'svc': 1.0}