# 0 Import packages

In [None]:
### Load default packages
import numpy as np
import pandas as pd
import os
import re
import random
import seaborn as sns
import matplotlib.pyplot as plt
# import lightgbm as lgb
import statistics 

# set random
# Seed every random number generator from one constant so runs are repeatable.
seed = 1234
# Call the stdlib seeding function; the original assignment only rebound the
# name `random.seed` to an int and never seeded the generator.
random.seed(seed)
np.random.seed(seed)
# NOTE(review): later cells pass `random.seed` as a `random_state` value, i.e.
# they rely on this attribute being the integer 1234. Keep the alias until
# those call sites use `seed` directly, then delete this line.
random.seed = seed

# no warning
import warnings
# Install a process-wide "ignore" filter: every warning category from every
# module is suppressed from this point on.
warnings.filterwarnings('ignore')
%matplotlib inline 

# 64 bit
import sys
def is_64bit() -> bool:
    """Report whether the running interpreter is a 64-bit build.

    Returns:
        True when ``sys.maxsize`` is larger than 2**32, False otherwise.
    """
    threshold = 1 << 32  # same value as 2**32
    return sys.maxsize > threshold
is_64bit()

In [None]:
### Load embedding-SBERT packages
from sentence_transformers import SentenceTransformer
# Instantiate the 'all-mpnet-base-v2' sentence-embedding model.
# NOTE(review): in the visible part of this file `model` is only referenced by
# commented-out `model.encode(...)` calls; presumably this line can be skipped
# when pre-computed embeddings are loaded from disk — confirm before removing.
model = SentenceTransformer('all-mpnet-base-v2')

### Load t-SNE packages
from sklearn.manifold import TSNE

### Load xgboost packages
import xgboost

### Load prediction packages
from sklearn import metrics, preprocessing, linear_model, svm, gaussian_process, neighbors
from sklearn.metrics import roc_auc_score,mean_squared_error,r2_score,accuracy_score,balanced_accuracy_score,roc_curve,auc,f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
# classification
from sklearn.multiclass import OneVsRestClassifier ##### !!!! MultiClass Classifier
from sklearn.ensemble import RandomForestClassifier #Random Forest
from xgboost.sklearn import XGBClassifier #XGBoost Classifier
from sklearn.svm import SVC #Support Vector Machine
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.neighbors import KNeighborsClassifier #KNN (k-nearest neighbor)
from sklearn.linear_model import LogisticRegression #Logistic Regression
from sklearn.ensemble import GradientBoostingClassifier #Gradient Boosting Classifier
from lightgbm import LGBMClassifier #LGBM Classifier
from sklearn.naive_bayes import GaussianNB #Naive Bayes (Gaussian)
from sklearn.naive_bayes import MultinomialNB #Naive Bayes (Multinomial)
from sklearn.linear_model import SGDClassifier #Stochastic Gradient Descent Classifier
from sklearn.ensemble import VotingClassifier
# regression
from sklearn.linear_model import LinearRegression #LinearR
from sklearn.svm import SVR #Support Vector Machine
from xgboost.sklearn import XGBRegressor #XGBoost
from sklearn.ensemble import RandomForestRegressor #Random Forest
from lightgbm import LGBMRegressor #LGBM Regressor
from sklearn.neighbors import KNeighborsRegressor #KNN (k-nearest neighbor)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge 
from sklearn.linear_model import SGDRegressor #Stochastic Gradient Descent Regression
from sklearn.linear_model import ElasticNet
from sklearn.kernel_ridge import KernelRidge

# 1 Import data

In [None]:
### Import FLIP raw data
# Read the full FLIP table once, then split it into one frame per collection
# year. Each yearly frame gets a fresh 0..n-1 index.
dfFLIP = pd.read_csv('/Users/path/FLIP_20220814.csv', header=0)
df2010, df2013, df2017, df2020 = (
    dfFLIP[dfFLIP["FLIP_year"] == flip_year].reset_index(drop=True)
    for flip_year in (2010, 2013, 2017, 2020)
)

In [None]:
### Embedding Algorithm (too slow to run here, so the embeddings were computed
### on another computer and are loaded from disk below)
# (0_SBERT_processing_FLIP_20220815)
# data = data2017
# i = data['i'].to_list()
# ns = data['Product_n'].to_list()
# emb = model.encode(i,show_progress_bar=True)
# emb = model.encode(ns,show_progress_bar=True)

### Import Embedding raw data
def _load_emb(path):
    """Return the array stored under the 'emb' key of the .npz archive at *path*.

    The archive is opened in a ``with`` block so its file handle is closed as
    soon as the array has been read, instead of being left for the garbage
    collector.
    """
    with np.load(path) as archive:
        return archive['emb']


emb2017n = _load_emb('/Users/path/emb_name_FLIP_2017_20220815.npz')
emb2017i = _load_emb('/Users/path/emb_ingred_FLIP_2017_20220815.npz')
emb2017b = _load_emb('/Users/path/emb_brand_FLIP_2017_20220815.npz')

emb2020n = _load_emb('/Users/path/emb_name_FLIP_2020_20220815.npz')
emb2020i = _load_emb('/Users/path/emb_ingred_FLIP_2020_20220815.npz')
emb2020b = _load_emb('/Users/path/emb_brand_FLIP_2020_20220815.npz')

### Creating Embedding combined data for n+ingred+b, n+b, n+ingred
# can customize into emb2017ni and emb2017nbi
# NOTE(review): later cells read `emb2017ni` and `emb2020ni`, which were never
# assigned in this file. They are built here by joining the name and
# ingredient vectors column-wise. This combination rule is an assumption
# (it also assumes 2-D arrays with matching row counts) — TODO confirm against
# the original pipeline.
emb2017ni = np.concatenate((emb2017n, emb2017i), axis=1)
emb2020ni = np.concatenate((emb2020n, emb2020i), axis=1)

# 2 Visualize embeddings

In [None]:
# Select the data set and the embedding matrix to visualize.
df = df2017
emb = emb2017ni
dfset = "df2017-name/ingredient"
print(dfset)

# Successive filters: drop the "ZY"/"ZX" categories, then rows with a missing
# name, ingredient list, brand, category, or category code. Each intermediate
# frame is kept so the row counts can be reported below.
df_a = df.loc[(df["TRA_Cat"] != "ZY") & (df["TRA_Cat"] != "ZX")]
df_b = df_a.loc[df_a["Product_Name"].notna()].copy()
df_c = df_b.loc[df_b["Ingredients"].notna()].copy()
df_d = df_c.loc[df_c["Brand"].notna()].copy()
#df_c = df_c.drop_duplicates(subset='FLIP_UPC',keep='first') ######## unique UPC
df_e = df_d.loc[df_d['TRA_Cat'].notna()].copy()
df_o = df_e.loc[df_e['TRA_Cat_code'].notna()].copy()
print(
    "FLIP_raw", df.shape,
    "after remove ZY ZX", df_a.shape,
    "after Product_Name NA", df_b.shape,
    "after Ingredients NA", df_c.shape,
    "after Brand NA", df_d.shape,
    "after TRA validation NA", df_e.shape,
    "after TRA_code validation NA", df_o.shape,
)

# The surviving index labels are used as row positions into the embedding
# matrix. NOTE(review): assumes `emb` rows are in the same order as `df` rows
# — confirm.
locations = df_o.index.to_list()
emb_o = emb[locations]

# Project the embeddings to 2-D for plotting.
tsne = TSNE(random_state=1234, n_jobs=8)
x_embedded = tsne.fit_transform(emb_o)

# One colour per distinct category. `sns.set_palette` returns None, so the
# palette is built first and then both installed as the default and passed
# explicitly to the plot.
# NOTE(review): `colors` is not defined in the visible part of this file —
# confirm it is assigned before this cell runs.
categories = df_o['TRA_Cat'].to_list()
palette = sns.color_palette(colors, len(set(categories)))
sns.set_palette(palette)
plt.figure(figsize=(10,10), dpi=120)
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=x_embedded[:,0], y=x_embedded[:,1], hue=categories, legend='full', palette=palette)
plt.legend(loc='upper left')

In [None]:
# Select the data set and the embedding matrix to visualize.
df = df2020
emb = emb2020ni
dfset = "df2020-name/ingredient"
print(dfset)

# Successive filters: drop the "ZY"/"ZX" categories, then rows with a missing
# name, ingredient list, brand, category, or category code. Each intermediate
# frame is kept so the row counts can be reported below.
df_a = df.loc[(df["TRA_Cat"] != "ZY") & (df["TRA_Cat"] != "ZX")]
df_b = df_a.loc[df_a["Product_Name"].notna()].copy()
df_c = df_b.loc[df_b["Ingredients"].notna()].copy()
df_d = df_c.loc[df_c["Brand"].notna()].copy()
#df_c = df_c.drop_duplicates(subset='FLIP_UPC',keep='first') ######## unique UPC
df_e = df_d.loc[df_d['TRA_Cat'].notna()].copy()
df_o = df_e.loc[df_e['TRA_Cat_code'].notna()].copy()
print(
    "FLIP_raw", df.shape,
    "after remove ZY ZX", df_a.shape,
    "after Product_Name NA", df_b.shape,
    "after Ingredients NA", df_c.shape,
    "after Brand NA", df_d.shape,
    "after TRA validation NA", df_e.shape,
    "after TRA_code validation NA", df_o.shape,
)

# The surviving index labels are used as row positions into the embedding
# matrix. NOTE(review): assumes `emb` rows are in the same order as `df` rows
# — confirm.
locations = df_o.index.to_list()
emb_o = emb[locations]

# Project the embeddings to 2-D for plotting.
tsne = TSNE(random_state=1234, n_jobs=8)
x_embedded = tsne.fit_transform(emb_o)

# One colour per distinct category. `sns.set_palette` returns None, so the
# palette is built first and then both installed as the default and passed
# explicitly to the plot.
# NOTE(review): `colors` is not defined in the visible part of this file —
# confirm it is assigned before this cell runs.
categories = df_o['TRA_Cat'].to_list()
palette = sns.color_palette(colors, len(set(categories)))
sns.set_palette(palette)
plt.figure(figsize=(10,10), dpi=120)
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=x_embedded[:,0], y=x_embedded[:,1], hue=categories, legend='full', palette=palette)
plt.legend(loc='upper left')

# 3 TRA_Cat Algorithm

In [None]:
# Train and compare three classifiers that predict the TRA category code from
# the combined name/ingredient embeddings of the 2020 data.
df = df2020
emb = emb2020ni
dfset = "df2020-TRA_Cat-name/ingredient"
print(dfset)

# Successive filters: drop the "ZY"/"ZX" categories, then rows with a missing
# name, ingredient list, brand, category, or category code.
df_a = df.loc[(df["TRA_Cat"] != "ZY") & (df["TRA_Cat"] != "ZX")]
df_b = df_a.loc[df_a["Product_Name"].notna()].copy()
df_c = df_b.loc[df_b["Ingredients"].notna()].copy()
df_d = df_c.loc[df_c["Brand"].notna()].copy()
#df_c = df_c.drop_duplicates(subset='FLIP_UPC',keep='first') ######## unique UPC
df_e = df_d.loc[df_d['TRA_Cat'].notna()].copy()
df_o = df_e.loc[df_e['TRA_Cat_code'].notna()].copy()
print(
    "FLIP_raw", df.shape,
    "after remove ZY ZX", df_a.shape,
    "after Product_Name NA", df_b.shape,
    "after Ingredients NA", df_c.shape,
    "after Brand NA", df_d.shape,
    "after TRA validation NA", df_e.shape,
    "after TRA_code validation NA", df_o.shape,
)

# The surviving index labels are used as row positions into the embedding matrix.
locations = df_o.index.to_list()
emb_o = emb[locations]

x = emb_o
y = df_o['TRA_Cat_code']
# Use the integer `seed` directly instead of `random.seed`, which is only an
# int because an earlier cell overwrote the stdlib function.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=seed)

clf1ni = SGDClassifier(penalty='elasticnet', n_jobs=8, random_state=1234)
clf2ni = KNeighborsClassifier(n_neighbors=3)
# NOTE(review): both `n_jobs` and the legacy `nthread` are set, with different
# values — confirm which thread count is intended.
clf3ni = OneVsRestClassifier(
    xgboost.XGBClassifier(
        n_estimators=100, learning_rate=0.3, max_depth=6, subsample=1,
        gamma=0, reg_lambda=1, max_delta_step=0, colsample_bytree=1,
        min_child_weight=1, n_jobs=10, random_state=seed, reg_alpha=0,
        num_class=0, nthread=4, eval_metric='auc',
    )
)

# Start from an empty results dict so the cell runs on a fresh kernel and the
# table below only contains this cell's models.
results = {}
y_true = y_test
# (key in `results`, label used in the printed lines, estimator).
# XGBoost is fitted last, so `y_pred` ends up holding its predictions.
for result_key, label, clf in (
    ('ElasticNet', 'Elastic Net', clf1ni),
    ('KNN', 'KNN', clf2ni),
    ('XGBoost', 'XGBoost', clf3ni),
):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    balanced_acc = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    # Same two-decimal format for every model (KNN/XGBoost previously used %.2g).
    print('{} Acc: {:.2f}'.format(label, acc))
    print('{} Balanced Accuracy Score: {:.2f}'.format(label, balanced_acc))
    measures = {
        'Accuracy': acc,
        'Balanced-Accuracy': balanced_acc,
        'n_train': x_train.shape[0],
        'n': df_o.shape[0],
        'dfset': dfset,
    }
    results[result_key] = measures

results_table1ni = pd.DataFrame.from_dict(results, orient='index').reset_index()
display(results_table1ni)

In [None]:
# clf3ni
### Per-category report for the TRA_Cat predictions
from sklearn.utils.multiclass import unique_labels ##### edited labels
# Category codes found in `y`, rendered as integer strings.
TRA_Cat_list = [str(int(code)) for code in unique_labels(y)] ##### edited labels
# Plain-text report and the same report as a DataFrame.
clf3ni_traw = metrics.classification_report(y_true, y_pred) #print(t3raw)
clf3ni_t = pd.DataFrame(
    classification_report(y_true, y_pred, output_dict=True, digits=2)
)

### confusion matrix (non-normalized + normalized) clf3ni
from sklearn.metrics import ConfusionMatrixDisplay
plt.rcParams['figure.dpi'] = 300
sns.set(rc={'figure.figsize':(10,10)}, style='ticks', font="Arial", font_scale=0.75)

class_names = df2020.TRA_Cat_code
# NOTE(review): 24 hard-coded letters — presumably one per category code in
# sorted order; confirm the count and order match the classes in the data.
display_labels = list('ABCDEFGHIJKLMNOPQRSTUVWX')
#display_labels =[ '1.',  '3.',  '8'., '13'., '18'.,  '2'., 15., 19., 21.,  4., 10.,  5., 11., 22., 14., 20.,  9., 12., 23.,  6., 16.,  7.,  0., 17.]
# Per-plot settings, indexed in step with `titles_options`.
values_format = ['.0f', '.2f']
cmap = [plt.cm.Blues, plt.cm.Blues]

# (title, normalize) pairs; normalizer: 'true' 'pred' 'all' None
titles_options = [
    ("Confusion Matrix \n", None), #("Confusion Matrix \n (test sample)", None),
    ("Confusion Matrix \n", "pred"),
]

for i, (title, normalize) in enumerate(titles_options):
    disp = ConfusionMatrixDisplay.from_estimator(
        clf3ni, ###### change here
        x_test,
        y_test,
        display_labels=display_labels,
        cmap=cmap[i],
        normalize=normalize,
        values_format=values_format[i],
    )  # include_values=False, # xticks_rotation=270,
    disp.ax_.set_title(title, fontsize=13, fontweight="bold")
    disp.ax_.set_xlabel('Predicted TRA Category', fontsize=13)
    #plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    disp.ax_.set_ylabel('True TRA Category', fontsize=13)
    print(title)
    print(disp.confusion_matrix)
plt.show()

# 4 TRA_Item Algorithm

In [None]:
# Train and compare three classifiers that predict the TRA item code from the
# combined name/ingredient embeddings of the 2020 data.
df = df2020
emb = emb2020ni
dfset = "df2020-TRA_Item-name/ingredient"
print(dfset)

# Successive filters: drop the "ZY"/"ZX" items, then rows with a missing
# name, ingredient list, brand, item, or item code.
df_a = df.loc[(df["TRA_Item"] != "ZY") & (df["TRA_Item"] != "ZX")]
df_b = df_a.loc[df_a["Product_Name"].notna()].copy()
df_c = df_b.loc[df_b["Ingredients"].notna()].copy()
df_d = df_c.loc[df_c["Brand"].notna()].copy()
#df_c = df_c.drop_duplicates(subset='FLIP_UPC',keep='first') ######## unique UPC
df_e = df_d.loc[df_d['TRA_Item'].notna()].copy()
df_o = df_e.loc[df_e['TRA_Item_code'].notna()].copy()
print(
    "FLIP_raw", df.shape,
    "after remove ZY ZX", df_a.shape,
    "after Product_Name NA", df_b.shape,
    "after Ingredients NA", df_c.shape,
    "after Brand NA", df_d.shape,
    "after TRA validation NA", df_e.shape,
    "after TRA_code validation NA", df_o.shape,
)

# The surviving index labels are used as row positions into the embedding matrix.
locations = df_o.index.to_list()
emb_o = emb[locations]

x = emb_o
y = df_o['TRA_Item_code']
# Use the integer `seed` directly instead of `random.seed`, which is only an
# int because an earlier cell overwrote the stdlib function.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=seed)

# NOTE(review): `clfbb1ni` does not follow the `clfb2ni` / `clfb3ni` naming;
# the name is kept as-is in case other cells reference it.
clfbb1ni = SGDClassifier(penalty='elasticnet', n_jobs=8, random_state=1234)
clfb2ni = KNeighborsClassifier(n_neighbors=3)
# NOTE(review): both `n_jobs` and the legacy `nthread` are set, with different
# values — confirm which thread count is intended.
clfb3ni = OneVsRestClassifier(
    xgboost.XGBClassifier(
        n_estimators=100, learning_rate=0.3, max_depth=6, subsample=1,
        gamma=0, reg_lambda=1, max_delta_step=0, colsample_bytree=1,
        min_child_weight=1, n_jobs=10, random_state=seed, reg_alpha=0,
        num_class=0, nthread=4, eval_metric='auc',
    )
)

# Start from an empty results dict so the cell runs on a fresh kernel and only
# holds this cell's models.
results = {}
y_true = y_test
# (key in `results`, label used in the printed lines, estimator).
# XGBoost is fitted last, so `y_pred` ends up holding its predictions.
for result_key, label, clf in (
    ('ElasticNet', 'Elastic Net', clfbb1ni),
    ('KNN', 'KNN', clfb2ni),
    ('XGBoost', 'XGBoost', clfb3ni),
):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    balanced_acc = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    # Same two-decimal format for every model (KNN/XGBoost previously used %.2g).
    print('{} Acc: {:.2f}'.format(label, acc))
    print('{} Balanced Accuracy Score: {:.2f}'.format(label, balanced_acc))
    measures = {
        'Accuracy': acc,
        'Balanced-Accuracy': balanced_acc,
        'n_train': x_train.shape[0],
        'n': df_o.shape[0],
        'dfset': dfset,
    }
    results[result_key] = measures