In [None]:
from itertools import chain
from pathlib import Path
import pickle

import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

In [None]:
# Directory holding the pickled dataset, relative to the notebook's working directory.
datapath = Path('..') / 'data'

In [None]:
# NOTE: unpickling can execute arbitrary code; only load dataset files that
# come from a trusted source.
dataset_file = datapath / 'dataset.pkl'
with dataset_file.open('rb') as fh:
    dataset = pickle.load(fh)

In [None]:
# (number of ingredient codes, number of cuisine codes)
tuple(len(dataset[name]) for name in ('ingredient_codes', 'cuisine_codes'))

In [None]:
# Fit the binarizer on one synthetic sample that contains every index in
# range(len(ingredient_codes)), so the encoded matrix gets one column per index.
# NOTE(review): assumes recipes store ingredients as integers 0..n-1 -- confirm.
n_ingredients = len(dataset['ingredient_codes'])
encoder = MultiLabelBinarizer()
encoder.fit([range(n_ingredients)])

In [None]:
# Binarize the training and validation inputs with the fitted encoder;
# the targets are taken from the dataset unchanged.
X_train, X_val = (encoder.transform(dataset[key]) for key in ('X_train', 'X_cls_val'))
y_train, y_val = dataset['y_train'], dataset['y_cls_val']

---

In [None]:
# Random-forest baseline on the binarized ingredient features.
# random_state pins the bootstrap sampling and per-split feature sub-sampling,
# so the validation scores are reproducible across kernel restarts.
classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_val)

In [None]:
# Rows are true labels, columns are predicted labels. normalize='pred' divides
# each column by its total, so the diagonal reads as per-class precision.
confusion = confusion_matrix(y_val, y_pred, normalize='pred')
plt.matshow(confusion, vmin=0, vmax=1)
plt.title(f'{type(classifier).__name__}: validation confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.colorbar(label='Fraction of predictions')
plt.show()

In [None]:
# Validation accuracy and weighted-average precision, displayed as a tuple.
val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred, average='weighted')
val_accuracy, val_precision

In [None]:
# Logistic-regression baseline; fit() returns the estimator itself, so the
# construction and training can be chained.
classifier = LogisticRegression(C=0.1, max_iter=1000, n_jobs=-1).fit(X_train, y_train)
y_pred = classifier.predict(X_val)

In [None]:
# Rows are true labels, columns are predicted labels. normalize='pred' divides
# each column by its total, so the diagonal reads as per-class precision.
confusion = confusion_matrix(y_val, y_pred, normalize='pred')
plt.matshow(confusion, vmin=0, vmax=1)
plt.title(f'{type(classifier).__name__}: validation confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.colorbar(label='Fraction of predictions')
plt.show()

In [None]:
# Validation accuracy and weighted-average precision, displayed as a tuple.
val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred, average='weighted')
val_accuracy, val_precision

In [None]:
# Multinomial naive-Bayes baseline (alpha is the smoothing parameter).
classifier = MultinomialNB(alpha=0.1)
y_pred = classifier.fit(X_train, y_train).predict(X_val)

In [None]:
# Rows are true labels, columns are predicted labels. normalize='pred' divides
# each column by its total, so the diagonal reads as per-class precision.
confusion = confusion_matrix(y_val, y_pred, normalize='pred')
plt.matshow(confusion, vmin=0, vmax=1)
plt.title(f'{type(classifier).__name__}: validation confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.colorbar(label='Fraction of predictions')
plt.show()

In [None]:
# Validation accuracy and weighted-average precision, displayed as a tuple.
val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred, average='weighted')
val_accuracy, val_precision

In [None]:
# Linear SVM baseline; hyper-parameters are collected in one place for tuning.
svc_settings = {'C': 0.03, 'max_iter': 10000}
classifier = LinearSVC(**svc_settings)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_val)

In [None]:
# Rows are true labels, columns are predicted labels. normalize='pred' divides
# each column by its total, so the diagonal reads as per-class precision.
confusion = confusion_matrix(y_val, y_pred, normalize='pred')
plt.matshow(confusion, vmin=0, vmax=1)
plt.title(f'{type(classifier).__name__}: validation confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.colorbar(label='Fraction of predictions')
plt.show()

In [None]:
# Validation accuracy and weighted-average precision, displayed as a tuple.
val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred, average='weighted')
val_accuracy, val_precision

---

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Directory for model checkpoints; created on first use, left alone if present.
savepath = Path('..') / 'save'
savepath.mkdir(exist_ok=True)

In [None]:
class Net(nn.Module):
    """Feed-forward classifier with two 300-unit hidden layers.

    Layer order: Dropout -> Linear(in_feats, 300) -> Dropout ->
    Linear(300, 300) -> ReLU -> Dropout -> Linear(300, out_feats).

    NOTE(review): there is no activation between the first two Linear layers,
    so outside of training-time dropout they compose into a single linear map.
    This may be an omission. Inserting one would shift the Sequential indices
    and therefore the state_dict keys of previously saved checkpoints.

    Args:
        in_feats: Size of each input feature vector.
        out_feats: Number of output classes (size of the logit vector).
        p_dropout: Dropout probability shared by all three Dropout layers.
    """

    def __init__(self, in_feats, out_feats, p_dropout):
        super().__init__()
        hidden = 300
        layers = [
            nn.Dropout(p_dropout),
            nn.Linear(in_feats, hidden),
            nn.Dropout(p_dropout),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(p_dropout),
            nn.Linear(hidden, out_feats),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, X):
        """Return the unnormalized class scores (logits) for the batch X."""
        return self.linear_relu_stack(X)

In [None]:
# Both splits are converted to tensors on `device` once, up front, so batches
# are sliced from tensors that already live there.
train_tensors = TensorDataset(
    torch.tensor(X_train, dtype=torch.float, device=device),
    torch.tensor(y_train, dtype=torch.long, device=device),
)
val_tensors = TensorDataset(
    torch.tensor(X_val, dtype=torch.float, device=device),
    torch.tensor(y_val, dtype=torch.long, device=device),
)

# Training batches are reshuffled every epoch; validation order is fixed.
dataloader = DataLoader(train_tensors, batch_size=8, shuffle=True)
valloader = DataLoader(val_tensors, batch_size=64)

In [None]:
# One input feature per ingredient code, one output logit per cuisine code.
n_inputs = len(dataset['ingredient_codes'])
n_classes = len(dataset['cuisine_codes'])

net = Net(n_inputs, n_classes, p_dropout=0.5)
net = net.to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)

In [None]:
# Lowest mean validation loss seen so far in this run; a checkpoint is written
# every time it improves.
best_val_loss = float('inf')

# NOTE(review): the epoch counter starts at 20, presumably because this cell
# was re-run to continue an earlier run -- confirm. The number is only used in
# the checkpoint file name and the log line.
for epoch in range(20, 40):
    train_loss = 0.0
    val_loss = 0.0

    net.train()
    for X, y in dataloader:
        logits = net(X)
        loss = F.cross_entropy(logits, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() yields a plain Python float, so the running total does not
        # keep every batch's autograd graph alive for the whole epoch.
        train_loss += loss.item()

    net.eval()
    with torch.no_grad():
        for X, y in valloader:
            val_loss += F.cross_entropy(net(X), y).item()

    # Average of the per-batch mean losses.
    train_loss /= len(dataloader)
    val_loss /= len(valloader)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(net.state_dict(), savepath / f'_300_300_{epoch}.pt')

    print(f'Epoch {epoch} | Train loss: {train_loss} | Val loss: {val_loss}', flush=True)

In [None]:
# NOTE(review): the epoch-16 checkpoint is hard-coded. A training loop over
# range(20, 40) never writes it, so the file must come from an earlier run --
# confirm it exists before running this on a fresh checkout.
checkpoint_path = savepath / '_300_300_16.pt'

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads when `device` is different.
net.load_state_dict(torch.load(checkpoint_path, map_location=device))
net.eval()

In [None]:
# Inference only: no_grad avoids building an autograd graph for the whole
# validation set in one forward pass.
with torch.no_grad():
    y_logits = net(torch.tensor(X_val, dtype=torch.float, device=device))

# Predicted class = index of the largest logit, moved to host memory for sklearn.
y_pred = torch.argmax(y_logits, dim=-1).cpu().numpy()

In [None]:
# Rows are true labels, columns are predicted labels. normalize='pred' divides
# each column by its total, so the diagonal reads as per-class precision.
confusion = confusion_matrix(y_val, y_pred, normalize='pred')
plt.matshow(confusion, vmin=0, vmax=1)
plt.title(f'{type(net).__name__}: validation confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.colorbar(label='Fraction of predictions')
plt.show()

In [None]:
# Validation accuracy and weighted-average precision, displayed as a tuple.
val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred, average='weighted')
val_accuracy, val_precision

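## Classification task baselines

Each table reports accuracy and weighted-average precision (`precision_score(..., average='weighted')`) on the validation split (`X_cls_val` / `y_cls_val`).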

### RandomForestClassifier
n_estimators|accuracy|precision
---|---|---
100|0.6973751274209989|0.7004341328977298
200|**0.7027268093781855**|**0.7076188339183768**
500|0.7010703363914373|0.7074596677137079

### LogisticRegression
C|accuracy|precision
---|---|---
0.1|0.7396788990825688|0.7464361349930922
1|**0.7663098878695209**|**0.7641427551142695**
3|0.758664627930683|0.7560706645009344
10|0.7433741080530072|0.7413634213346916

### MultinomialNB
alpha|accuracy|precision
---|---|---
0|0.6980122324159022|0.7006106868535751
0.1|0.7463047910295617|0.7552348477734679
0.2|**0.7480886850152905**|**0.7560115265091186**
0.3|0.7466870540265036|0.7541760647567084
0.5|0.7394240570846076|0.7516239025446847
1|0.71572375127421|0.7395414152003985
2|0.6776248725790011|0.7195488889808616
3|0.6526503567787971|0.7010370839331049

### LinearSVC
C|accuracy|precision
---|---|---
1|0.7529306829765545|0.7490671014362292
0.3|0.7708970438328236|0.7674023585348014
0.1|**0.7771406727828746**|**0.7753928549218926**
0.03|0.7668195718654435|0.7681267914534073
0.01|0.7464322120285423|0.7551715358401799


### NeuralNet
architecture|accuracy|precision
---|---|---
-100-|0.7819826707441386|0.7840162397622802
-1000-|**0.7882262996941896**|**0.7878159531920711**
-300-300-|0.7777777777777778|0.7826972381549467