In [None]:
from google.colab import files

# Upload kaggle.json (the Kaggle API token). The result is bound to a name
# because files.upload() returns the uploaded file contents; leaving the call
# as the cell's last expression would echo the token into the saved output.
uploaded_files = files.upload()
print('Uploaded:', ', '.join(uploaded_files))

In [None]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [None]:

# Download the files of the "whats-cooking" competition into the working directory.
!kaggle competitions download -c whats-cooking

In [None]:
!unzip -q train.json.zip -d .
!unzip -q test.json.zip -d .
!ls

In [None]:
!pip install tensorflow-text

In [None]:
!pip install tf-models-official

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import tensorflow_text
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
# Restrict TensorFlow's Python logger to messages of level ERROR.
tf.get_logger().setLevel('ERROR')

In [None]:
import os

# Choose the tf.distribute strategy for the attached accelerator.
# .get() is used instead of indexing: on a runtime without COLAB_TPU_ADDR,
# os.environ[...] raises KeyError, which made the GPU branch unreachable.
tpu_address = os.environ.get('COLAB_TPU_ADDR')

if tpu_address:
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address)
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.TPUStrategy(cluster_resolver)
  print('Using TPU')
# list_physical_devices replaces the deprecated tf.test.is_gpu_available();
# an empty list is falsy, so this branch is taken only when a GPU is visible.
elif tf.config.list_physical_devices('GPU'):
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  raise ValueError('Running on CPU is not recomended.')

In [None]:
# TF Hub handle of the BERT encoder that is fine-tuned (passed to hub.KerasLayer).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"
# TF Hub handle of the bert_en_uncased text preprocessing model (passed to hub.load).
bert_preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2"

In [None]:
# Read the extracted train and test files into DataFrames (train first).
train_df, test_df = (pd.read_json(path) for path in ('train.json', 'test.json'))

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Number of training rows per cuisine, followed by a spot check of one class.
label_count = train_df.cuisine.value_counts()
label_count.loc['italian']

In [None]:
import matplotlib.pyplot as plt
y = train_df['cuisine'].to_numpy()  # not used by this cell; kept for any later use

# Horizontal bar chart of each cuisine's share of the training set.
plt.rcdefaults()
fig, ax = plt.subplots()

cuisine = pd.unique(train_df.cuisine)
y_pos = np.arange(len(cuisine))
# Scaled to percent so the bars agree with the '%' x-axis label
# (raw fractions in [0, 1] were plotted under that label before).
occ = [100 * label_count[c] / len(train_df) for c in cuisine]

ax.barh(y_pos, occ, align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(cuisine)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('% of occurence')
ax.set_title('Popularity of cuisine in the dataset')

plt.show()

In [None]:
# Distinct cuisine names; a label's position in this array is its class index.
labels = train_df.cuisine.unique()

In [None]:
# Map each cuisine name to its position in `labels`.
labels_dict = {label: index for index, label in enumerate(labels)}

In [None]:
def ingredients_list_to_sentence(ingredients):
  """Turn an iterable of ingredient strings into one sentence.

  The ingredients are joined with ', ' and prefixed with 'cuisine with '.
  An empty iterable yields just the prefix.
  """
  return 'cuisine with ' + ', '.join(ingredients)

# Replace each ingredient list with its sentence form, in both frames.
# Only list values are converted: this cell overwrites the column in place, and
# without the guard a second run would ', '.join the characters of the
# already-built sentences.
for frame in (train_df, test_df):
  frame['ingredients'] = frame['ingredients'].apply(
      lambda ingredients: ingredients_list_to_sentence(ingredients)
      if isinstance(ingredients, list) else ingredients)

In [None]:
from IPython.display import display

# Rich display instead of print(): the frames render as tables rather than as
# plain text.
display(train_df.head(), test_df.head())

In [None]:
# Move the label and id columns out of the frames. pop() removes the column in
# place, so this cell raises KeyError if it is run a second time without
# reloading the data.
target_df = train_df.pop('cuisine')
ids = train_df.pop('id')

# Test ids are needed again when the submission frame is built.
test_ids = test_df.pop('id')

In [None]:
# Replace every cuisine name by its integer class index from labels_dict.
target_df = target_df.apply(labels_dict.__getitem__)

In [None]:
# One-hot encode the class indices: row i of the identity matrix is the one-hot
# vector of class i, so indexing it with the index array builds all rows at once.
target_np = target_df.to_numpy()
targets = np.eye(target_np.max() + 1)[target_np]

In [None]:
# Check what is left in train_df after the 'cuisine' and 'id' columns were popped.
train_df.head()

In [None]:
def make_bert_preprocess_model(sentence_features, seq_length=128):
  """Build a Keras model that maps raw string inputs to packed BERT inputs.

  Args:
    sentence_features: names for the string inputs; one scalar string Input
      is created per name.
    seq_length: forwarded as `seq_length` to the preprocessor's
      bert_pack_inputs.

  Returns:
    A tf.keras.Model from the string inputs to the packer's output.
  """
  text_inputs = []
  for feature_name in sentence_features:
    text_inputs.append(
        tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name))

  preprocessor = hub.load(bert_preprocess_url)

  tokenize = hub.KerasLayer(preprocessor.tokenize, name='tokenizer')
  token_segments = list(map(tokenize, text_inputs))

  # No separate truncation step: the tokenized segments go to the packer as is.
  pack_inputs = hub.KerasLayer(
      preprocessor.bert_pack_inputs,
      arguments={'seq_length': seq_length},
      name='packer')

  return tf.keras.Model(text_inputs, pack_inputs(token_segments))

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

def load_dataset_from_df(dataset_df, targets, batch_size,
                           bert_preprocess_model, seed=42):
  """Preprocess a frame and split it into batched train/val/test datasets.

  The examples are shuffled once, then split 60% train / 20% test / remainder
  validation. Each split is batched, cached and prefetched.

  Args:
    dataset_df: frame whose `.values` are fed to `bert_preprocess_model`.
    targets: per-example targets, aligned with the rows of `dataset_df`.
    batch_size: batch size applied to each of the three splits.
    bert_preprocess_model: callable producing the model inputs from the raw
      values.
    seed: seed of the one-off shuffle, so the split is reproducible.

  Returns:
    (train_dataset, val_dataset, test_dataset)
  """
  DATASET_SIZE = len(targets)
  dataset = tf.data.Dataset.from_tensor_slices((bert_preprocess_model(dataset_df.values), targets))
  # buffer_size=1 (the previous value) does not shuffle at all. The buffer
  # covers the whole dataset, and the order is fixed (explicit seed, no
  # reshuffling per iteration) because the splits below are take/skip views of
  # this one dataset: a different order per iteration would move examples
  # between train, validation and test.
  dataset = dataset.shuffle(buffer_size=max(DATASET_SIZE, 1), seed=seed,
                            reshuffle_each_iteration=False)

  train_size = int(0.6 * DATASET_SIZE)
  test_size = int(0.2 * DATASET_SIZE)

  train_dataset = dataset.take(train_size)
  holdout_dataset = dataset.skip(train_size)
  test_dataset = holdout_dataset.take(test_size)
  # Validation gets everything after train and test, including rounding leftovers.
  val_dataset = holdout_dataset.skip(test_size)

  train_dataset = train_dataset.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
  test_dataset = test_dataset.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
  val_dataset = val_dataset.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
  return train_dataset, val_dataset, test_dataset

In [None]:
def build_classifier_model(num_classes, seq_length=256, dropout_rate=0.5):
  """Build the BERT encoder with a dropout + softmax classification head.

  Args:
    num_classes: number of units of the final Dense layer.
    seq_length: length of the three integer input sequences; it has to equal
      the seq_length the preprocessing model was built with.
    dropout_rate: rate of the Dropout layer applied to the pooled output.

  Returns:
    A tf.keras.Model named 'prediction'. Because of the softmax activation its
    outputs are class probabilities, not logits.
  """
  # shape must be a tuple: `(256)` is just the int 256, `(256,)` is a 1-tuple.
  token_shape = (seq_length,)
  inputs = dict(
      input_word_ids=tf.keras.layers.Input(shape=token_shape, dtype=tf.int32),
      input_mask=tf.keras.layers.Input(shape=token_shape, dtype=tf.int32),
      input_type_ids=tf.keras.layers.Input(shape=token_shape, dtype=tf.int32),
  )

  # trainable=True: the encoder weights are fine-tuned together with the head.
  encoder = hub.KerasLayer(module_url, trainable=True, name='encoder')
  net = encoder(inputs)['pooled_output']
  net = tf.keras.layers.Dropout(rate=dropout_rate)(net)
  net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
  return tf.keras.Model(inputs, net, name='prediction')

In [None]:
# Set TF Hub's model load format before any hub model is loaded below.
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"

# One string input (named by the empty string) and seq_length=256, the same
# length build_classifier_model uses for its inputs.
bert_preprocess_model = make_bert_preprocess_model([''], 256)

In [None]:
from official.nlp import optimization

# Build the datasets, the model and the optimizer under the distribution
# strategy, then fine-tune.
with strategy.scope():
  batch_size = 128
  tf.config.run_functions_eagerly(False)
  train_dataset, val_dataset, test_dataset = load_dataset_from_df(
      train_df, targets, batch_size, bert_preprocess_model)

  epochs = 20
  init_lr = 1e-5
  # The datasets are already batched, so their cardinality is a number of steps.
  steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
  num_train_steps = steps_per_epoch * epochs
  num_warmup_steps = int(0.1*num_train_steps)
  validation_steps = tf.data.experimental.cardinality(val_dataset).numpy()

  # One output unit per column of the one-hot target matrix (was hard-coded to 20).
  classifier_model = build_classifier_model(targets.shape[1])

  optimizer = optimization.create_optimizer(
      init_lr=init_lr,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      optimizer_type='adamw')

  # The classifier's last layer already applies softmax, so the loss receives
  # probabilities: from_logits must be False. With True the loss is told to
  # treat values that are already softmax outputs as logits.
  loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
  metrics = tf.metrics.CategoricalAccuracy()
  classifier_model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

  history = classifier_model.fit(
      x=train_dataset,
      validation_data=val_dataset,
      steps_per_epoch=steps_per_epoch,
      epochs=epochs,
      validation_steps=validation_steps)

In [None]:
# Loss and categorical accuracy on the test split returned by load_dataset_from_df.
classifier_model.evaluate(test_dataset)

Dodawanie dodatkowych warstw Dense w modelu nigdy nie poprawiło wyniku; najlepsze wyniki uzyskano dla Dropout 0.5.

In [None]:
# Draw the model's layer graph, annotated with tensor shapes.
tf.keras.utils.plot_model(classifier_model, show_shapes=True)

In [None]:
# Plot the per-epoch loss and accuracy recorded by fit().
history_dict = history.history
print(history_dict.keys())

acc = history_dict['categorical_accuracy']
val_acc = history_dict['val_categorical_accuracy']
# Named train_loss / epoch_range so that the `loss` object and the `epochs`
# count defined by the training cell are not overwritten with plotting data.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']
epoch_range = range(1, len(acc) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# 'r' is a solid red line, 'b' a solid blue line.
loss_ax.plot(epoch_range, train_loss, 'r', label='Training loss')
loss_ax.plot(epoch_range, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epoch_range, acc, 'r', label='Training acc')
acc_ax.plot(epoch_range, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# tight_layout() arranges the axes that exist when it is called, so it has to
# come after the subplots are created and labelled (it was called on an empty
# figure before).
fig.tight_layout()
plt.show()


In [None]:
# Preprocess the competition test sentences and batch them like the training data.
submission_inputs = bert_preprocess_model(test_df.values)
submission_dataset = tf.data.Dataset.from_tensor_slices(submission_inputs).batch(batch_size)

In [None]:
# Model outputs for every row of the competition test set.
predictions = classifier_model.predict(submission_dataset)

In [None]:
# Turn each output row into the cuisine name of its highest-scoring class.
# Note that `predictions` is rebound from model outputs to label strings here.
predicted_indices = map(np.argmax, predictions)
predictions = [labels[class_index] for class_index in predicted_indices]

In [None]:
# Pair every test id with its predicted cuisine.
sub = dict(id=test_ids, cuisine=predictions)
submission = pd.DataFrame(sub)

In [None]:
# Inspect the submission frame before writing it out.
submission

In [None]:
# Write the submission without the DataFrame index column.
submission.to_csv('bert_submission.csv', index=False)

In [None]:
# Upload the CSV as a submission to the "whats-cooking" competition.
!kaggle competitions submit -f bert_submission.csv -m "Bert submission" -c whats-cooking