# AI Center Lipik - NLP zadatak od Darija Karla (Klasifikacija na razini tokena)

In [None]:
!pip install tensorflow
!pip install spacy

In [None]:
import tensorflow as tf
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

import spacy

import sklearn
import scipy
import os
import json
import csv
import string

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pwd

/content


In [None]:
# First dataset we need: a one-column list of food-related words.
# The file appears to have no header row: with the default header handling the
# first word ("a-yeast") was consumed as the column name and lost as data.
# header=None keeps it as a regular row; the column is still named "a-yeast"
# so every later lookup on df["a-yeast"] keeps working unchanged.
df = pd.read_csv("drive/MyDrive/food-related.csv", header=None, names=["a-yeast"])

In [None]:
df.head()

Unnamed: 0,a-yeast
0,aai
1,abalone
2,abba-zaba
3,abbreviations
4,abietate


In [None]:
df.tail()

Unnamed: 0,a-yeast
7126,zinger
7127,zip-a-dee
7128,ziploc
7129,zodicarbonamide
7130,zucchini


In [None]:
df.shape

(7131, 1)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7131 entries, 0 to 7130
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   a-yeast  7129 non-null   object
dtypes: object(1)
memory usage: 55.8+ KB


In [None]:
# Remove repeated words first, then the rows that hold no value at all.
df = df.drop_duplicates().dropna()

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7122 entries, 0 to 7130
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   a-yeast  7122 non-null   object
dtypes: object(1)
memory usage: 111.3+ KB


In [None]:
df.value_counts()

a-yeast                       
TRUE                              1
pasteurize                        1
patties                           1
patrially                         1
patients                          1
                                 ..
fatgcholestrol                    1
fatgcholesterolmgsodiummgtotal    1
fatgcholesterolmgsodium           1
fatgcholesterolgsodium            1
zucchini                          1
Length: 7122, dtype: int64

In [None]:
# Second dataset we need
with open("drive/MyDrive/TM-2-2020/data/food-ordering.json") as food_ordering_file:
    food_related = json.load(food_ordering_file)

In [None]:
# Third dataset we need
with open("drive/MyDrive/TM-2-2020/data/restaurant-search.json") as restaurant_search_file:
    restaurant_search = json.load(restaurant_search_file)

In [None]:
len(food_related), len(restaurant_search)

(1050, 3276)

In [None]:
### READING THE JSON:
def read_food_json(filepath):
    """Read a conversation JSON file and flatten it to one entry per utterance.

    The file must contain a list of conversations, each with an 'utterances'
    list. Every utterance has a 'text' and may have 'segments'; each segment
    has a 'text' and a list of 'annotations', each carrying a 'name'.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A tuple ``(texts, foods, labels)`` of three equally long lists:
        ``texts[i]`` is the utterance text, ``foods[i]`` the list of its
        segment texts and ``labels[i]`` the list of annotation names collected
        from all of its segments. A segment with several annotations
        contributes several names, so ``labels[i]`` can be longer than
        ``foods[i]``. Utterances without usable segments get two empty lists.

    Raises:
        KeyError: If a conversation has no 'utterances' or an utterance has
            no 'text'.
    """
    # Explicit encoding: without it the platform default decides how the file
    # is decoded, which can differ between machines.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)

    texts = []
    foods = []
    labels = []
    for conversation in data:
        for utterance in conversation['utterances']:
            texts.append(utterance['text'])

            segment_texts = []
            annotation_names = []
            try:
                for segment in utterance['segments']:
                    segment_texts.append(segment['text'])
                    annotation_names.extend(
                        annotation['name'] for annotation in segment['annotations']
                    )
            except KeyError:
                # No 'segments' key (the common case for unannotated
                # utterances) or a segment missing an expected key: record the
                # utterance as unannotated and drop anything partially read.
                segment_texts = []
                annotation_names = []
            foods.append(segment_texts)
            labels.append(annotation_names)

    # The three lists are parallel: exactly one entry per utterance.
    assert len(texts) == len(foods) == len(labels)
    return texts, foods, labels

In [None]:
# fo_texts is short for "food-ordering texts"; the other names follow the same pattern.
fo_texts, fo_foods, fo_labels = read_food_json("drive/MyDrive/TM-2-2020/data/food-ordering.json")

In [None]:
rs_texts, rs_foods, rs_labels = read_food_json("drive/MyDrive/TM-2-2020/data/restaurant-search.json")

In [None]:
# fo_texts will be the model input (after it has been preprocessed). The remaining fo_foods, fo_labels, rs_texts, rs_foods and rs_labels will not be used.
print(type(fo_texts))
fo_texts[:5]

<class 'list'>


['Hi.',
 'How can I help you?',
 'What would you like to order in Barbecue?',
 "Yeah, I'd like to get a rack of ribs.",
 'What kind of sides can I get with that?']

In [None]:
fo_foods[:5]

[[], [], ['Barbecue'], ['a rack of ribs'], []]

In [None]:
fo_labels[:5]

[[], [], ['food_order.type.food'], ['food_order.name.item'], []]

In [None]:
len(fo_texts)

13953

In [None]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


## Restartaj runtime

In [None]:
nlp = spacy.load("en_core_web_lg")

In [None]:
# Prvo ćemo pripremiti oznake/labele/targete, tj. ono što želimo da model predviđa, ono što će biti njegov izlaz. Njih ćemo spremiti u listu fo_texts_labels.

In [None]:
# Build the per-token targets: 1 if the token text is in the food word list, otherwise 0.
# "sequence" could also be called sentence or row; it is one data point/instance/sample of fo_texts.
# The word list is converted to a set once, outside the loops. The previous
# `token.text in df["a-yeast"].values` test scanned the whole array for every
# single token, which contributed to the ~2 min 20 s this cell took on Colab.
# NOTE(review): the match is exact and case-sensitive (e.g. "Barbecue" vs
# "barbecue"), confirm that this is the intended labelling rule.
food_words = set(df["a-yeast"].values)

fo_texts_labels = []
for sequence in fo_texts:
  doc = nlp(sequence)
  fo_texts_labels.append([1 if token.text in food_words else 0 for token in doc])

In [None]:
fo_texts_labels[:5]

[[0, 0],
 [0, 1, 0, 1, 0, 0],
 [0, 0, 0, 1, 1, 1, 1, 0, 0],
 [0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0],
 [0, 1, 1, 0, 1, 0, 1, 0, 0, 0]]

In [None]:
len(fo_texts_labels)

13953

In [None]:
max_seq_len = len(fo_texts_labels[0])

In [None]:
# Walk over the remaining rows and keep the largest row length seen so far.
for sequence in fo_texts_labels[1:]:
  max_seq_len = max(max_seq_len, len(sequence))

In [None]:
max_seq_len

116

In [None]:
# Padding: fill the rows of "fo_texts_labels" with twos (2) so that every row has the same number of columns, namely max_seq_len (116).
fo_texts_labels_padded = [
  sequence if len(sequence) >= max_seq_len
  else sequence + [2] * (max_seq_len - len(sequence))
  for sequence in fo_texts_labels
]

In [None]:
# Ispisuje broj redaka
len(fo_texts_labels_padded)

13953

In [None]:
# Ispisuje broj stupaca
len(fo_texts_labels_padded[0])

116

### Oznake/labele su pripremljene, sada je još potrebno pripremiti X, tj. ulaz u naš model, a to će se napraviti tako da se vektorizira text iz varijable/liste "fo_texts"

In [None]:
food_order_df = pd.DataFrame({"texts": fo_texts})

In [None]:
food_order_df.head()

Unnamed: 0,texts
0,Hi.
1,How can I help you?
2,What would you like to order in Barbecue?
3,"Yeah, I'd like to get a rack of ribs."
4,What kind of sides can I get with that?


In [None]:
# Placeholder for tokens that are not in the vocabulary
def get_placeholder_vector(d):
  """Return an all-ones float vector of length ``d``."""
  return np.full(d, 1.0)

In [None]:
def spacy_parser(nlp, text):
    """Tokenize ``text`` with ``nlp`` and collect one vector per token.

    Args:
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``text``, ``has_vector`` and ``vector`` (a spaCy pipeline, see
            https://spacy.io/api/doc and https://spacy.io/api/token).
        text: The string to process.

    Returns:
        A tuple ``(tokens, vectors)`` of two parallel lists: the token texts
        and, for each token, either its own vector or the placeholder vector
        when the token has none.
    """
    # Background on vectors: https://spacy.io/usage/linguistic-features#vectors-similarity
    # If very few tokens have vectors: https://spacy.io/usage/linguistic-features#adding-vectors
    token_texts, token_vectors = [], []
    for tok in nlp(text):
        token_texts.append(tok.text)
        # 300 is the placeholder length passed for tokens that have no vector.
        token_vectors.append(tok.vector if tok.has_vector else get_placeholder_vector(300))
    return token_texts, token_vectors

In [None]:
# This takes about 2 minutes on Colab
# The text column is iterated directly: iterrows() built a full Series for
# every row only to read this one field back out of it.
tokens = []      # per text: list of token strings
embeddings = []  # per text: list of embeddings, one per token
for text in food_order_df["texts"]:
    t, v = spacy_parser(nlp, text)
    tokens.append(t)
    embeddings.append(v)

In [None]:
#Broj redaka (treba bi biti isto kao i prije)
len(embeddings)

13953

In [None]:
#Broj tokena (to su embeddinzi, ali lakše je za ovaj primjer o njima razmišljati kao o tokenima) za prvi data point. Za prvi data point treba biti 2 zato što imamo "Hi.", a to su dva tokena: "Hi" i "."
len(embeddings[0])

2

In [None]:
#Svaki od njih ima dimenziju 300 zato što koristimo "en_core_web_lg" model čiji embeddinzi su dimenzije 300. Model "en_core_web_sm" koristi 96-dimenzijske embeddinge
np.array(embeddings[0]).shape

(2, 300)

In [None]:
embedding_size=300

In [None]:
# Pad again up to max_seq_len, this time the embeddings: every text becomes a
# (max_seq_len, embedding_size) block whose unused trailing rows are zeros.
# Each block is preallocated and the token vectors are copied into its head,
# instead of building Python lists of zeros row by row and concatenating them.
# That also handles a text with zero tokens, where concatenating an empty list
# with the 2-D padding would have failed.
# float32 is used on purpose: mixing the vectors with integer padding lists
# appears to have upcast the whole array to float64, doubling its size.
# A text with more than max_seq_len tokens raises ValueError here.
padded_embeddings = []
for embedding in embeddings:
  padded = np.zeros((max_seq_len, embedding_size), dtype=np.float32)
  if len(embedding) > 0:
    padded[:len(embedding)] = embedding
  padded_embeddings.append(padded)

In [None]:
padded_embeddings = np.array(padded_embeddings)

In [None]:
#Broj primjera x broj vremenskih koraka x broj značajki u svakom vremenskom koraku
padded_embeddings.shape

(13953, 116, 300)

In [None]:
#Provjera jesu li manji floatovi u embeddingu
padded_embeddings[0][0][:10]

array([ 0.028796  ,  0.41306001, -0.46689999, -0.078175  ,  0.37057999,
        0.12867001,  0.47714001, -0.92372   , -0.067789  ,  0.62380999])

In [None]:
fo_texts_labels_padded = np.array(fo_texts_labels_padded)
fo_texts_labels_padded.shape

(13953, 116)

In [None]:
# The labels need the same 3-D layout: a trailing axis of size 1 is added after the first two axes
fo_texts_labels_padded = fo_texts_labels_padded.reshape(*fo_texts_labels_padded.shape[:2], 1)

In [None]:
fo_texts_labels_padded.shape

"padded_embeddings" nam je ulaz u model (X), a fo_texts_labels_padded su oznake/labele (y)

In [None]:
# Shorter names for convenience
X = padded_embeddings
y = fo_texts_labels_padded

In [None]:
# Save the input (X) and the output (y) so that the cells above do not have to be re-run later; they can simply be loaded from disk (next cell)
np.save("drive/MyDrive/X_food.npy", X)
np.save("drive/MyDrive/y_food.npy", y)

In [None]:
# Load the previously saved data
X = np.load("drive/MyDrive/X_food.npy")
y = np.load("drive/MyDrive/y_food.npy")

In [None]:
X.shape, y.shape

((13953, 116, 300), (13953, 116, 1))

In [None]:
n_classes = len(np.unique(y))
n_classes

3

In [None]:
model = tf.keras.Sequential()
# The input shape is the shape of a single example (instance/observation/sample/row), i.e. X.shape[1:], which is (116, 300) for the 13,953 examples here.
# It is declared with an explicit Input layer: an input_shape given to the LSTM *inside* Bidirectional does not appear to be picked up by Sequential (the wrapper, not the LSTM, is the first layer), which left the model unbuilt until fit().
model.add(tf.keras.Input(shape=X.shape[1:]))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
# The TimeDistributed wrapper applies the wrapped layer, here Dense, to every time step (116 of them).
# One output unit per class found in y (0, 1 and the padding value 2), taken from n_classes instead of a hard-coded 3.
# There is no softmax: the layer emits raw logits and the loss below is created with from_logits=True.
model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_classes)))
opt = tf.keras.optimizers.Adam(learning_rate=.0003)
# SparseCategoricalCrossentropy because the values in y are integer/label encoded, not one-hot encoded
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# NOTE(review): padded time steps (label 2) are trained on and counted by
# "accuracy" exactly like real tokens, which inflates the metric; consider a
# Masking layer or per-step sample weights.
model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])

In [None]:
epochs = 1
batch_size = 32

In [None]:
history = model.fit(X, y, epochs=epochs, batch_size=batch_size, validation_split=.3)

