[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/jkanclerz/analiza-dokumentow/blob/main/41--sentiment-word2vec.ipynb)

## Klasyfikacja z wykorzystaniem word embeddings

In [None]:
pip install numpy tensorflow pandas

In [None]:
# Create the local data directory (-p: no error if it already exists).
!mkdir -p var
# Download the sentiment CSV and store it as var/polish_sentiment.csv.
!wget http://blog.jkan.pl/polish_sentiment_dataset.csv -O var/polish_sentiment.csv

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras


In [None]:
# Path of the CSV written by the download cell.
filename = 'var/polish_sentiment.csv'

# Parse the comma-separated file into a DataFrame.
dataset = pd.read_csv(filename, sep=",")

In [None]:
# Display summary statistics of the freshly loaded frame.
dataset.describe()

In [None]:
# Remove the 'length' column, which the steps below do not need.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
dataset = dataset.drop(columns=['length'], errors='ignore')

In [None]:
# Keep only rows that have a description, have a rating, and whose rating is
# not 0. The comparison must be parenthesised: `&` binds tighter than `!=`,
# so the unparenthesised form is evaluated as `(a & b & rate) != 0` instead
# of combining three independent conditions.
dataset = dataset[
    dataset['description'].notnull()
    & dataset['rate'].notnull()
    & (dataset['rate'] != 0)
]

In [None]:
# Lower-case every description; assign() returns a new frame with the
# 'description' column replaced, and the name `dataset` is rebound to it.
dataset = dataset.assign(description=dataset['description'].str.lower())

In [None]:
# Show how many rows carry each rating value (class balance check).
dataset['rate'].value_counts()

In [None]:
# Number of rows whose rating equals 0 (sanity check of the filter above).
int((dataset['rate'] == 0).sum())

In [None]:
# Model input: the review text column.
X = dataset['description']

In [None]:
# Raw target: the rating column (binarized in the next cell).
y = dataset['rate']

In [None]:
# Binarize the target: a rating equal to 1 is kept as is, every other rating
# becomes 0, so the labels take two values.
y = y.map(lambda x: x if x == 1 else 0)

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of all samples as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second split: carve 20% of the remainder off as the validation set.
# random_state is fixed in both calls so the partitions are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.2, random_state=42
)

In [None]:
from tensorflow.keras.layers import TextVectorization

# Text -> integer-id layer: vocabulary capped at 20000 tokens, every sample
# padded/truncated to 200 ids.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
# The vocabulary is learned from the training split only, fed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Peek at the first ten entries of the learned vocabulary.
vectorizer.get_vocabulary()[:10]

In [None]:
# Report the shape of every split, one line per array, in a fixed order.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("X_val", X_val),
    ("y_train", y_train),
    ("y_test", y_test),
    ("y_val", y_val),
):
    print(f"{label} shape: {part.shape}")

In [None]:
# Vocabulary list as learned by the vectorizer; a token's position is its id.
voc = vectorizer.get_vocabulary()
# Reverse lookup: token -> integer id.
word_index = {token: idx for idx, token in enumerate(voc)}

In [None]:
# Vectorize a single sample sentence to inspect the ids the layer produces.
output = vectorizer([["lubię dhl bo szybko dostarczają paczki blef xxxxx"]])

In [None]:
# First six token ids of the sample sentence.
output.numpy()[0, :6]

In [None]:
# Delete any previously downloaded or extracted embedding files matching
# var/nkjp* so the download and gzip cells below start from a clean state.
!rm -rf var/nkjp*

In [None]:
# Download the gzipped pretrained word-vector file into var/.
# (The file name suggests 100-dimensional skip-gram vectors - see the preview cell.)
!wget http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-skipg-ns.txt.gz -O var/nkjp+wiki-forms-all-100-skipg-ns.txt.gz

In [None]:
# Decompress the archive; gzip -d replaces the .gz with the plain .txt file.
!gzip -d var/nkjp+wiki-forms-all-100-skipg-ns.txt.gz

In [None]:
cat var/nkjp+wiki-forms-all-100-skipg-ns.txt | head -n 3

In [None]:
# Load the pretrained vectors into a dict: word -> 1-D float32 numpy array.
path_to_glove_file = 'var/nkjp+wiki-forms-all-100-skipg-ns.txt'
embeddings_index = {}
# The file holds Polish words, so decode it explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line_no, line in enumerate(f):
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            # Blank line or a lone token without coefficients: nothing to store.
            continue
        word, coefs = parts
        if line_no == 0 and word.isdigit() and coefs.strip().isdigit():
            # A first line made of exactly two integers is a word2vec-style
            # "<vocabulary size> <dimension>" header, not a word vector.
            continue
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))


In [None]:
# Spot-check: display the pretrained vector stored for the word 'lubię'.
embeddings_index['lubię']

In [None]:
# Width of one embedding vector.
# NOTE(review): presumably must match the vector length in the embedding file - confirm.
embedding_dim = 100
# Rows of the embedding matrix: vocabulary size plus two spare rows.
num_tokens = len(voc) + 2
# Counters for vocabulary words found / not found among the pretrained vectors.
hits = misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros(shape=(num_tokens, embedding_dim))

In [None]:
# Expect (num_tokens, embedding_dim).
embedding_matrix.shape

In [None]:
# Copy the pretrained vector of every vocabulary word into its matrix row.
# The counters are reset here so that re-running this cell does not add to
# the totals of a previous run.
hits = 0
misses = 0

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in the embedding index keep their all-zeros row.
        # This includes the representation for "padding" and "OOV".
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
from tensorflow.keras.layers import Embedding

# Embedding layer initialised from the pretrained matrix; trainable=False
# keeps those weights fixed during training.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [None]:
# Number of distinct label values; used as the width of the output layer.
classes_count = len(set(y))

In [None]:
# Display the number of classes.
classes_count

In [None]:
from tensorflow.keras import layers

# Input: variable-length sequences of integer token ids.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")

# Map token ids to vectors with the pretrained embedding layer.
embedded_sequences = embedding_layer(int_sequences_input)

# Three Conv1D stages (128 filters, kernel size 5); the first two are each
# followed by max pooling with window 5, the last by global max pooling that
# collapses the sequence axis into one 128-wide feature vector.
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
# Classifier head: dense layer, dropout of 0.5, then a softmax with one
# output per class.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(classes_count, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

In [None]:
def _to_token_ids(texts):
    """Run the vectorizer over `texts` (one string per row) and return a numpy array of ids."""
    column = np.array([[s] for s in texts])
    return vectorizer(column).numpy()


# Integer-encoded inputs for the training and validation splits.
x_train = _to_token_ids(X_train)
x_val = _to_token_ids(X_val)

# Labels as plain numpy arrays.
y_train = np.array(y_train)
y_val = np.array(y_val)

In [None]:
# Token ids of the first training sample.
x_train[0]

In [None]:
# Integer class labels + softmax output -> sparse categorical cross-entropy;
# accuracy is tracked under the metric name "acc".
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

In [None]:
# Train for 3 epochs in batches of 64, evaluating on the validation split
# after each epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=3,
    validation_data=(x_val, y_val),
)

In [None]:
# Wrap vectorizer + trained model into one model that accepts raw strings.
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)

preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

# Predict class probabilities for two dataset samples and a few hand-written
# reviews. The dataset samples are taken by position (.iloc): X keeps the
# original row labels after filtering, so X[0] / X[1] would raise KeyError if
# those rows had been dropped.
probabilities = end_to_end_model.predict(
    [
        [X.iloc[0]],
        [X.iloc[1]],
        ['Nie polecam tego alegrowicza'],
        ['Beznadziejny sklep. Przesłali skisłą paletkę do makijażu, bardzo kłopotliwa reklamacja: stos formularzy do wypełnienia i potem jeszcze maile z pytaniami o nr konta do zwrotu. Zrobili zwrot zamiast reklamacji, zero rekompensaty za mój stracony czas i nerwy; ich obsługa klienta to żart.'],
        ['Od miesiąca jestem systematycznie spamowany prośbami o opinię.'],
        ['Pomysł na to, żeby wysyłać jedno zamówienie na 4 produkty w dwóch osobnych paczkach, które w dodatku przychodzą w różnym czasie jest bez sensu. Szczególnie w czasach, kiedy dba się o ekologię.'],
        ['Proszę nie wysyłać mi więcej wiadomości od tej firmy Nie chce dostawać żadnych więcej meilow Zgłaszam to już wcześniej ale jak widać nikt tym się nie zajął... Porażka'],
        ['Dostawa dramat prawie tydzień oczekiwania na przesyłkę']
    ]
)

In [None]:
# Human-readable names for the two label values produced by the binarization
# step: index 0 = rating other than 1, index 1 = rating equal to 1.
# Defined here because no earlier cell creates `class_names`.
# NOTE(review): presumably a rating of 1 marks a positive review - confirm against the dataset.
class_names = ["negative", "positive"]

# Most probable class name for every predicted sample.
[class_names[np.argmax(p)] for p in probabilities]