[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github.com/jkanclerz/analiza-tekstu/blob/master/12-Eksploracyjna_analiza_dokumentow-Word-embedings-klasyfikacja.ipynb)

## Klasyfikacja z wykorzystaniem word embedings

In [3]:
pip install numpy tensorflow pandas

Note: you may need to restart the kernel to use updated packages.


In [5]:
!mkdir -p var
!wget http://blog.jkan.pl/polish_sentiment_dataset.csv -O var/polish_sentiment.csv

--2021-12-04 07:53:16--  http://blog.jkan.pl/polish_sentiment_dataset.csv
Resolving blog.jkan.pl (blog.jkan.pl)... 85.128.239.15
Connecting to blog.jkan.pl (blog.jkan.pl)|85.128.239.15|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 95240804 (91M) [text/csv]
Saving to: ‘var/polish_sentiment.csv’


2021-12-04 07:53:22 (16,3 MB/s) - ‘var/polish_sentiment.csv’ saved [95240804/95240804]



In [7]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

In [8]:
filename = 'var/polish_sentiment.csv'

dataset = pd.read_csv(filename, delimiter = ",")

In [9]:
dataset.describe()

Unnamed: 0,length,rate
count,762836.0,936817.0
mean,88.486888,0.58734
std,97.483536,0.797016
min,0.0,-1.0
25%,44.0,1.0
50%,63.0,1.0
75%,101.0,1.0
max,7970.0,1.0


In [10]:
dataset = dataset.drop(columns=['length'])

In [11]:
dataset = dataset[dataset['description'].notnull() & dataset['rate'].notnull() & dataset['rate'] != 0]

In [12]:
dataset['description'] = dataset['description'].str.lower()

In [13]:
dataset.rate.value_counts()

 1.0    734250
-1.0    183391
Name: rate, dtype: int64

In [14]:
len(dataset[dataset['rate'] == 0])

0

In [15]:
X = dataset['description']

In [16]:
y = dataset['rate']

In [17]:
y = y.map(lambda x: x if x == 1 else 0)

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [19]:
from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
vectorizer.adapt(text_ds)

2021-12-04 07:54:55.517715: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'i', 'w', 'polecam', 'bardzo', 'z', 'szybka', 'na', 'nie']

In [21]:
print("X_train shape: " + str(X_train.shape))
print("X_test shape: " + str(X_test.shape))
print("X_val shape: " + str(X_val.shape))
print("y_train shape: " + str(y_train.shape))
print("y_test shape: " + str(y_test.shape))
print("y_val shape: " + str(y_val.shape))

X_train shape: (587289,)
X_test shape: (183529,)
X_val shape: (146823,)
y_train shape: (587289,)
y_test shape: (183529,)
y_val shape: (146823,)


In [22]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [23]:
output = vectorizer([["lubię dhl bo szybko dostarczają paczki blef xxxxx"]])

In [24]:
output.numpy()[0, :6]

array([ 407, 1793,   79,   11, 5385,  404])

In [30]:
!rm -rf var/nkjp*

In [31]:
!wget http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-skipg-ns.txt.gz -O var/nkjp+wiki-forms-all-100-skipg-ns.txt.gz

--2021-12-04 08:01:17--  http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-skipg-ns.txt.gz
Resolving dsmodels.nlp.ipipan.waw.pl (dsmodels.nlp.ipipan.waw.pl)... 213.135.36.94
Connecting to dsmodels.nlp.ipipan.waw.pl (dsmodels.nlp.ipipan.waw.pl)|213.135.36.94|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 808466993 (771M) [application/octet-stream]
Saving to: ‘var/nkjp+wiki-forms-all-100-skipg-ns.txt.gz’


2021-12-04 08:01:39 (35,3 MB/s) - ‘var/nkjp+wiki-forms-all-100-skipg-ns.txt.gz’ saved [808466993/808466993]



In [32]:
!gzip -d var/nkjp+wiki-forms-all-100-skipg-ns.txt.gz

In [36]:
cat var/nkjp+wiki-forms-all-100-skipg-ns.txt | head -n 3

2123132 100
w 0.466284 -0.360719 0.626233 0.074990 0.511295 -0.050783 -0.284817 -0.560439 0.084301 -0.017775 -0.283048 -0.032232 0.023057 0.111658 0.115480 -0.008415 0.133235 0.205307 0.340641 -0.131821 0.384864 0.038841 0.033434 -0.563962 -0.382250 -0.008713 0.020756 -0.612251 0.236674 -0.552045 -0.433139 0.463721 0.363138 -0.150043 -0.103797 -0.088920 -0.310293 0.190513 -0.074629 0.205998 -0.116720 -0.106180 -0.029671 -0.425457 0.644615 0.224739 -0.401282 0.691186 0.280313 0.040966 -0.086397 0.116823 0.452302 0.159339 -0.065328 0.098123 -0.418321 0.436972 -0.518450 0.030116 0.170335 -0.397297 -0.113067 0.115685 -0.172756 0.137272 0.387522 -0.132569 0.668747 -0.054743 -0.306965 0.215071 0.209518 0.249928 -0.415931 0.214751 0.484958 -0.095062 0.642413 -0.470559 -0.217651 0.322925 -0.473760 -0.445890 -0.423398 0.108468 -0.381243 -0.031808 -0.354203 0.109992 0.242362 0.181020 -0.283963 -0.020929 -0.247558 0.142271 -0.090935 -0.000551 -0.695751 -0.001272
i 0.190866 -0.259831 0.293700 -0

In [37]:
path_to_glove_file = 'var/nkjp+wiki-forms-all-100-skipg-ns.txt'
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


Found 2123133 word vectors.


In [38]:
embeddings_index['lubię']

array([-7.25185e-01, -2.78460e-02,  4.74960e-02,  2.35574e-01,
       -5.23631e-01, -1.79990e-02, -3.86200e-02, -1.08883e-01,
       -3.08771e-01, -2.38550e-02,  9.63445e-01, -3.02333e-01,
        2.04415e-01, -1.75154e-01, -3.63909e-01,  1.40352e-01,
       -2.23939e-01,  4.24170e-01,  4.84920e-02, -3.24568e-01,
       -2.28869e-01, -3.01482e-01,  1.52183e-01, -1.79558e-01,
        2.15400e-03,  1.88992e-01, -2.66566e-01, -8.45700e-02,
        2.72910e-01, -3.14724e-01,  5.63602e-01, -2.10065e-01,
        5.31396e-01,  4.52114e-01, -7.42492e-01,  4.49459e-01,
       -1.35454e-01,  7.83104e-01,  1.96206e-01,  2.27563e-01,
        6.28637e-01, -3.90218e-01, -4.10153e-01,  1.44018e-01,
        4.73120e-02,  6.96420e-02,  7.58690e-02, -1.02447e-01,
        6.55840e-02,  1.87230e-02,  5.85000e-04, -2.03890e-02,
        1.75582e-01,  7.24677e-01, -3.16065e-01,  2.02726e-01,
        6.38310e-02,  5.24538e-01, -1.21259e-01, -1.80347e-01,
       -2.01140e-02,  1.51751e-01, -3.22533e-01,  1.714

In [39]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))

In [40]:
embedding_matrix.shape

(20002, 100)

In [41]:
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18290 words (1710 misses)


In [42]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [43]:
classes_count = len(list(set(y)))

In [44]:
classes_count

2

In [45]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")

embedded_sequences = embedding_layer(int_sequences_input)

x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(classes_count, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         2000200   
                                                                 
 conv1d (Conv1D)             (None, None, 128)         64128     
                                                                 
 max_pooling1d (MaxPooling1D  (None, None, 128)        0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, None, 128)        0         
 1D)                                                         

In [None]:
x_train = vectorizer(np.array([[s] for s in X_train])).numpy()
x_val = vectorizer(np.array([[s] for s in X_val])).numpy()

y_train = np.array(y_train)
y_val = np.array(y_val)

In [None]:
x_train[:1][0]

In [None]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)

In [None]:
model.fit(x_train, y_train, batch_size=64, epochs=3, validation_data=(x_val, y_val))

In [None]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)

preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    [
        [X[0]],
        [X[1]],
        ['Nie polecam tego alegrowicza'],
        ['Beznadziejny sklep. Przesłali skisłą paletkę do makijażu, bardzo kłopotliwa reklamacja: stos formularzy do wypełnienia i potem jeszcze maile z pytaniami o nr konta do zwrotu. Zrobili zwrot zamiast reklamacji, zero rekompensaty za mój stracony czas i nerwy; ich obsługa klienta to żart.'],
        ['Od miesiąca jestem systematycznie spamowany prośbami o opinię.'],
        ['Pomysł na to, żeby wysyłać jedno zamówienie na 4 produkty w dwóch osobnych paczkach, które w dodatku przychodzą w różnym czasie jest bez sensu. Szczególnie w czasach, kiedy dba się o ekologię.'],
        ['Proszę nie wysyłać mi więcej wiadomości od tej firmy Nie chce dostawać żadnych więcej meilow Zgłaszam to już wcześniej ale jak widać nikt tym się nie zajął... Porażka'],
        ['Dostawa dramat prawie tydzień oczekiwania na przesyłkę']
    ]
)

In [None]:
[class_names[np.argmax(x)] for x in probabilities]