<a href="https://colab.research.google.com/github/kurek0010/neural-network-course/blob/master/07_rnn/02_text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import numpy as np

from tensorflow.keras.layers import Dense, Embedding, Flatten, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
!wget https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
!unzip -q reviews.zip

--2025-02-13 17:03:18--  https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.2.207, 142.250.141.207, 74.125.137.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.2.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42878657 (41M) [application/x-zip-compressed]
Saving to: ‘reviews.zip’


2025-02-13 17:03:22 (15.1 MB/s) - ‘reviews.zip’ saved [42878657/42878657]



In [3]:
# Load the training reviews: every .txt file under <data_dir>/train/{neg,pos} becomes
# one entry of train_texts, with a parallel label in train_labels (0 = neg, 1 = pos).
data_dir = './reviews'
train_dir = os.path.join(data_dir, 'train')

train_texts = []
train_labels = []

# enumerate() yields the label directly: 'neg' -> 0, 'pos' -> 1.
for label, label_type in enumerate(['neg', 'pos']):
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the sample order
    # identical on every machine and every run.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # `with` closes the file even if read() raises. The encoding is pinned so the
        # result does not depend on the platform default.
        # NOTE(review): assumes the review files are UTF-8 -- confirm for this dataset.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            train_texts.append(f.read())
        train_labels.append(label)

In [4]:
# Load the test reviews: every .txt file under <data_dir>/test/{neg,pos} becomes one
# entry of test_texts, with a parallel label in test_labels (0 = neg, 1 = pos).
test_dir = os.path.join(data_dir, 'test')

test_texts = []
test_labels = []

# enumerate() yields the label directly: 'neg' -> 0, 'pos' -> 1.
for label, label_type in enumerate(['neg', 'pos']):
    dir_name = os.path.join(test_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the sample order
    # identical on every machine and every run.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # `with` closes the file even if read() raises. The encoding is pinned so the
        # result does not depend on the platform default.
        # NOTE(review): assumes the review files are UTF-8 -- confirm for this dataset.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            test_texts.append(f.read())
        test_labels.append(label)

In [5]:
# Peek at the first ten raw training reviews.
train_texts[:10]

['"Hak Hap", or "Black Mask" (in english) was a disappointment. I was told that it was a sort of "Japanese version of the Matrix". Imagine my disappointment. The film was either badly dubbed or the soundtrack didn\'t time well with the film. Another thing is that the dialogue was pretty much bad. There was very little thought put into the English version of this film and it appeals only to the "senseless action" genre. Not a film I would want to see again.',
 'LOC could have been a very well made movie on how the Kargil war was fought; it had the locations, the budget, and the skill to have been India\'s "Saving Private Ryan" or "Black Hawk Down". Instead it come across as a bloated, 4 hour bore of trying to meld the war move with the masala movie. Even the war scenes were terribly executed, using the same hill in all their battle scenes, and spending unnecessary time on casual talk. Instead of trying to appeal to the indian public, a better movie would have been a to-the-book account 

In [6]:
# First ten labels: all 0, because the 'neg' folder is read before 'pos'.
train_labels[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [7]:
# Last ten labels: all 1, because the 'pos' folder is read last.
train_labels[-10:]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [8]:
# Preprocessing / model hyperparameters shared by the cells below.
maxlen = 100   # reviews are cut down to 100 words
num_words = 10000    # the 10000 most frequently occurring words
embedding_dim = 100

# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

In [9]:
# Show the first 20 (index, word) pairs of the fitted vocabulary.
list(tokenizer.index_word.items())[:20]

[(1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it'),
 (10, 'i'),
 (11, 'this'),
 (12, 'that'),
 (13, 'was'),
 (14, 'as'),
 (15, 'for'),
 (16, 'with'),
 (17, 'movie'),
 (18, 'but'),
 (19, 'film'),
 (20, 'on')]

In [10]:
# Turn every training review into a list of integer word indices; print the first three.
sequences = tokenizer.texts_to_sequences(train_texts)
print(sequences[:3])

[[39, 325, 2373, 8, 628, 13, 3, 1383, 10, 13, 576, 12, 9, 13, 3, 429, 4, 857, 307, 4, 1, 2652, 835, 58, 1383, 1, 19, 13, 342, 906, 2270, 39, 1, 812, 158, 55, 70, 16, 1, 19, 157, 151, 6, 12, 1, 411, 13, 181, 73, 75, 47, 13, 52, 114, 194, 273, 80, 1, 628, 307, 4, 11, 19, 2, 9, 6436, 61, 5, 1, 4252, 202, 509, 21, 3, 19, 10, 59, 178, 5, 64, 171], [97, 25, 74, 3, 52, 70, 90, 17, 20, 86, 1, 322, 13, 5074, 9, 66, 1, 1975, 1, 349, 2, 1, 2696, 5, 25, 74, 1898, 1951, 2374, 39, 325, 177, 302, 9, 213, 635, 14, 3, 467, 531, 2682, 4, 266, 5, 1, 322, 844, 16, 1, 17, 57, 1, 322, 136, 68, 1899, 2137, 769, 1, 169, 2186, 8, 29, 65, 982, 136, 2, 3417, 1740, 55, 20, 5674, 735, 302, 4, 266, 5, 1268, 5, 1, 1391, 1067, 3, 125, 17, 59, 25, 74, 3, 5, 1, 271, 2640, 4, 48, 571, 30, 37, 325, 177, 39, 57, 3417, 55, 20, 1, 210, 4, 646, 37, 57, 125, 9, 97, 25, 340, 3, 3264, 164, 37, 7916, 5, 898, 1500, 2, 5283, 1, 19, 363, 92, 471, 37, 5, 64, 46, 28, 792, 5283, 11, 19, 16, 61, 1, 8422, 1918, 9, 59, 94, 1, 17, 50, 174

In [11]:
# Size of the full fitted vocabulary. The printed message is Polish for "<N> unique words."
word_index = tokenizer.word_index
print(f'{len(word_index)} unikatowych słów.')

88582 unikatowych słów.


In [12]:
# Force every review to exactly `maxlen` tokens.
# NOTE: pad_sequences defaults to padding='pre' and truncating='pre', so a longer review
# keeps its LAST `maxlen` tokens (not the first ones) and a shorter review is
# left-padded with zeros.
train_data = pad_sequences(sequences, maxlen=maxlen)
train_data.shape

(25000, 100)

In [13]:
# Inspect the first three padded rows.
train_data[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,   39,  325, 2373,
           8,  628,   13,    3, 1383,   10,   13,  576,   12,    9,   13,
           3,  429,    4,  857,  307,    4,    1, 2652,  835,   58, 1383,
           1,   19,   13,  342,  906, 2270,   39,    1,  812,  158,   55,
          70,   16,    1,   19,  157,  151,    6,   12,    1,  411,   13,
         181,   73,   75,   47,   13,   52,  114,  194,  273,   80,    1,
         628,  307,    4,   11,   19,    2,    9, 6436,   61,    5,    1,
        4252,  202,  509,   21,    3,   19,   10,   59,  178,    5,   64,
         171],
       [ 322,  136,   68, 1899, 2137,  769,    1,  169, 2186,    8,   29,
          65,  982,  136,    2, 3417, 1740,   55,   20, 5674,  735,  302,
           4,  266,    5, 1268,    5,    1, 1391, 1067,    3,  125,   17,
          59,   25,   74,    3,    5,    1,  271, 2640,    4,   48,  571,
          30,   37,  32

In [14]:
# Convert the Python list of labels to a NumPy array (needed for index-array
# selection and for feeding the model).
train_labels = np.asarray(train_labels)
train_labels

array([0, 0, 0, ..., 1, 1, 1])

In [20]:
# Sanity check: there should be exactly one label per review.
train_labels.shape

(25000,)

In [15]:
# Shuffle the samples. The loading loop appends every 'neg' review before the 'pos'
# ones, so without shuffling the train/validation slices taken afterwards would be
# heavily class-skewed.
# A seeded generator makes the permutation -- and therefore the split -- repeatable
# across runs (this does not make model weight initialisation deterministic).
rng = np.random.default_rng(42)
indices = rng.permutation(train_data.shape[0])
# The same index array is applied to both arrays so texts and labels stay aligned.
train_data = train_data[indices]
train_labels = train_labels[indices]

train_data.shape

(25000, 100)

In [16]:
# Split into a training set and a validation set: the first `training_samples`
# rows train the model, the next `validation_samples` rows validate it.
training_samples = 15000
validation_samples = 10000

train_slice = slice(None, training_samples)
val_slice = slice(training_samples, training_samples + validation_samples)

X_train, y_train = train_data[train_slice], train_labels[train_slice]
X_val, y_val = train_data[val_slice], train_labels[val_slice]

In [17]:
# Build the baseline model: learned word embeddings, flattened and fed to a small
# dense classifier with one sigmoid output.
# Embedding(input_dim, output_dim): input_dim is the vocabulary size (num_words),
# output_dim the length of each word vector (embedding_dim).
#
# The sequence length is declared with an explicit Input layer rather than the
# Embedding `input_length` argument, which is deprecated in Keras 3. This keeps the
# model built, so summary() can report output shapes and parameter counts.
model = Sequential()
model.add(Input(shape=(maxlen,)))
model.add(Embedding(num_words, embedding_dim))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()



In [18]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is tracked
# as the reporting metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Train for 5 epochs, scoring the validation split after each one.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_val, y_val),
)

Epoch 1/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 20ms/step - accuracy: 0.6505 - loss: 0.5982 - val_accuracy: 0.8301 - val_loss: 0.3795
Epoch 2/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - accuracy: 0.9257 - loss: 0.2059 - val_accuracy: 0.8359 - val_loss: 0.3922
Epoch 3/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 22ms/step - accuracy: 0.9946 - loss: 0.0332 - val_accuracy: 0.8247 - val_loss: 0.5309
Epoch 4/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.9991 - loss: 0.0048 - val_accuracy: 0.8184 - val_loss: 0.6486
Epoch 5/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 1.0000 - loss: 5.0095e-04 - val_accuracy: 0.8190 - val_loss: 0.7394


In [21]:
def plot_hist(history):
    """Plot training vs. validation curves for accuracy and for loss.

    Shows two plotly figures, one per metric, each with the training series and
    its `val_` counterpart on a log-scaled y axis.

    Args:
        history: object exposing `.history` (dict of per-epoch metric lists with
            the keys 'accuracy', 'val_accuracy', 'loss', 'val_loss') and `.epoch`
            (the epoch indices), such as the value returned by `model.fit`.

    Returns:
        None. The figures are displayed via `fig.show()`.
    """
    import pandas as pd
    import plotly.graph_objects as go

    frame = pd.DataFrame(history.history)
    frame['epoch'] = history.epoch

    # (training key, validation key, figure title) for each figure, in display order.
    panels = [
        ('accuracy', 'val_accuracy', 'accuracy vs. val accuracy'),
        ('loss', 'val_loss', 'loss vs. val loss'),
    ]
    for train_key, val_key, title in panels:
        fig = go.Figure()
        for key in (train_key, val_key):
            fig.add_trace(go.Scatter(x=frame['epoch'], y=frame[key], name=key, mode='markers+lines'))
        fig.update_layout(width=1000, height=500, title=title, xaxis_title='Epoki', yaxis_title=train_key, yaxis_type='log')
        fig.show()

plot_hist(history)

In [22]:
# Evaluate on the test reviews, preprocessed exactly like the training data:
# same fitted tokenizer, same maxlen.
# A dedicated name is used so the training `sequences` are not overwritten.
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(test_sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

# Returns [loss, accuracy], matching the loss and metric passed to compile().
model.evaluate(X_test, y_test, verbose=0)

[0.718999445438385, 0.8169199824333191]

### Simple RNN

In [23]:
from tensorflow.keras.layers import SimpleRNN, LSTM

In [24]:
# Recurrent variant: the embedded word sequence is read by a SimpleRNN with 16 units,
# whose output feeds the sigmoid classifier.
# The vocabulary size comes from `num_words` (the value given to the Tokenizer)
# instead of a repeated literal, and the Input layer declares the padded sequence
# length so summary() can report shapes and parameter counts.
model = Sequential()
model.add(Input(shape=(maxlen,)))
model.add(Embedding(num_words, 32))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [25]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is tracked
# as the reporting metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train for 10 epochs, scoring the validation split after each one.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 34ms/step - accuracy: 0.5431 - loss: 0.6823 - val_accuracy: 0.6734 - val_loss: 0.6062
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 31ms/step - accuracy: 0.7871 - loss: 0.4628 - val_accuracy: 0.7959 - val_loss: 0.4494
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 32ms/step - accuracy: 0.8830 - loss: 0.2933 - val_accuracy: 0.8238 - val_loss: 0.4144
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 32ms/step - accuracy: 0.9253 - loss: 0.1973 - val_accuracy: 0.7941 - val_loss: 0.5343
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 32ms/step - accuracy: 0.9609 - loss: 0.1137 - val_accuracy: 0.7983 - val_loss: 0.5484
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 32ms/step - accuracy: 0.9789 - loss: 0.0670 - val_accuracy: 0.7940 - val_loss: 0.6013
Epoch 7/10
[1m4

In [27]:
# Plot the accuracy and loss curves of the most recent fit() call.
plot_hist(history)

In [28]:
# Same architecture as the SimpleRNN model, with the recurrent layer replaced by an
# LSTM with 16 units.
# The vocabulary size comes from `num_words` (the value given to the Tokenizer)
# instead of a repeated literal, and the Input layer declares the padded sequence
# length so summary() can report shapes and parameter counts.
model = Sequential()
model.add(Input(shape=(maxlen,)))
model.add(Embedding(num_words, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [29]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is tracked
# as the reporting metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [30]:
# Train for 10 epochs, scoring the validation split after each one.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 55ms/step - accuracy: 0.6375 - loss: 0.6101 - val_accuracy: 0.8383 - val_loss: 0.3761
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 55ms/step - accuracy: 0.8574 - loss: 0.3365 - val_accuracy: 0.8286 - val_loss: 0.4386
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 47ms/step - accuracy: 0.8975 - loss: 0.2619 - val_accuracy: 0.8216 - val_loss: 0.4183
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 50ms/step - accuracy: 0.9147 - loss: 0.2247 - val_accuracy: 0.8396 - val_loss: 0.3682
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 55ms/step - accuracy: 0.9286 - loss: 0.1951 - val_accuracy: 0.8541 - val_loss: 0.3473
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 48ms/step - accuracy: 0.9413 - loss: 0.1673 - val_accuracy: 0.8559 - val_loss: 0.3592
Epoch 7/10
[1m4

In [31]:
# Plot the accuracy and loss curves of the most recent fit() call.
plot_hist(history)

In [32]:
# Rebuild the LSTM model from scratch and train it for only 3 epochs.
# NOTE(review): presumably the shorter schedule is meant to stop before the
# overfitting visible in the 10-epoch run -- confirm the intent.
# The vocabulary size comes from `num_words` (the value given to the Tokenizer)
# instead of a repeated literal, and the Input layer declares the padded sequence
# length so summary() can report shapes and parameter counts.
model = Sequential()
model.add(Input(shape=(maxlen,)))
model.add(Embedding(num_words, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_data=(X_val, y_val))

Epoch 1/3
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 53ms/step - accuracy: 0.6455 - loss: 0.6094 - val_accuracy: 0.8194 - val_loss: 0.4090
Epoch 2/3
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 51ms/step - accuracy: 0.8606 - loss: 0.3353 - val_accuracy: 0.8352 - val_loss: 0.3638
Epoch 3/3
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 47ms/step - accuracy: 0.8990 - loss: 0.2554 - val_accuracy: 0.8395 - val_loss: 0.3919


In [33]:
# Evaluate the 3-epoch LSTM on the test reviews, preprocessed exactly like the
# training data: same fitted tokenizer, same maxlen.
# The test arrays are rebuilt here so the cell does not rely on an earlier evaluation
# cell. A dedicated name is used so the training `sequences` are not overwritten.
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(test_sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

# Returns [loss, accuracy], matching the loss and metric passed to compile().
model.evaluate(X_test, y_test, verbose=0)

[0.41356217861175537, 0.8263999819755554]