<a href="https://colab.research.google.com/github/kszymon/neural-network/blob/main/07_rnn%20/02_text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [2]:
!wget https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
!unzip -q reviews.zip

--2025-05-12 06:24:36--  https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.216.207, 173.194.217.207, 108.177.12.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.216.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42878657 (41M) [application/x-zip-compressed]
Saving to: ‘reviews.zip’


2025-05-12 06:24:39 (22.0 MB/s) - ‘reviews.zip’ saved [42878657/42878657]



In [3]:
data_dir = './reviews'
train_dir = os.path.join(data_dir, 'train')

train_texts = []
train_labels = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            train_texts.append(f.read())
            f.close()
            if label_type == 'neg':
                train_labels.append(0)
            else:
                train_labels.append(1)

In [4]:
test_dir = os.path.join(data_dir, 'test')

test_texts = []
test_labels = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            test_texts.append(f.read())
            f.close()
            if label_type == 'neg':
                test_labels.append(0)
            else:
                test_labels.append(1)

In [5]:
train_texts[:10]

["I had to do a search on the actresses to find the board of this film because the title is now An Unexpected Love. It's not really worth looking for but I was unfamiliar with both leads and wondered why they were headlining a lesbian flick on Lifetime. Everything's pretty restrained and you don't really get an idea of who these characters are so, as a viewer, I wasn't able to become emotionally invested in the storyline. I guess I'm not the target audience for this but I'm not sure who is. Everything's muted and soft focus and earth tones...nothing's very interesting. I had a prurient interest in seeing two women make out but it's handled so discreetly that I was disappointed. Rent Personal Best instead.",
 "The movie never becomes intolerable to watch. And to tell it straight, it has nothing to show either, except maybe part-sexy Alicia Silverstone in a nerdy non-sexy character in revealing quite-sexy dresses. The story is very easy to follow or there's nothing to follow -- you can s

In [6]:
train_labels[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [7]:
train_labels[-10:]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [8]:
maxlen = 100   # skracamy recenzje do 100 słów
num_words = 10000    # 10000 najczęściej pojawiających się słów
embedding_dim = 100

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

In [9]:
list(tokenizer.index_word.items())[:20]

[(1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it'),
 (10, 'i'),
 (11, 'this'),
 (12, 'that'),
 (13, 'was'),
 (14, 'as'),
 (15, 'for'),
 (16, 'with'),
 (17, 'movie'),
 (18, 'but'),
 (19, 'film'),
 (20, 'on')]

In [10]:
sequences = tokenizer.texts_to_sequences(train_texts)
print(sequences[:3])

[[10, 66, 5, 78, 3, 1784, 20, 1, 1504, 5, 166, 1, 2086, 4, 11, 19, 85, 1, 422, 6, 147, 32, 2070, 116, 42, 21, 63, 287, 264, 15, 18, 10, 13, 7693, 16, 196, 829, 2, 3537, 135, 33, 68, 3, 2485, 506, 20, 2640, 181, 6041, 2, 22, 89, 63, 76, 32, 323, 4, 34, 131, 102, 23, 35, 14, 3, 526, 10, 283, 499, 5, 410, 2142, 7476, 8, 1, 766, 10, 479, 143, 21, 1, 2387, 308, 15, 11, 18, 143, 21, 249, 34, 6, 9092, 2, 1789, 1148, 2, 699, 7576, 52, 218, 10, 66, 3, 599, 8, 316, 104, 369, 94, 43, 18, 42, 2388, 35, 12, 10, 13, 682, 847, 962, 115, 302], [1, 17, 112, 457, 5, 103, 2, 5, 373, 9, 726, 9, 44, 161, 5, 120, 342, 546, 276, 170, 1276, 5229, 8, 3, 8926, 695, 1276, 106, 8, 3626, 176, 1276, 5340, 1, 62, 6, 52, 772, 5, 790, 39, 222, 161, 5, 790, 22, 67, 64, 8, 342, 93, 47, 6, 54, 833, 114, 202, 9654, 3230, 169, 354, 1975, 2, 52, 1898, 113, 1838, 8927, 6, 481, 3538, 70, 10, 158, 6436, 96, 73, 20, 38, 113, 10, 5074, 243, 14, 10, 298, 904, 45, 28, 44, 161, 5, 78, 546, 146, 3, 17, 11, 525, 165, 35, 75, 467, 155

In [11]:
word_index = tokenizer.word_index
print(f'{len(word_index)} unikatowych słów.')

88582 unikatowych słów.


In [12]:
# skracamy recenzje do pierszych 100 słów
train_data = pad_sequences(sequences, maxlen=maxlen)
train_data.shape

(25000, 100)

In [13]:
train_data[:3]

array([[  32, 2070,  116,   42,   21,   63,  287,  264,   15,   18,   10,
          13, 7693,   16,  196,  829,    2, 3537,  135,   33,   68,    3,
        2485,  506,   20, 2640,  181, 6041,    2,   22,   89,   63,   76,
          32,  323,    4,   34,  131,  102,   23,   35,   14,    3,  526,
          10,  283,  499,    5,  410, 2142, 7476,    8,    1,  766,   10,
         479,  143,   21,    1, 2387,  308,   15,   11,   18,  143,   21,
         249,   34,    6, 9092,    2, 1789, 1148,    2,  699, 7576,   52,
         218,   10,   66,    3,  599,    8,  316,  104,  369,   94,   43,
          18,   42, 2388,   35,   12,   10,   13,  682,  847,  962,  115,
         302],
       [   5,  103,    2,    5,  373,    9,  726,    9,   44,  161,    5,
         120,  342,  546,  276,  170, 1276, 5229,    8,    3, 8926,  695,
        1276,  106,    8, 3626,  176, 1276, 5340,    1,   62,    6,   52,
         772,    5,  790,   39,  222,  161,    5,  790,   22,   67,   64,
           8,  342,   9

In [14]:
train_labels = np.asarray(train_labels)
train_labels.shape

(25000,)

In [15]:
# przemieszanie próbek
indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)
train_data = train_data[indices]
train_labels = train_labels[indices]

train_data.shape

(25000, 100)

In [16]:
# podział na zbiór treningowy i walidacyjny
training_samples = 15000
validation_samples = 10000

X_train = train_data[:training_samples]
y_train = train_labels[:training_samples]
X_val = train_data[training_samples: training_samples + validation_samples]
y_val = train_labels[training_samples: training_samples + validation_samples]

In [17]:
# budowa modelu
# Embedding(input_dim, output_dim)
model = Sequential()
model.add(Embedding(num_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()



In [18]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [19]:
history = model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))

Epoch 1/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.6518 - loss: 0.5936 - val_accuracy: 0.8362 - val_loss: 0.3718
Epoch 2/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 20ms/step - accuracy: 0.9278 - loss: 0.2031 - val_accuracy: 0.8215 - val_loss: 0.4293
Epoch 3/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.9923 - loss: 0.0388 - val_accuracy: 0.8235 - val_loss: 0.5273
Epoch 4/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.9989 - loss: 0.0050 - val_accuracy: 0.8203 - val_loss: 0.6466
Epoch 5/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.9998 - loss: 8.9112e-04 - val_accuracy: 0.8159 - val_loss: 0.7385


In [20]:
def plot_hist(history):
    import pandas as pd
    import plotly.graph_objects as go
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['accuracy'], name='accuracy', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_accuracy'], name='val_accuracy', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='accuracy vs. val accuracy', xaxis_title='Epoki', yaxis_title='accuracy', yaxis_type='log')
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['loss'], name='loss', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_loss'], name='val_loss', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='loss vs. val loss', xaxis_title='Epoki', yaxis_title='loss', yaxis_type='log')
    fig.show()

plot_hist(history)

In [21]:
sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

model.evaluate(X_test, y_test, verbose=0)

[0.7271944284439087, 0.8127999901771545]

### Simple RNN

In [22]:
from tensorflow.keras.layers import SimpleRNN, LSTM

In [23]:
model = Sequential()
model.add(Embedding(10000, 32))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [24]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [25]:
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 32ms/step - accuracy: 0.5683 - loss: 0.6649 - val_accuracy: 0.7786 - val_loss: 0.4827
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 30ms/step - accuracy: 0.8364 - loss: 0.3855 - val_accuracy: 0.8368 - val_loss: 0.3793
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 30ms/step - accuracy: 0.8874 - loss: 0.2827 - val_accuracy: 0.8254 - val_loss: 0.4621
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 36ms/step - accuracy: 0.9192 - loss: 0.2177 - val_accuracy: 0.8340 - val_loss: 0.3924
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 30ms/step - accuracy: 0.9421 - loss: 0.1597 - val_accuracy: 0.8391 - val_loss: 0.4186
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 31ms/step - accuracy: 0.9620 - loss: 0.1129 - val_accuracy: 0.8064 - val_loss: 0.4781
Epoch 7/10
[1m4

In [26]:
plot_hist(history)

In [27]:
model = Sequential()
model.add(Embedding(10000, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [28]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [29]:
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 47ms/step - accuracy: 0.6534 - loss: 0.6011 - val_accuracy: 0.8318 - val_loss: 0.3826
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 52ms/step - accuracy: 0.8707 - loss: 0.3154 - val_accuracy: 0.7660 - val_loss: 0.5944
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 52ms/step - accuracy: 0.9028 - loss: 0.2458 - val_accuracy: 0.8431 - val_loss: 0.3899
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 48ms/step - accuracy: 0.9174 - loss: 0.2198 - val_accuracy: 0.8461 - val_loss: 0.3572
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 51ms/step - accuracy: 0.9330 - loss: 0.1835 - val_accuracy: 0.8452 - val_loss: 0.3922
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 49ms/step - accuracy: 0.9391 - loss: 0.1670 - val_accuracy: 0.8401 - val_loss: 0.4455
Epoch 7/10
[1m4

In [30]:
plot_hist(history)

In [31]:
model = Sequential()
model.add(Embedding(10000, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_data=(X_val, y_val))

Epoch 1/3
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 52ms/step - accuracy: 0.6454 - loss: 0.6063 - val_accuracy: 0.7697 - val_loss: 0.4841
Epoch 2/3
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 49ms/step - accuracy: 0.8652 - loss: 0.3265 - val_accuracy: 0.8470 - val_loss: 0.3508
Epoch 3/3
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 52ms/step - accuracy: 0.8997 - loss: 0.2502 - val_accuracy: 0.8508 - val_loss: 0.3477


In [32]:
sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

model.evaluate(X_test, y_test, verbose=0)

[0.3495537042617798, 0.8523200154304504]