<a href="https://colab.research.google.com/github/jakubtwalczak/neural_network_intro_udemy/blob/main/3_Keras/03_Overfitting_underfitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ładowanie danych.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import tensorflow as tf
from tensorflow.keras.datasets import imdb # zbiór 50 k recenzji filmowych z IMDb
from tensorflow.keras.datasets.imdb import get_word_index # recenzje są zakodowane jako sekwencja indeksów słów, wg częstotliwości wystąpienia
from tensorflow.keras.utils import get_file
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

sns.set()

In [2]:
NUM_WORDS = 10000 # 10 k najczęściej pojawiających się słów
INDEX_FROM = 3

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)

In [3]:
print(f'train_data shape: {train_data.shape}')
print(f'test_data shape: {test_data.shape}')

train_data shape: (25000,)
test_data shape: (25000,)


In [4]:
train_data[10] # przykładowa próbka
# zawiera wyłącznie wartości liczbowe, odzwierciedlające pozycję słowa w rankingu częstotliwości
# im mniejsza liczba, tym wyższa pozycja na liście

[1,
 785,
 189,
 438,
 47,
 110,
 142,
 7,
 6,
 7475,
 120,
 4,
 236,
 378,
 7,
 153,
 19,
 87,
 108,
 141,
 17,
 1004,
 5,
 2,
 883,
 2,
 23,
 8,
 4,
 136,
 2,
 2,
 4,
 7475,
 43,
 1076,
 21,
 1407,
 419,
 5,
 5202,
 120,
 91,
 682,
 189,
 2818,
 5,
 9,
 1348,
 31,
 7,
 4,
 118,
 785,
 189,
 108,
 126,
 93,
 2,
 16,
 540,
 324,
 23,
 6,
 364,
 352,
 21,
 14,
 9,
 93,
 56,
 18,
 11,
 230,
 53,
 771,
 74,
 31,
 34,
 4,
 2834,
 7,
 4,
 22,
 5,
 14,
 11,
 471,
 9,
 2,
 34,
 4,
 321,
 487,
 5,
 116,
 15,
 6584,
 4,
 22,
 9,
 6,
 2286,
 4,
 114,
 2679,
 23,
 107,
 293,
 1008,
 1172,
 5,
 328,
 1236,
 4,
 1375,
 109,
 9,
 6,
 132,
 773,
 2,
 1412,
 8,
 1172,
 18,
 7865,
 29,
 9,
 276,
 11,
 6,
 2768,
 19,
 289,
 409,
 4,
 5341,
 2140,
 2,
 648,
 1430,
 2,
 8914,
 5,
 27,
 3000,
 1432,
 7130,
 103,
 6,
 346,
 137,
 11,
 4,
 2768,
 295,
 36,
 7740,
 725,
 6,
 3208,
 273,
 11,
 4,
 1513,
 15,
 1367,
 35,
 154,
 2,
 103,
 2,
 173,
 7,
 12,
 36,
 515,
 3547,
 94,
 2547,
 1722,
 5,
 3547,
 36,
 20

In [5]:
# mapowanie wartości liczbowych na tekst recenzji

word_to_idx = get_word_index()
word_to_idx = {k:(v + INDEX_FROM) for k, v in word_to_idx.items()}
word_to_idx["<PAD>"] = 0  # padding
word_to_idx["<START>"] = 1  # start sekwencji tekstowej
word_to_idx["<UNK>"] = 2  # słowo nieznane - spoza pierwszych 10 k
word_to_idx["<UNUSED>"] = 3  # słowo nieużywane

idx_to_word = {v: k for k, v in word_to_idx.items()}
list(idx_to_word.items())[:10]
print(' '.join(idx_to_word[idx] for idx in train_data[10]))

<START> french horror cinema has seen something of a revival over the last couple of years with great films such as inside and <UNK> romance <UNK> on to the scene <UNK> <UNK> the revival just slightly but stands head and shoulders over most modern horror titles and is surely one of the best french horror films ever made <UNK> was obviously shot on a low budget but this is made up for in far more ways than one by the originality of the film and this in turn is <UNK> by the excellent writing and acting that ensure the film is a winner the plot focuses on two main ideas prison and black magic the central character is a man named <UNK> sent to prison for fraud he is put in a cell with three others the quietly insane <UNK> body building <UNK> marcus and his retarded boyfriend daisy after a short while in the cell together they stumble upon a hiding place in the wall that contains an old <UNK> after <UNK> part of it they soon realise its magical powers and realise they may be able to use it 

In [6]:
train_labels[:10] # pierwsze 10 etykiet binarnych

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0])

In [7]:
# kodowanie zerojedynkowe

def multi_hot_sequences(sequences, dimension):
    results = np.zeros((len(sequences), dimension)) # tworzymy macierz zawierającą same zera wymiaru (liczba sekwencji, wymiar kodowania)
    for i, word_indices in enumerate(sequences):
        results[i, word_indices] = 1.0 # iterując po sekwencjach wstawiamy jedynki tam, gdzie w i-tej próbce występują w odpowiednich indeksach słowa
    return results

train_data = multi_hot_sequences(train_data, dimension=NUM_WORDS)
test_data = multi_hot_sequences(test_data, dimension=NUM_WORDS)
train_data.shape # wymiar ogółem (liczba próbek, liczba słów w próbce)

(25000, 10000)

In [8]:
test_data.shape

(25000, 10000)

# Budowa modelu bazowego.

In [9]:
baseline_model = Sequential()
baseline_model.add(Dense(16, activation='relu', input_shape=(NUM_WORDS,)))
baseline_model.add(Dense(16, activation='relu')) # dwie warstwy ukryte
baseline_model.add(Dense(1, activation='sigmoid')) # warstwa wyjścia dla klasyfikacji binarnej

baseline_model.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

baseline_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
baseline_history = baseline_model.fit(train_data,
                                      train_labels,
                                      epochs=20,
                                      batch_size=512,
                                      validation_data=(test_data, test_labels))

Epoch 1/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 91ms/step - accuracy: 0.7157 - loss: 0.5919 - val_accuracy: 0.8783 - val_loss: 0.3289
Epoch 2/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 69ms/step - accuracy: 0.9145 - loss: 0.2525 - val_accuracy: 0.8872 - val_loss: 0.2840
Epoch 3/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.9423 - loss: 0.1705 - val_accuracy: 0.8847 - val_loss: 0.2966
Epoch 4/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 53ms/step - accuracy: 0.9576 - loss: 0.1301 - val_accuracy: 0.8773 - val_loss: 0.3236
Epoch 5/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.9678 - loss: 0.1087 - val_accuracy: 0.8719 - val_loss: 0.3581
Epoch 6/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9752 - loss: 0.0862 - val_accuracy: 0.8694 - val_loss: 0.3962
Epoch 7/20
[1m49/49[0m [32m━━━━

# Mniejszy model.

In [11]:
# poprzedni model wykazywał doskonałe dopasowanie do danych
# przy jednoczesnym znacznie słabszym i niewykazującym poprawy z epoki na epokę generalizowaniu
# zbudujemy model z mniejszą liczbą ukrytych neuronów

smaller_model = Sequential()
smaller_model.add(Dense(4, activation='relu', input_shape=(NUM_WORDS,)))
smaller_model.add(Dense(4, activation='relu'))
smaller_model.add(Dense(1, activation='sigmoid'))

smaller_model.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

smaller_model.summary()

In [12]:
smaller_history = smaller_model.fit(train_data,
                                    train_labels,
                                    epochs=20,
                                    batch_size=512,
                                    validation_data=(test_data, test_labels))

Epoch 1/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 115ms/step - accuracy: 0.6790 - loss: 0.6525 - val_accuracy: 0.8463 - val_loss: 0.5207
Epoch 2/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 39ms/step - accuracy: 0.8790 - loss: 0.4628 - val_accuracy: 0.8738 - val_loss: 0.4059
Epoch 3/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.9038 - loss: 0.3473 - val_accuracy: 0.8820 - val_loss: 0.3449
Epoch 4/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - accuracy: 0.9175 - loss: 0.2839 - val_accuracy: 0.8870 - val_loss: 0.3124
Epoch 5/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 41ms/step - accuracy: 0.9262 - loss: 0.2411 - val_accuracy: 0.8882 - val_loss: 0.2957
Epoch 6/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9346 - loss: 0.2118 - val_accuracy: 0.8886 - val_loss: 0.2878
Epoch 7/20
[1m49/49[0m [32m━━━

# Większy model.

In [13]:
# różnica między modelami jest nieznaczna, problemem jest nadal słaba generalizacja modelu
# "dołożymy" więcej neuronów w warstwach ukrytych

bigger_model = Sequential()
bigger_model.add(Dense(512, activation='relu', input_shape=(NUM_WORDS,)))
bigger_model.add(Dense(512, activation='relu'))
bigger_model.add(Dense(1, activation='sigmoid'))

bigger_model.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

bigger_model.summary()

In [14]:
bigger_history = bigger_model.fit(train_data,
                                  train_labels,
                                  epochs=20,
                                  batch_size=512,
                                  validation_data=(test_data, test_labels))

Epoch 1/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 410ms/step - accuracy: 0.7597 - loss: 0.4693 - val_accuracy: 0.8784 - val_loss: 0.2996
Epoch 2/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 382ms/step - accuracy: 0.9486 - loss: 0.1536 - val_accuracy: 0.8734 - val_loss: 0.3313
Epoch 3/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 357ms/step - accuracy: 0.9845 - loss: 0.0581 - val_accuracy: 0.8674 - val_loss: 0.4391
Epoch 4/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 471ms/step - accuracy: 0.9989 - loss: 0.0088 - val_accuracy: 0.8696 - val_loss: 0.5698
Epoch 5/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 349ms/step - accuracy: 1.0000 - loss: 0.0015 - val_accuracy: 0.8709 - val_loss: 0.6458
Epoch 6/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 343ms/step - accuracy: 1.0000 - loss: 3.2849e-04 - val_accuracy: 0.8712 - val_loss: 0.6976
Epoch 7/20
[1m49/

In [15]:
# stwórzmy DataFrame dla modelu bazowego

hist = pd.DataFrame(baseline_history.history)
hist['epoch'] = baseline_history.epoch
hist

Unnamed: 0,accuracy,loss,val_accuracy,val_loss,epoch
0,0.80356,0.487105,0.87828,0.328922,0
1,0.91348,0.241662,0.88724,0.284047,1
2,0.93812,0.174048,0.88472,0.296571,2
3,0.95324,0.137496,0.87728,0.323634,3
4,0.9634,0.112162,0.87192,0.358134,4
5,0.97116,0.092425,0.86936,0.396171,5
6,0.9784,0.076044,0.86284,0.441192,6
7,0.98324,0.063092,0.86076,0.48363,7
8,0.98728,0.051526,0.858,0.531847,8
9,0.9914,0.041887,0.85568,0.576922,9


# Porównanie wydajności modeli.

In [16]:
# utwórzmy wykres dla funkcji straty we wszystkich przetestowanych modelach

import plotly.graph_objects as go

fig = go.Figure()
for name, history in zip(['smaller', 'baseline', 'bigger'], [smaller_history, baseline_history, bigger_history]):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['loss'], name=name + '_binary_crossentropy', mode='lines+markers'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_loss'], name=name + '_val_binary_crossentropy', mode='lines+markers'))
    fig.update_layout(xaxis_title='Epoki', yaxis_title='binary_crossentropy')
fig.show()

# wykres pokazuje, że we wszystkich modelach wartość funkcji straty w zbiorze treningowym szybko maleje na zbiorze treningowym
# w modelach małym i średnim od mniej więcej trzeciej - czwartej epoki strata walidacyjnym rośnie i modele się przeuczają
# w modelu dużym strata walidacyjna rośnie od samego początku - szybko się przeucza

# Techniki regularyzacji.

In [17]:
from tensorflow.keras.regularizers import l2
# zastosujemy regularyzację L2, czyli dodawanie współczynnika kary w postaci kwadratów wag

l2_model = Sequential()
l2_model.add(Dense(16, kernel_regularizer=l2(0.001), activation='relu', input_shape=(NUM_WORDS,)))
l2_model.add(Dense(16, kernel_regularizer=l2(0.01), activation='relu')) # można regularyzować każdą warstwę z osobna
l2_model.add(Dense(1, activation='sigmoid'))

l2_model.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

l2_model.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [18]:
l2_history = l2_model.fit(train_data,
                          train_labels,
                          epochs=20,
                          batch_size=512,
                          validation_data=(test_data, test_labels))

Epoch 1/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 125ms/step - accuracy: 0.6401 - loss: 0.8072 - val_accuracy: 0.8571 - val_loss: 0.5793
Epoch 2/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 59ms/step - accuracy: 0.8905 - loss: 0.5065 - val_accuracy: 0.8845 - val_loss: 0.4639
Epoch 3/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 55ms/step - accuracy: 0.9189 - loss: 0.3991 - val_accuracy: 0.8875 - val_loss: 0.4319
Epoch 4/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 54ms/step - accuracy: 0.9286 - loss: 0.3538 - val_accuracy: 0.8838 - val_loss: 0.4198
Epoch 5/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.9385 - loss: 0.3230 - val_accuracy: 0.8852 - val_loss: 0.4085
Epoch 6/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.9421 - loss: 0.3070 - val_accuracy: 0.8842 - val_loss: 0.4019
Epoch 7/20
[1m49/49[0m [32m━━━

In [19]:
# porównajmy stratę modelu bazowego i po regularyzacji

fig = go.Figure()
for name, history in zip(['baseline', 'l2'], [baseline_history, l2_history]):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['loss'], name=name + '_binary_crossentropy', mode='lines+markers'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_loss'], name=name + '_val_binary_crossentropy', mode='lines+markers'))
    fig.update_layout(xaxis_title='Epoki', yaxis_title='binary_crossentropy')
fig.show()

# strata modelu regularyzowanego spada wolniej na zbiorze treningowym
# natomiast strata na zbiorze testowym po osiągnięciu minimum nie rośnie tak drastycznie

In [20]:
from tensorflow.keras.layers import Dropout # warstwa Dropout

dropout_model = Sequential()
dropout_model.add(Dense(16, activation='relu', input_shape=(NUM_WORDS,)))
dropout_model.add(Dropout(0.5)) # porzucenie połowy neuronów podczas treningu
dropout_model.add(Dense(16, activation='relu'))
dropout_model.add(Dropout(0.5))
dropout_model.add(Dense(1, activation='sigmoid'))

dropout_model.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

dropout_model.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [21]:
dropout_history = dropout_model.fit(train_data,
                                    train_labels,
                                    epochs=20,
                                    batch_size=512,
                                    validation_data=(test_data, test_labels))

Epoch 1/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 88ms/step - accuracy: 0.5851 - loss: 0.6631 - val_accuracy: 0.8560 - val_loss: 0.4995
Epoch 2/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.7680 - loss: 0.5016 - val_accuracy: 0.8764 - val_loss: 0.3587
Epoch 3/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step - accuracy: 0.8344 - loss: 0.3949 - val_accuracy: 0.8872 - val_loss: 0.2945
Epoch 4/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.8744 - loss: 0.3253 - val_accuracy: 0.8882 - val_loss: 0.2760
Epoch 5/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.9014 - loss: 0.2676 - val_accuracy: 0.8885 - val_loss: 0.2732
Epoch 6/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.9205 - loss: 0.2351 - val_accuracy: 0.8877 - val_loss: 0.2786
Epoch 7/20
[1m49/49[0m [32m━━━━

In [22]:
fig = go.Figure()
for name, history in zip(['baseline', 'dropout'], [baseline_history, dropout_history]):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['loss'], name=name + '_binary_crossentropy', mode='lines+markers'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_loss'], name=name + '_val_binary_crossentropy', mode='lines+markers'))
    fig.update_layout(xaxis_title='Epoki', yaxis_title='binary_crossentropy')
fig.show()

# strata walidacyjna nie jest już tak stabilna, jak przy regularyzacji i rośnie szybciej po przeuczeniu
# tym niemniej obie techniki - regularyzację i Dropout - można łączyć