<a href="https://colab.research.google.com/github/leeminq1/python_ai_colab/blob/main/7_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RNN (Recurrent Neural Network)
### - For time series data, text, video, ..

In [5]:
import numpy as np
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

from sklearn.metrics import accuracy_score
from keras.datasets import reuters
from keras.datasets import imdb

## 1. Reuters news Classification

In [6]:
## Load Dataset (https://keras.io/api/datasets/reuters/)
max_features = 5000 # 자주나오는 5000개의 단어만
(X_train, y_train), (X_test, y_test) = keras.datasets.reuters.load_data(num_words=max_features)
## padding 
from keras.preprocessing.sequence import pad_sequences #(https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences)
text_max_words = 120 # 한 news의 길이는 최대 120단어
# 길이를 맞춰주기 위해서 120으로 자름
X_train = pad_sequences(X_train, maxlen=text_max_words)
X_test = pad_sequences(X_test, maxlen=text_max_words)
## to categorical
# 뒤에 46 안써도 줘도 자기가 알아서 y label 종류보고 맞춰줌
y_train = keras.utils.to_categorical(y_train,46)
y_test = keras.utils.to_categorical(y_test,46)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz


In [16]:
print(X_train.shape)
print(X_train[0].shape)

(8982, 120)
(120,)


In [8]:
## word to index
# 단어를 다 index로 바꿔줌
word_to_index=reuters.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json


In [17]:
word_to_index

{'mdbl': 10996,
 'fawc': 16260,
 'degussa': 12089,
 'woods': 8803,
 'hanging': 13796,
 'localized': 20672,
 'sation': 20673,
 'chanthaburi': 20675,
 'refunding': 10997,
 'hermann': 8804,
 'passsengers': 20676,
 'stipulate': 20677,
 'heublein': 8352,
 'screaming': 20713,
 'tcby': 16261,
 'four': 185,
 'grains': 1642,
 'broiler': 20680,
 'wooden': 12090,
 'wednesday': 1220,
 'highveld': 13797,
 'duffour': 7593,
 '0053': 20681,
 'elections': 3914,
 '270': 2563,
 '271': 3551,
 '272': 5113,
 '273': 3552,
 '274': 3400,
 'rudman': 7975,
 '276': 3401,
 '277': 3478,
 '278': 3632,
 '279': 4309,
 'dormancy': 9381,
 'errors': 7247,
 'deferred': 3086,
 'sptnd': 20683,
 'cooking': 8805,
 'stratabit': 20684,
 'designing': 16262,
 'metalurgicos': 20685,
 'databank': 13798,
 '300er': 20686,
 'shocks': 20687,
 'nawg': 7972,
 'tnta': 20688,
 'perforations': 20689,
 'affiliates': 2891,
 '27p': 20690,
 'ching': 16263,
 'china': 595,
 'wagyu': 16264,
 'affiliated': 3189,
 'chino': 16265,
 'chinh': 16266,
 '

In [19]:
len(word_to_index)

30979

In [27]:
word_to_index['the']

1

### LSTM

In [15]:
text_max_words

120

In [26]:
## Model (https://keras.io/api/layers/)
# input은 전체 feature고 / out_dim= 위에서 120개의 단어 하나 하나가 embedding layer를 지나면서
# 각 단어가 128개의 vector 로 변한다.
# output_dim은 임의로 정한다. ( word_embedding의 vector의 크기)
model = keras.Sequential(
    [
    #  masking은 0으로 된 값을 사용하지 않는 다는 것을 의미함.
      keras.layers.Masking(mask_value=0,input_dim=text_max_words),
      keras.layers.Embedding(input_dim=max_features,output_dim=128,input_length=text_max_words,),
      keras.layers.LSTM(100, activation='tanh', recurrent_activation='sigmoid'),
    
      keras.layers.Dense(46,activation='softmax')
    ]
)

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_5 (Masking)         (None, 120)               0         
                                                                 
 embedding_7 (Embedding)     (None, 120, 128)          640000    
                                                                 
 lstm_1 (LSTM)               (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 46)                4646      
                                                                 
Total params: 736,246
Trainable params: 736,246
Non-trainable params: 0
_________________________________________________________________


In [28]:
## Train
batch_size = 64
epochs = 50

# 러닝 rate / loss / optimaizer 머 쓸건지 정함
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy']) # keras.losses.MeanSq
## fit
hist = model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
## Evaluate model on the test set
model.evaluate(X_test, y_test, verbose=1)



[2.341099262237549, 0.6656277775764465]

### Stacked LSTM

In [30]:
## We can stack up LSTM
## Model
model = keras.Sequential(
    [
      keras.layers.Masking(mask_value=0,input_dim=text_max_words),
      keras.layers.Embedding(input_dim=max_features,output_dim=128,input_length=text_max_words,),
    #  layer가 여러개일 경우에는 방향이 2가지로 가서 (?) return_sequnces가 필요함.
    #  맨마지막 LSTM의 경우는 필요없음
      keras.layers.LSTM(100, activation='tanh', recurrent_activation='sigmoid',return_sequences=True),
      keras.layers.LSTM(100, activation='tanh', recurrent_activation='sigmoid'),
    
      keras.layers.Dense(46,activation='softmax')
    ]
)

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_6 (Masking)         (None, 120)               0         
                                                                 
 embedding_8 (Embedding)     (None, 120, 128)          640000    
                                                                 
 lstm_2 (LSTM)               (None, 120, 100)          91600     
                                                                 
 lstm_3 (LSTM)               (None, 100)               80400     
                                                                 
 dense_1 (Dense)             (None, 46)                4646      
                                                                 
Total params: 816,646
Trainable params: 816,646
Non-trainable params: 0
_________________________________________________________________


In [None]:
## Train
batch_size = 64
epochs = 50

# 러닝 rate / loss / optimaizer 머 쓸건지 정함
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy']) # keras.losses.MeanSq
## fit
hist = model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)



In [None]:
## Evaluate model on the test set
model.evaluate(X_test, y_test, verbose=1)

### GRU

In [32]:
## Evaluate model on the test set
model = keras.Sequential(
    [
      keras.layers.Masking(mask_value=0,input_dim=text_max_words),
      keras.layers.Embedding(input_dim=max_features,output_dim=128,input_length=text_max_words,),

      keras.layers.GRU(100),

  
      keras.layers.Dense(46,activation='softmax')
    ]
)

In [33]:
## Train
batch_size = 64
epochs = 50

# 러닝 rate / loss / optimaizer 머 쓸건지 정함
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy']) # keras.losses.MeanSq
## fit
hist = model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [34]:
## Evaluate model on the test set
model.evaluate(X_test, y_test, verbose=1)



[2.4536361694335938, 0.6682991981506348]

### 1D CNN

In [36]:
## 1d CNN
model = keras.Sequential(
    [
      keras.layers.Masking(mask_value=0,input_dim=text_max_words),
      keras.layers.Embedding(input_dim=max_features,output_dim=128,input_length=text_max_words,),

      # conv 1
      keras.layers.Conv1D(filters=32, kernel_size=(10),padding="same"),
      keras.layers.BatchNormalization(),
      layers.Activation('relu'),
      # conv 2
      layers.Conv1D(filters=32, kernel_size=(10),padding="same"),
      layers.BatchNormalization(),
      layers.Activation('relu'),
      #  Pooling 
     layers.MaxPooling1D(pool_size=(5),padding="same" ),

     layers.Flatten(),
      keras.layers.Dense(46,activation='softmax')
    ]
)

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_9 (Masking)         (None, 120)               0         
                                                                 
 embedding_11 (Embedding)    (None, 120, 128)          640000    
                                                                 
 conv1d_2 (Conv1D)           (None, 120, 32)           40992     
                                                                 
 batch_normalization_2 (Batc  (None, 120, 32)          128       
 hNormalization)                                                 
                                                                 
 activation_2 (Activation)   (None, 120, 32)           0         
                                                                 
 conv1d_3 (Conv1D)           (None, 120, 32)           10272     
                                                      

In [37]:
## Train
batch_size = 64
epochs = 50

# 러닝 rate / loss / optimaizer 머 쓸건지 정함
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy']) # keras.losses.MeanSq
## fit
hist = model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [38]:
## Evaluate model on the test set
model.evaluate(X_test, y_test, verbose=1)



[2.0747945308685303, 0.6932324171066284]

## 2. IMDB Movie review Classification (Kaggle Competition!)
#### - a. load dataset: https://keras.io/api/datasets/imdb/
#### - b. define your RNN model (Google it!) (https://keras.io/api/layers/recurrent_layers/)
#### - c. Train your model
#### - d. Tune hyperparamters of your model on the validation set
#### - e. Evaluate your model on the test set.

\
#### after you achieve your best performance, submit your result:
#### make your team name as "date_name" (ex. 20220307_권원빈) ('team'탭에 가면 변경가능)
(https://www.kaggle.com/c/imdb-classification1234/)



In [None]:
## Load Dataset
max_features = 10000 # 자주나오는 단어 10000개만
text_max_words = 500 # 리뷰 하나는 단어 500개까지 보겠다
(X_train, y_train), (X_test, y_test) = 

## padding
X_train = 
X_test = 

In [None]:
## word to index & index_to_word
word_to_index =  ## 사실 3칸 밀려있음. 밑에서 수정해서 쓰기!!


In [None]:
## review sample


In [None]:
## Model
model = keras.Sequential(
    [
      
    ]
)

In [None]:
## Training
batch_size=64
epochs=5

## compile

## fit

In [None]:
## Evaluate model on the test set
model.evaluate(X_test, y_test, verbose=1)

In [None]:
## Submission for Kaggle
import pandas as pd

# predict results
results = model.predict(X_test)
results = np.round_(results).astype(int).reshape(-1)
results = pd.Series(results,name="Category")

submission = pd.concat([pd.Series(range(1,25001),name = "Id"),results],axis = 1)
submission.to_csv("imdb_classification.csv",index=False)

### Test on my sentence!!
아래의 example을 model에 넣고 sentiment를 예측해보자

In [None]:
## test example
#example = 'This movie is very good. I like the characters and the story was beautiful. Also the background music was selected appropriately'
#example = "This movie was awful. I don't like the main characters and the story did not make any sense."
example = 'i love this movie'

# 1. 단어 단위로 자르시고. example.split()


# 2. words의 각 단어를 index로 바꾸기
#     a. 먼저, 단어를 모두 소문자로 바꾸어주어야함 & '.'은 없애야함
#     b. index로 바꿀때, 'word_to_index[word]+3' 사용 (0,1,2,3은 token용)
#     b. 리뷰의 맨 앞은 무조건 1로 시작 (시작 token)
#     c. index가 10000보다 큰 단어는 2로 바꾸기 (모르는 단어용 token)
#     d. 리뷰의 길이는 120으로 고정 (0 for padding)
word_to_index = imdb.get_word_index()


# 3. 길이가 500이 되도록 padding (앞부분 참고)


# 4. model에 넣고 prediction (model.predict())

