<a href="https://colab.research.google.com/github/janShi1105/science/blob/main/ML16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
tf.random.set_seed(1)
rnn_layer = tf.keras.layers.SimpleRNN(units=2, use_bias= True, return_sequences=True)
rnn_layer.build(input_shape=(None, None, 5))
w_xh, w_oo, b_h = rnn_layer.weights
print('W_xh shape:', w_xh.shape)
print('W_oo shape:', w_oo.shape)
print('b_h shape:', b_h.shape)

W_xh shape: (5, 2)
W_oo shape: (2, 2)
b_h shape: (2,)


In [2]:
x_seq = tf.convert_to_tensor([[1.0]*5, [2.0]*5, [3.0]*5], dtype=tf.float32)
output = rnn_layer(tf.reshape(x_seq, shape=(1,3,5)))
out_man = []
for t in range(len(x_seq)):
  xt = tf.reshape(x_seq[t], (1,5))
  print('Time step {}'.format(t))
  print('    Input           :', xt.numpy())

  ht = tf.matmul(xt, w_xh) + b_h
  print('    Hidden        :', ht.numpy())

  if t>0:
    prev_o = out_man[t-1]
  else:
    prev_o = tf.zeros(shape=(ht.shape))
  ot = ht + tf.matmul(prev_o, w_oo)
  ot = tf.math.tanh(ot)
  out_man.append(ot)
  print('      Output (manual)   :', ot.numpy())
  print('      SimpleRNN output: '.format(t), output[0][t].numpy())
  print()



Time step 0
    Input           : [[1. 1. 1. 1. 1.]]
    Hidden        : [[0.4775737  0.21143436]]
      Output (manual)   : [[0.44429833 0.20833899]]
      SimpleRNN output:  [0.44429833 0.20833899]

Time step 1
    Input           : [[2. 2. 2. 2. 2.]]
    Hidden        : [[0.9551474  0.42286873]]
      Output (manual)   : [[0.8925385  0.48455504]]
      SimpleRNN output:  [0.8925385  0.48455504]

Time step 2
    Input           : [[3. 3. 3. 3. 3.]]
    Hidden        : [[1.4327211 0.6343031]]
      Output (manual)   : [[0.983985  0.7219829]]
      SimpleRNN output:  [0.983985  0.7219829]



In [3]:
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

df = pd.read_csv('drive/MyDrive/movie_data.csv', encoding='utf-8')

In [5]:
!ls

drive  sample_data


In [6]:
target = df.pop('sentiment')
ds_raw = tf.data.Dataset.from_tensor_slices((df.values, target.values))
for ex in ds_raw.take(3):
  tf.print(ex[0].numpy()[0][:50], ex[1])


b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


In [7]:
tf.random.set_seed(1)
ds_raw= ds_raw.shuffle(50000, reshuffle_each_iteration=False)
ds_raw_test = ds_raw.take(25000)
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)

In [8]:
from collections import Counter

tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()
for example in ds_raw_train:
  tokens = tokenizer.tokenize(example[0].numpy()[0])
  token_counts.update(tokens)

print('Vocab-size:', len(token_counts))

Vocab-size: 87007


In [9]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
!tensorflow --version

/bin/bash: tensorflow: command not found


In [11]:
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
example_str = 'This is an example!'
print(encoder.encode(example_str))

[232, 9, 270, 1123]


In [12]:
def encode(text_tensor, label):
  text = text_tensor.numpy()[0]
  encoded_text = encoder.encode(text)
  return encoded_text, label

In [13]:
def encode_map_fn(text, label):
  return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

ds_train = ds_raw_train.map(encode_map_fn)
ds_valid = ds_raw_valid.map(encode_map_fn)
ds_test = ds_raw_test.map(encode_map_fn)

tf.random.set_seed(1)
for example in ds_train.shuffle(1000).take(5):
  print('Sequnece length: ', example[0].shape)

Sequnece length:  (24,)
Sequnece length:  (179,)
Sequnece length:  (262,)
Sequnece length:  (535,)
Sequnece length:  (130,)


In [14]:
ds_subset = ds_train.take(8)
for example in ds_subset:
  print('Individual size: ', example[0].shape)
  

Individual size:  (119,)
Individual size:  (688,)
Individual size:  (308,)
Individual size:  (204,)
Individual size:  (326,)
Individual size:  (240,)
Individual size:  (127,)
Individual size:  (453,)


In [15]:
ds_batched = ds_subset.padded_batch(4, padded_shapes=([-1], []))
for batch in ds_batched:
  print('Batch dimension: ', batch[0].shape)

Batch dimension:  (4, 688)
Batch dimension:  (4, 453)


In [16]:
train_data = ds_train.padded_batch(32, padded_shapes=([-1], []))
valid_data = ds_valid.padded_batch(32, padded_shapes=([-1], []))
test_data = ds_test.padded_batch(32, padded_shapes=([-1], []))

In [17]:
from tensorflow.keras.layers import Embedding
model = tf.keras.Sequential()
model.add(Embedding(input_dim=100, output_dim=6, input_length=20, name='embed-layer'))
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, 20, 6)             600       
                                                                 
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


In [18]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=32))
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32))
model.add(Dense(1))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          32000     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, None, 32)          2080      
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


In [19]:
embedding_dim = 20
vocab_size = len(token_counts) + 2
tf.random.set_seed(1)
bi_lstm_model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='embed-layer'), 
      #tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, name='lstm-layer'), name='bidir-lstm'),
      tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(64))),
      tf.keras.layers.Dense(64, activation='relu'),
     tf.keras.layers.Dense(1, activation='sigmoid')
    ]
)
bi_lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, None, 20)          1740180   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              43520     
 l)                                                              
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,792,021
Trainable params: 1,792,021
Non-trainable params: 0
_________________________________________________________________


In [None]:
bi_lstm_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), metrics=['accuracy'])
history = bi_lstm_model.fit(train_data, validation_data=valid_data, epochs=10)
test_results = bi_lstm_model.evaluate(test_data)
print('Test Acc.: {:.2f}%'.format(test_results[1]*100))

Epoch 1/10

In [None]:
from collections import Counter

def preprocess_datasets(ds_raw_train, ds_raw_valid, ds_raw_test, max_seq_length=None, batch_size=32):
  tokenizer = tfds.deprecated.text.Tokenizer()
  token_counts = Counter()

  for example in ds_raw_train:
    tokens = tokenizer.tokenize(example[0].numpy()[0])
    if max_seq_length is not None:
      tokens = tokens[-max_seq_length:]
    token_counts.update(tokens)

  print('Vocab-size:', len(token_counts))

  encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
  def encode(text_tensor, label):
    text = text_tensor.numpy()[0]
    encoded_text = encoder.encode(text)
    if max_seq_length is not None:
      encoded_text = encoded_text[-max_seq_length:]
    return encoded_text, label

  def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

  ds_train = ds_raw_train.map(encode_map_fn)
  ds_valid = ds_raw_valid.map(encode_map_fn)
  ds_test = ds_raw_test(encode_map_fn)

  train_data = ds_train.padded_batch(batch_size, padded_shapes=([-1], []))
  valid_data = ds_valid.padded_batch(batch_size, padded_shapes=([-1], []))
  test_data = ds_test.padded_batch(batch_size, padded_shapes=([-1], []))
  
  return (train_data, valid_data, test_data, len(token_counts))