In [11]:
# Preprocessing text data for machine learning applications
# Bag-of-words approaches and sequence-modeling approaches for text processing
# The Transformer architecture
# Sequence-to-sequence learning

In [12]:
# preparing text data
# standardize text, vectrorise text. Convert to lowercase and remove punctuations.
# tokenization
# convert each token to numeric vector, indexing all tokens present in the data.
# one hot encoding or embedding

In [13]:
# Text standardization
# tokenization

In [14]:
# Vocabulary indexing
# padding

In [15]:
# Preparing the IMDB movie reviews data

In [16]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0   626k      0  0:02:11  0:02:11 --:--:-- 1636k86k0:03:45  0:00:38  0:03:07  429k.0M    0     0   373k      0  0:03:40  0:01:00  0:02:40  164k 0     0   370k      0  0:03:41  0:01:02  0:02:39  196k369k      0  0:03:42  0:01:05  0:02:37  330k8M    0     0   375k      0  0:03:38  0:01:10  0:02:28  448k9.4M    0     0   383k      0  0:03:34  0:01:18  0:02:16  459k43 34.5M    0     0   402k      0  0:03:23  0:01:27  0:01:56  587k 0     0   405k      0  0:03:22  0:01:29  0:01:53  574k1M    0     0   498k      0  0:02:44  0:01:53  0:00:51  805k0     0   519k      0  0:02:38  0:01:57  0:00:41 1072k


In [17]:
!rm -r aclImdb/train/unsup

In [18]:
!cat aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

In [19]:
import os, pathlib, shutil, random

In [20]:
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

for category in ("neg", "pos"):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    
    for fname in val_files:
        shutil.move(train_dir / category / fname, val_dir / category / fname)

In [21]:
from tensorflow import keras
batch_size = 32

train_ds = keras.utils.text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [22]:
for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'First of all, I am not a huge fan of contemporary Turkish cinema, which is because, the usual pattern of creating a box office success is by hitting below the waistline. This movie is nothing of an artistic masterpiece that deals with taboos, as the director and marketing ads imply. In my mere opinion, the sole purpose of this movie is make money by touching a sensitive morale(in fact it is mostly considered taboo in the native country) Cheap populism might provide with a brief definition of what I meant.<br /><br />However, the acting is near perfect. In fact, most of the cast has theatrical background and tried hard to compensate for what Altioklar lacked; talent! All members of the cast were perfectly fit in their roles and well qualified for the job, even the less experienced ones. (Like Janset) At least, Altioklar deserves a small word of appreciation , j

In [None]:
# TextVectorization layer so that they yield multi hot encoded binary word vectors

In [23]:
from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization(max_tokens=20000, output_mode="multi_hot")
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

In [24]:
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


In [25]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs) 
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x) 
    model = keras.Model(inputs, outputs) 
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [26]:
model = get_model()
model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only=True)]
model.fit(binary_1gram_train_ds.cache(), validation_data=binary_1gram_val_ds.cache(), epochs=10, callbacks=callbacks)
model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10

ValueError: The following argument(s) are not supported with the native Keras format: ['options']