## Loading and transforming data

In [84]:
import numpy as np

# training and testing data
with open('X_train.txt') as train, open('X_test.txt') as test:
    X_train = train.read()
    X_test = test.read()

# training output
y_train = np.load('y_train.npy')

In [2]:
import tensorflow as tf

# this command is able to construct a vocabulary based on the inputed text
def _CreateTable(vocab, num_oov=1):
    init = tf.lookup.KeyValueTensorInitializer(
        vocab, 
        tf.range(tf.size(vocab, out_type=tf.int64),
                        dtype=tf.int64),
                        key_dtype=tf.string,
                        value_dtype=tf.int64)
    
    return tf.lookup.StaticVocabularyTable(init, num_oov, lookup_key_dtype=tf.string)


2024-04-22 13:28:38.205422: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [85]:
# transformation for creating the vocab
import re
import nltk


nltk.download('stopwords')

class DataCleaner:
    def __init__(self, stopwords=True):
        try:
            self.stop = nltk.corpus.stopwords.words('english')
            self.pattern = self.construct_pattern(stopwords)
        except Exception as e:
            print(f"Error initializing stopwords: {e}")
            self.stop = []
            self.pattern = ""

    def construct_pattern(self, stopwords=True):
        try:
            stopwords_pattern = r'\b(?:' + '\s*|'.join(map(re.escape, self.stop)) + r')\b' if stopwords else ''
            return stopwords_pattern + r'|[^\w\s]'
        except Exception as e:
            print(f"Error constructing pattern: {e}")
            return ""

    def clean_text(self, text):
        try:
            clean_data = [None]
            for _ in range(1):
                clean_data[0] = text.lower().split('\n')
                clean_data[0] = list(map(lambda x: re.sub(self.pattern, '', x), clean_data[0]))
            return clean_data[0]
        except AttributeError:
            print("Error: Input text is not a string.")
            return []
        except Exception as e:
            print(f"Error cleaning text: {e}")
            return []

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/haarrublar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [167]:
# we are using datacleaner because our model is not BERT

dataclean = DataCleaner()
prep_x = dataclean.clean_text(X_train)
# max_prep_x_value = max(map(lambda x: len(x), prep_x))
max_prep_x_value = 250

In [168]:
# tokenizer using built-in cleaner function
max_features=10000
sequence_length=max_prep_x_value

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [169]:
# creating a vocab using the text provided
vectorize_layer.adapt(prep_x)

# vectorizing the text
vectorize_layer(prep_x)

<tf.Tensor: shape=(1000, 250), dtype=int64, numpy=
array([[ 318, 2413, 1686, ...,    0,    0,    0],
       [  13,  864,    2, ...,    0,    0,    0],
       [   7,    0,    0, ...,    0,    0,    0],
       ...,
       [  27,  138,    2, ...,    0,    0,    0],
       [ 584,   49,  371, ...,    0,    0,    0],
       [ 460,  195,  569, ...,    0,    0,    0]])>

In [170]:
# include vectorized text and label
def vectorize_text(text,label):
    text = tf.expand_dims(text,-1)
    return vectorize_layer(text), label

In [171]:
vectorize_text(prep_x,y_train.flatten())

(<tf.Tensor: shape=(1000, 250), dtype=int64, numpy=
 array([[ 318, 2413, 1686, ...,    0,    0,    0],
        [  13,  864,    2, ...,    0,    0,    0],
        [   7,    0,    0, ...,    0,    0,    0],
        ...,
        [  27,  138,    2, ...,    0,    0,    0],
        [ 584,   49,  371, ...,    0,    0,    0],
        [ 460,  195,  569, ...,    0,    0,    0]])>,
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1,

In [172]:
print("1287 ---> ",vectorize_layer.get_vocabulary()[1287])
print("6020 ---> ",vectorize_layer.get_vocabulary()[6020])
print(f"Vocabulary size: {len(vectorize_layer.get_vocabulary())}")

1287 --->  possibly
6020 --->  010ign
Vocabulary size: 6021


In [173]:
# creating a dataset of mapped vectorized text

tf_dataset = tf.data.Dataset.from_tensor_slices((prep_x,y_train))
train_ds = tf_dataset.map(vectorize_text) 
train_ds

<_MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(1,), dtype=tf.int64, name=None))>

In [174]:
# y_train 
y = y_train.reshape(-1, 1) 
y.flatten()

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [175]:
# configure data for performance
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [176]:
import tensorflow_text as tf_text
embedding_dim = 16

model = tf.keras.Sequential([
  tf.keras.layers.Embedding(7000, embedding_dim),
  tf_text.keras.layers.ToDense(mask=True),
  tf.keras.layers.LSTM(32),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')])

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, None, 16)          112000    
                                                                 
 to_dense_6 (ToDense)        (None, None, 16)          0         
                                                                 
 lstm_6 (LSTM)               (None, 32)                6272      
                                                                 
 dense_14 (Dense)            (None, 32)                1056      
                                                                 
 dense_15 (Dense)            (None, 1)                 33        
                                                                 
Total params: 119,361
Trainable params: 119,361
Non-trainable params: 0
_________________________________________________________________


In [190]:
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer='adam',
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)]
)

In [178]:
train_ds

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(1,), dtype=tf.int64, name=None))>

In [180]:
epochs = 3
history = model.fit(
    train_ds,
    epochs=epochs,
    batch_size=10, verbose=1)

Epoch 1/3


Epoch 2/3
Epoch 3/3


In [189]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  tf.keras.layers.Activation('sigmoid')
])

export_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer='adam',
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)]
)

In [188]:
examples = tf.constant([
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."
])

export_model.predict(examples)



array([[0.50000066],
       [0.50000066],
       [0.50000066]], dtype=float32)

In [192]:
import os

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [193]:
os.listdir(dataset_dir)

['imdb.vocab', 'test', 'imdbEr.txt', 'README', 'train']

In [194]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['urls_neg.txt',
 'urls_unsup.txt',
 'pos',
 'unsupBow.feat',
 'urls_pos.txt',
 'unsup',
 'labeledBow.feat',
 'neg']

In [195]:
sample_file = os.path.join(train_dir, 'pos/1181_9.txt')
with open(sample_file) as f:
  print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


In [197]:
remove_dir = os.path.join(train_dir, 'unsup')
# shutil.rmtree(remove_dir)

In [198]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

Found 75000 files belonging to 3 classes.
Using 60000 files for training.


In [202]:
raw_train_ds

<_BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [203]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
  text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL+name)
  
parent_dir = os.path.dirname(text_dir)

parent_dir

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


'/home/haarrublar/.keras/datasets'

In [204]:
def labeler(example, index):
  return example, tf.cast(index, tf.int64)  

labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
  labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)

In [205]:
labeled_data_sets

[<_MapDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 <_MapDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 <_MapDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>]