# Stoneburner, Kurt
## DSC 650 - Assignment 10


Links to Deep Learning Sample Code:
- https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/chapter11_part01_introduction.ipynb

- https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/chapter11_part02_sequence-models.ipynb

- https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/chapter11_part03_transformer.ipynb

- https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/chapter11_part04_sequence-to-sequence-learning.ipynb

ngram reference:
- https://www.analyticsvidhya.com/blog/2021/09/what-are-n-grams-and-how-to-implement-them-in-python/

Convert Numpy Array to Tensor:
- https://www.projectpro.io/recipes/convert-numpy-array-tensor

Convert Tensor to Numpy Array:
- https://www.delftstack.com/howto/numpy/python-convert-tensor-to-numpy-array/

In [1]:
import os
from pathlib import Path
import sys
# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

from tensorflow import keras
import tensorflow as tf
import datetime 

#//*** Reusing Code from assignment 04
from chardet.universaldetector import UniversalDetector
from bs4 import BeautifulSoup


import re

#//*** Use the whole window in the IPYNB editor
#//*** display / HTML come from the public IPython.display module; importing them
#//*** from IPython.core.display is deprecated
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Show up to 100 rows and every column when pandas displays a frame
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [2]:
#//*** Get Working Directory
current_dir = Path.cwd()

#//*** Go up three folders: parents[0] is the immediate parent, so parents[2] is three levels up
project_dir = current_dir.parents[2]

#//*** IMDB Data Path
imdb_path = project_dir.joinpath("dsc650/data/external/imdb/aclImdb")

#//*** Folder holding the positive training reviews
pos_train_dir = imdb_path.joinpath("train/pos")

#//*** Grab the first positive review text for testing.
#//*** os.listdir returns names in arbitrary order, so sort to make the pick repeatable
file_path = pos_train_dir.joinpath(sorted(os.listdir(pos_train_dir))[0])

#//*** Explicit encoding so the read does not depend on the platform default
with open(file_path, 'r', encoding="utf-8") as f:
    sample_text = f.read()

print(sample_text)


Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [3]:
#//*** Randomly assign 20% of the training Data and move it to a validation folder
import os
import pathlib
import random
import shutil

val_dir = imdb_path.joinpath("val")
train_dir = imdb_path.joinpath("train")
test_dir = imdb_path.joinpath("test")

for category in ("neg", "pos"):
    #//*** Skip only this category if its val folder exists (Delete Folder to resample).
    #//*** continue (not break) so a missing "pos" folder is still built when "neg" exists
    if os.path.exists(val_dir.joinpath(category)):
        continue

    os.makedirs(val_dir.joinpath(category))

    #//*** Sort first: os.listdir order is arbitrary, so the seeded shuffle is only
    #//*** reproducible when it starts from a fixed ordering
    files = sorted(os.listdir(train_dir.joinpath(category)))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))

    #//*** Slice from an explicit start index: files[-0:] would select every file
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)



# Load IMDB Dataset #

In [4]:
#//*** Use Universal Detector to determine file encoding.
#//*** Borrowed from Assignment04
def read_file_with_encoding(filepath):
    """Return the text of *filepath*, detecting its encoding only when needed.

    The file is first read with the platform default encoding. If that raises
    ``UnicodeDecodeError``, the raw bytes are fed to ``UniversalDetector`` and
    the file is read again with the detected encoding.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The decoded file content as a string.

    Raises:
        UnicodeDecodeError: The default decode failed and either no encoding
            was detected or the detected encoding could not decode the file.
    """
    try:
        with open(filepath) as f:
            return f.read()
    except UnicodeDecodeError:
        #//*** Build the detector only on the fallback path; most files never need it
        detector = UniversalDetector()
        with open(filepath, 'rb') as f:
            #//*** Stream the file line by line and stop as soon as the detector is confident
            for line in f:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']

        #//*** Nothing detected: re-raise the original error instead of
        #//*** silently retrying the same default decode
        if encoding is None:
            raise

        with open(filepath, encoding=encoding) as f:
            return f.read()

#//*** Borrowed from Assignment04
def parse_html_payload(payload):
    """
    Strip markup from a payload with Beautiful Soup's 'html.parser'
    and return only the text it extracts.
    """
    extracted = BeautifulSoup(payload, 'html.parser').get_text()

    #//*** Round-trip the extracted text through UTF-8 bytes before returning it
    utf8_bytes = str(extracted).encode('utf-8')
    return utf8_bytes.decode('utf-8')

def load_dataset(dir_path):
    """Load every review under ``dir_path/neg`` and ``dir_path/pos``.

    Each file is read with ``read_file_with_encoding`` and stripped of HTML
    with ``parse_html_payload``. Files that raise during reading or parsing
    are reported and skipped, so text and targets always stay aligned.

    Args:
        dir_path: ``pathlib.Path`` of a folder containing "neg" and "pos" subfolders.

    Returns:
        A ``(text, targets)`` tuple: the list of review strings and a parallel
        list of labels (0 for "neg", 1 for "pos").
    """
    text = []
    targets = []

    #//*** Crawl the neg and pos folders; the label is fixed per folder
    for category, label in (("neg", 0), ("pos", 1)):
        category_dir = dir_path.joinpath(category)

        #//*** Loop through each file in the folder
        for file in os.listdir(category_dir):
            try:
                #//*** Read File from disk (encoding detected if needed), then Strip HTML Tags
                review = parse_html_payload(
                    read_file_with_encoding(category_dir.joinpath(file)))
            except Exception:
                #//*** Exception (not a bare except) so Ctrl-C / SystemExit still stop the load
                print(f"Dropping File: {file} due to decoding issues")
                continue

            text.append(review)
            targets.append(label)
    return text,targets
#//*** Load the three splits into memory. Each load_dataset call returns
#//*** (list of review text, list of 0/1 labels) for the given folder.
print("Loading Raw Validation Set")
raw_val_text, val_target = load_dataset(val_dir)

print("Loading Raw Train Data")
raw_train_text, train_target = load_dataset(train_dir)

print("Loading Raw Test Data")
raw_test_text, test_target = load_dataset(test_dir)
print("Done")

Loading Raw Validation Set
Loading Raw Train Data
Dropping File: 7714_1.txt due to decoding issues
Dropping File: 11351_9.txt due to decoding issues
Dropping File: 8263_9.txt due to decoding issues
Loading Raw Test Data
Dropping File: 4414_1.txt due to decoding issues
Dropping File: 6973_1.txt due to decoding issues
Dropping File: 2464_10.txt due to decoding issues
Dropping File: 5281_10.txt due to decoding issues
Done


# Assignment 10.1 #

In [48]:
#//*** Vectorize a corpus
class Vectorizer:
    def __init__(self,**kwargs):
        self.corpus_tokens = []
        self.corpus_ngrams = []

        self.max_tokens = None
        self.ngram_size = 1
        self.tidyup = True
        
        for key,value in kwargs.items():
            if key =="max_tokens":
                self.max_tokens = value
                
            if key == "ngrams":
                self.ngram_size = value
            
            if key == "tidyup":
                self.tidyup = value
        
        
        #//*** One Hot Encoding Dictionaries
        #//*** Key = Token Index, Value = Word
        self.ngram_index = {}
        
        #//*** Key = Word, Value = Token Index
        self.vocabulary_index = {}
        
    def tokenize(self,raw_text):
        #//*** Initialize Output Tokens
        tokens = []

        #//*** Split Text into words
        for x in re.split("\s",raw_text):

            #//*** Findall Non text characters in each word
            non_text = re.findall("\W",x)

            #//*** Remove non_text Characters
            for i in non_text:
                x = x.replace(i,"")

            #//*** If X has length, append out
            if len(x) > 0:
                tokens.append(x.lower())
        return tokens

    def build_ngrams(self):
        if self.ngram_size <= 0:
            print("Ngram size must be an integer > 0")
            print("Quitting!")
            return None
        
        #//*** Using unigrams, use tokens
        if self.ngram_size == 1:
            self.corpus_ngrams = self.corpus_tokens
            return

        self.corpus_ngrams = []
        
        #//*** Get each token group from corpus_tokens
        for token in self.corpus_tokens:
            
            loop_ngram = []
            
            #//*** Use an index based range to loop through tokens
            for x in range(0,len(token) ):

                #//*** Check if index + ngram_size exceeds the length of tokens
                if x+self.ngram_size <= len(token):

                    result = ""

                    #//*** Build the ngram
                    for y in range(self.ngram_size):
                        #print(self.tokens[x+y])
                        result += token[x+y] + " "

                    loop_ngram.append(result[:-1])

                else:
                    break
            
            #//*** Token group ngram is built. Add loop_ngram to corpus_ngram
            self.corpus_ngrams.append(loop_ngram)
        

    
    def build_vocabulary(self,corpus):
        if not isinstance(corpus,list) :
            print("Vectorizer Requires a corpus (list of text):")
            return None 
        
        self.tokens = []
        
        print("Tokenizing...")
        #//*** Tokenize each text entry in the corpus
        for raw_text in corpus:
            self.corpus_tokens.append(self.tokenize(raw_text))
        
        print("Building ngrams...")
        #//*** Build ngrams (Defaults to unigrams)
        self.build_ngrams()
        
        word_freq = {}
        
        print("Building Vocabulary...")
        #//*** Build dictionary of unique words
        #//*** Loop through each element of the corpus
        for element in self.corpus_ngrams:
        
            #//*** Process each individual ngram
            for ngram in element:

                #//*** Add unique words to dictionaries
                if ngram not in self.vocabulary_index.keys():
                    index = len(self.ngram_index.values())
                    self.ngram_index[ index ] = ngram
                    self.vocabulary_index [ ngram ] = index
                    
                    #//*** Initialize Word Frequency
                    word_freq[ ngram ] = 1
                else:
                    #//*** Increment Word Frequency
                    word_freq[ ngram ] += 1

        #//*** END for element in self.corpus_ngrams:
        if self.max_tokens != None:
            
            #//*** Check if token count exceeds max tokens
            if self.max_tokens < len(self.ngram_index.items()):
                
                print("Sorting Word Frequency...")
                #//*** Sort the Word Frequency Dictionary. Keep the highest frequency words
                word_freq = dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=True))
                
                print("Building Token Dictionary")
                #//*** Get list of keys that are lowest frequency
                for key in list(word_freq.keys())[self.max_tokens:]:
                    #//*** Delete Low Frequency ngrams
                    del word_freq[ key ]
                
                self.ngram_index = {}
                self.vocabulary_index = {}
                
                print("Rebuilding Vocabulary")
                #//*** Rebuild ngram_index & vocabulary_index
                for ngram in word_freq.keys():
                    index = len(self.ngram_index.values())
                    self.ngram_index[ index ] = ngram
                    self.vocabulary_index [ ngram ] = index        
            
            #//*** END Trim Low Frequency ngrams
        self.word_freq = word_freq

    #//*** One Hot encode the corpus.
    #//*** Handling the corpus as a whole increases processing speed
    #//*** Hot encode to a sparse tensor to for increased encoding speed compared to a dense array
    def one_hot_encode(self,corpus):
        
        #//*** Encoded Results
        results = []
        
        #//*** Set the Max array size to the total number of items in self.ngram_index
        array_size = len(self.ngram_index.keys())
        
 
        start_time = datetime.datetime.now()
        count = 0
        
        
        
        for element in corpus:
            #//*** hot encode each ngram
            result = []
            for ngram in element:
                
                #//*** Skip words not in self.vocabulary_index
                #//*** These are skipped due to max_tokens limitations
                if ngram not in self.vocabulary_index.keys():
                    continue

                sparse_tensor = tf.SparseTensor(indices=[[0,self.vocabulary_index[ngram]]],values=[1],dense_shape=[1,array_size])
                #index = self.vocabulary_index[ngram]
                
                #base_array = np.zeros(array_size, dtype=int)
                
                #base_array [index] = 1
                
                
                #//*** Add the one-hot-encoded word to encoded text
                result.append(sparse_tensor)
                

            #//*** END for ngram in tokens:
            
            result = tf.sparse.concat(axis=1, sp_inputs=result)
            #//*** concat Sparse Matrix
            results.append( result )
            
            count += 1
            
            
            
            #//*** Print a status update every 1000 items
            if count % 100 == 0:
                print(f"{count} / {len(corpus)} Encoded: {datetime.datetime.now() - start_time}")
        
        #//*** Concat List of Sparse Matrixes into a sparse matrix
        #results =  tf.sparse.concat(axis=1, sp_inputs=results)
        
        print(f"Encoding Complete: {datetime.datetime.now() - start_time}")
        
        return results        
    
    def encode(self,corpus):
        
        if not isinstance(corpus,list) :
            print("Vectorizer Requires a corpus (list of text):")
            return None

        self.corpus_tokens = []
        self.corpus_ngrams = []
        print("Tokenizing...")
        #//*** Tokenize each text entry in the corpus
        for raw_text in corpus:
            self.corpus_tokens.append(self.tokenize(raw_text))
        
        print("Building ngrams...")
        #//*** Build ngrams (Defaults to unigrams)
        self.build_ngrams()
        
        print("One Hot Coding....")
        #//*** One hot encode each text element


        #//*** One Hot Encode Values. These are actually sparse tensors for speed.
        encoded = self.one_hot_encode(self.corpus_ngrams)
   
        #//*** TidyUp (Delete) ngrams and Tokens
        if self.tidyup:
            self.corpus_tokens = []
            self.corpus_ngrams = []
            
        return encoded
    
    #//*** Convert One-Hot-Encoding to text
    def decode(self,elements):
        
        results = []
        
        #//*** For Each element in Corpus
            
        decoded = ""

        #//*** For Each ngram (word(s)) in Elements
        for ngram in elements:

            #//*** Grab Index of 1 from sparse tensor
            index = ngram.indices[0].numpy()[1]
            
            #ngram = list(ngram.numpy())

            decoded += self.ngram_index[ index ] + " "

        #//*** END for ngram in elements:
        results.append( decoded[:-1])
            
        #//*** END for elements in corpus:
        return results


#//*** Test the Vectorizer with some sample data
#//*** tidyup=False keeps corpus_tokens / corpus_ngrams available for the prints below
vectorizer = Vectorizer(max_tokens=100,ngrams=2, tidyup=False)
vectorizer.build_vocabulary(raw_val_text[:5])
start_time = datetime.datetime.now()

temp_vals = vectorizer.encode(raw_val_text[:5])

print(f"Run Time: {datetime.datetime.now() - start_time}")


print("Sample Text: (First 500 Chars)")
for element in raw_val_text[:5]:
    print(element[:500])
    print("====")
print()
print()

print("Tokens: (First 100 tokens)")
for token in vectorizer.corpus_tokens:
    print(token[:100])
    print("====")
print()
print()

#//*** Label now matches the slice: 100 ngrams are printed per document
print("ngrams: (First 100 ngrams)")
for ngram_group in vectorizer.corpus_ngrams:
    print(ngram_group[:100])
    print("====")
print()
print()
print("Small one hot encoded Sample:")
print(temp_vals)
print()
print()
print("Encoded Vocabulary")
print(vectorizer.vocabulary_index)
print()
print()
print("Decoded Text from vocabulary (limited by max tokens)")
#//*** NOTE(review): the decode call is disabled, so nothing follows the heading above
#for result in vectorizer.decode(temp_vals):
#    print(result)
#    print()

#//*** Release the sample encoding and the vectorizer
del temp_vals

del vectorizer


Tokenizing...
Building ngrams...
Building Vocabulary...
Sorting Word Frequency...
Building Token Dictionary
Rebuilding Vocabulary
Tokenizing...
Building ngrams...
One Hot Coding....
Encoding Complete: 0:00:00.030889
Run Time: 0:00:00.033881
Sample Text: (First 500 Chars)
Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinlan) & her son. The luxury jetliner takes off as planned but mid-air the plane is hi-jacked by the co-pilot Chambers (Robert Foxworth) & his two accomplice's Banker (Monte Markh
====
This film lacked something I couldn't put my finger on at first: charisma on the part of the leading actress. This inevitably translated to lack of chemistry when she shared the screen with her leading man. Even the romant

In [91]:
#//*** Full-corpus encoding run. Every Vectorizer call in this cell is commented out,
#//*** so executing it only prints the three progress labels and a blank line.
#//*** Test the Vectorizer with some sample data
#max_tokens = 20000
#ngrams = 1
#vectorizer = Vectorizer(max_tokens=max_tokens,ngrams=ngrams)

#//*** Build Vocabulary based on the training text
#vectorizer.build_vocabulary(raw_train_text)

#//*** Encode Validation, training and test data

print("Encoding Validation Data...")
#val_train = vectorizer.encode(raw_val_text)

print("Encoding Training Data...")
#x_train = vectorizer.encode(raw_train_text)

print("Encoding Test Data...")
#//*** NOTE(review): the disabled line below would store the encoded TEST text in y_train
#y_train = vectorizer.encode(raw_test_text)

#//*** Bare string literal used as a block comment: scratch code that timed
#//*** encoding the first 1000 validation reviews. It is never executed.
"""
raw_train_text
raw_val_text
raw_test_text

start_time = datetime.datetime.now()
val_train = vectorizer.encode(raw_val_text[:1000])
print(f"Validation set Encoded: {datetime.datetime.now() - start_time}")
"""
print()

Encoding Validation Data...
Encoding Training Data...
Encoding Test Data...



Word-embedding reference (Deep Learning with Python, first edition, notebook 6.1):
- https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/first_edition/6.1-using-word-embeddings.ipynb

In [106]:


# Baseline data for the embedding model: this cell uses the IMDB dataset bundled
# with Keras (reviews already encoded as integers), not the raw files loaded above.
# It rebinds x_train / y_train / x_test / y_test for the cells that follow.
from keras.datasets import imdb
from keras import preprocessing

# Number of words to consider as features
max_features = 10000
# Cut texts after this number of words 
# (among top max_features most common words)
# NOTE(review): pad_sequences truncates from the front by default, which would keep
# the LAST maxlen words of each review rather than the first -- confirm intended
maxlen = 20

# Load the data as lists of integers.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# This turns our lists of integers
# into a 2D integer tensor of shape `(samples, maxlen)`
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [119]:
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.layers import Embedding

model = Sequential()
# We specify the maximum input length to our Embedding layer
# so we can later flatten the embedded inputs.
# The vocabulary size is max_features (the num_words cap used when the data
# was loaded), so the two cannot drift apart; 8 is the embedding dimension.
model.add(Embedding(max_features, 8, input_length=maxlen))
# After the Embedding layer, 
# our activations have shape `(samples, maxlen, 8)`.

# We flatten the 3D tensor of embeddings 
# into a 2D tensor of shape `(samples, maxlen * 8)`
model.add(Flatten())

# We add the classifier on top: one sigmoid unit for the binary sentiment label
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# Hold out the last 20% of the training samples for validation during fitting
history = model.fit(x_train, y_train,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.2)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [118]:


# Evaluate on the held-out test set; the model was compiled with a single
# metric ('acc'), so score holds [loss, accuracy]
score = model.evaluate(x_test, y_test, verbose = 0)
test_loss, test_accuracy = score[0], score[1]

print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)


Test loss: 0.5094160437583923
Test accuracy: 0.7583199739456177


In [76]:
from tensorflow.keras.layers import TextVectorization

# NOTE(review): train_ds, val_ds and test_ds are not created in any cell shown in
# this notebook export -- confirm where they come from before re-running

# Multi-hot text encoder with the vocabulary capped at 20000 entries
text_vectorization = TextVectorization(max_tokens=20000, output_mode="multi_hot")

# Learn the vocabulary from the training text only (labels dropped)
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)


def _vectorize_pair(text, label):
    """Replace the raw text of a (text, label) pair with its multi-hot vector."""
    return text_vectorization(text), label


# Apply the same encoding to all three splits
binary_1gram_train_ds = train_ds.map(_vectorize_pair, num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(_vectorize_pair, num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(_vectorize_pair, num_parallel_calls=4)

In [78]:
# Peek at the first batch only: take(1) limits the iteration to a single element
for inputs, targets in binary_1gram_train_ds.take(1):
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


In [89]:

# Show which dataset class the mapped training pipeline is (notebook echoes the value)
type(binary_1gram_train_ds)

tensorflow.python.data.ops.dataset_ops.ParallelMapDataset

In [None]:
# //*** CODE HERE