# Intent
The purpose of this notebook is to preprocess raw text downloaded from a [website](https://www.hplovecraft.com/writings/texts/). We will replace punctuation with special tokens, collapse unnecessary white space, convert to lowercase, and otherwise standardize all text files into a single sequence of tokens that is saved to disk for processing in an ML model.

In [1]:
from collections import Counter
import os 
import pickle

In [2]:
def load_directory(data_dir: str) -> str:
    """
    Load all text files in the specified directory. The contents of these files are then concatenated into one, long string.

    Files are read in sorted filename order. os.listdir returns entries in arbitrary order, so without sorting
    the concatenated text (and everything derived from it) could differ between runs or machines.

    :param data_dir: The data directory containing the text files we want to load.
    :return all_text_data: The contents of every '.txt' file in the directory, joined by single spaces.
    """
    # Relevant files, in a deterministic order
    text_files = sorted(
        text_file for text_file in os.listdir(data_dir)
        if text_file.endswith('.txt')
    )

    # Combine all text data into one string
    all_text_data = " ".join(
        load_data(os.path.join(data_dir, text_file))
        for text_file in text_files
    )

    print(f"Loaded {len(text_files)} text files from {data_dir}")
    return all_text_data


def load_data(path: str, encoding: str = "utf-8") -> str:
    """
    Open and read data from specified file.

    The file is decoded with an explicit encoding instead of the platform default, so the same file yields the
    same text on every operating system (non-ASCII punctuation such as the em dash would otherwise be decoded
    differently depending on the locale).

    :param path: Path to the file.
    :param encoding: Text encoding used to decode the file. Defaults to UTF-8; pass another codec
        (e.g. 'cp1252') if the source files were saved differently.
    :return data: The contents of the file.
    """
    with open(path, "r", encoding=encoding) as f:
        data = f.read()
    return data


def token_lookup() -> dict:
    """
    Build the mapping that swaps punctuation for explicit, word-like tokens.

    Each replacement is padded with a space on both sides so that, after substitution, the token stands
    apart from the neighbouring words. The hyphen and the em dash share the same <DASH> token.

    :return tokens: Dictionary whose keys are punctuation characters and whose values are the replacement tokens
    """
    return {
        '.': ' <PERIOD> ',
        '!': ' <EXCLAMATION_MARK> ',
        '?': ' <QUESTION_MARK> ',
        ',': ' <COMMA> ',
        '"': ' <QUOTATION_MARK> ',
        ';': ' <SEMICOLON> ',
        '(': ' <LEFT_PAREN> ',
        ')': ' <RIGHT_PAREN> ',
        '-': ' <DASH> ',
        '—': ' <DASH> ',
        '\n': ' <NEW_LINE> ',
    }


def clean_text(text: str, special_tokens_dict: dict) -> str:
    """
    Normalize raw text: substitute special tokens, squeeze whitespace, and lowercase.

    Note that lowercasing happens after substitution, so the inserted tokens end up lowercase as well
    (e.g. ' <PERIOD> ' appears in the result as '<period>').

    :param text: The raw text to clean.
    :param special_tokens_dict: Mapping of substring -> replacement, applied in the dict's iteration order.
    :return: The cleaned text, with words and tokens separated by single spaces.
    """
    # Swap every mapped substring for its replacement token
    for symbol, replacement in special_tokens_dict.items():
        text = text.replace(symbol, replacement)

    # Collapse any run of whitespace into one space, then fold case to shrink the vocabulary
    single_spaced = ' '.join(text.split())
    return single_spaced.lower()


def filter_text(text: str, threshold: int = 3) -> list:
    """
    Split text into tokens and drop the tokens that occur too rarely.

    :param text: Text whose tokens are separated by whitespace.
    :param threshold: A token is kept only if it occurs more than this many times in the text.
        Defaults to 3, the cutoff that was previously hard-coded.
    :return trimmed_tokens: The surviving tokens, in their original order (repeats included).
    """
    tokens = text.split()
    word_counter = Counter(tokens)
    # Look the count up directly: a dict lookup per token keeps this pass linear, whereas testing
    # membership in a list of kept words scanned that list once for every token.
    trimmed_tokens = [t for t in tokens if word_counter[t] > threshold]
    return trimmed_tokens


def create_lookup_tables(text):
    """
    Build the two vocabulary dictionaries that translate between tokens and integer ids.

    Ids are handed out by descending token frequency (id 0 is the most common token); tokens with the
    same frequency keep the order in which they were first seen. No tokens are filtered out here, and
    the '<PAD>' filler token is always part of the vocabulary.

    :param text: Iterable of tokens to build the vocabulary from
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    frequencies = Counter(text)
    # Bump the filler token so it is guaranteed to receive an id
    frequencies['<PAD>'] += 1

    # most_common() orders by count, highest first, leaving ties in first-seen order
    ranked_words = [word for word, _ in frequencies.most_common()]

    vocab_to_int = {word: idx for idx, word in enumerate(ranked_words)}
    int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

In [3]:
def preprocess_and_save_data(data_dir: str, output_file: str = 'preprocess.dat') -> None:
    """
    Preprocess Text Data

    Runs the whole pipeline: load the text files in data_dir, replace punctuation with tokens, normalize
    the text, drop rare tokens, build the vocabulary lookup tables, encode the tokens as integers, and
    pickle the result.

    :param data_dir: Directory containing the raw text files.
    :param output_file: Path of the pickle file to write. Defaults to 'preprocess.dat'.
    :return: None. The tuple (int_text, vocab_to_int, int_to_vocab, special_tokens_dict) is written to output_file.
    """
    all_text_data = load_directory(data_dir)
    special_tokens_dict = token_lookup()
    cleaned_text = clean_text(all_text_data, special_tokens_dict)
    tokens = filter_text(cleaned_text)
    vocab_to_int, int_to_vocab = create_lookup_tables(tokens)
    int_text = [vocab_to_int[word] for word in tokens]

    # Use a context manager so the file is flushed and closed even if pickling raises;
    # passing a bare open(...) to pickle.dump left the handle open.
    with open(output_file, 'wb') as f:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, special_tokens_dict), f)


In [4]:
# Relevant directories
# os.path.abspath('') resolves to the current working directory of the kernel, so the raw text files
# are expected in a "data" folder directly beneath it.
current_dir = os.path.abspath('')
data_dir = os.path.join(current_dir, "data")
print(f"Data directory is: {data_dir}")

# Run the full pipeline; the result is pickled to the default output file ('preprocess.dat').
preprocess_and_save_data(data_dir)

Data directory is: C:\Users\14196\Documents\Projects\Machine_Learning_Projects\NLP_Projects\lovecraft_text_generator\data
Loaded 105 text files from C:\Users\14196\Documents\Projects\Machine_Learning_Projects\NLP_Projects\lovecraft_text_generator\data


## Modifications
We are struggling to get below a training Cross Entropy Loss of 4.5. Resources indicate that to achieve a more realistic output, a loss of 3.5 or lower is encouraged. 

Therefore, to help improve performance, the analysis below was performed. It showed that tokens occurring fewer than 3 times constituted a total of 1.79% of all tokens in the text, a drastic increase in occurrence over tokens occurring only once (1.28%). **TODO:** more here on why this value was selected, if results are promising.

In [5]:
# NOTE: this analysis is disabled. The whole cell body is a bare string literal, so none of the
# code inside it executes; the cell merely evaluates to (and displays) the string itself.
# NOTE(review): before re-enabling, confirm two things in the code below:
#   * `int_text` iterates over the keys of `vocab_to_int` rather than over `tokens`;
#   * `per` divides the number of distinct infrequent tokens by the total token count, which is
#     not the same as the share of token occurrences — confirm which metric is intended.
"""
all_text_data = load_directory(data_dir)
special_tokens_dict = token_lookup()
cleaned_text = clean_text(all_text_data, special_tokens_dict)
tokens = cleaned_text.split(" ")
vocab_to_int, int_to_vocab = create_lookup_tables(tokens)
int_text = [vocab_to_int[word] for word in vocab_to_int]


word_counter = Counter(tokens)

for token_freq in range(1,6):
    infrequent_tokens = [(token, word_counter[token]) for token in word_counter if word_counter[token] <= token_freq]
    per = len(infrequent_tokens)/len(tokens)
    print(f"{len(infrequent_tokens)} tokens occur at or less than {token_freq} time{'s' if token_freq > 1 else ''}.")
    if token_freq > 1:
          it_prev = [(token, word_counter[token]) for token in word_counter if word_counter[token] <= (token_freq - 1)]
          per_change = (len(infrequent_tokens) - len(it_prev)) / len(it_prev) * 100
          print(f"A {per_change:.2f}% change from the previous iteration.")
    print(f"This is {per*100:.2f}% of the total tokens.\n")
"""

'\nall_text_data = load_directory(data_dir)\nspecial_tokens_dict = token_lookup()\ncleaned_text = clean_text(all_text_data, special_tokens_dict)\ntokens = cleaned_text.split(" ")\ntokens.append(\'<PAD>\')\nvocab_to_int, int_to_vocab = create_lookup_tables(tokens)\nint_text = [vocab_to_int[word] for word in vocab_to_int]\n\n\nword_counter = Counter(tokens)\n\nfor token_freq in range(1,6):\n    infrequent_tokens = [(token, word_counter[token]) for token in word_counter if word_counter[token] <= token_freq]\n    per = len(infrequent_tokens)/len(tokens)\n    print(f"{len(infrequent_tokens)} tokens occur at or less than {token_freq} time{\'s\' if token_freq > 1 else \'\'}.")\n    if token_freq > 1:\n          it_prev = [(token, word_counter[token]) for token in word_counter if word_counter[token] <= (token_freq - 1)]\n          per_change = (len(infrequent_tokens) - len(it_prev)) / len(it_prev) * 100\n          print(f"A {per_change:.2f}% change from the previous iteration.")\n    print(f"T