# CHOMP v2
__File Processor__

__by Kate Gilleran__  
__Last updated November 30__, __2021__  
[https://github.com/kwgws/chomp2](https://github.com/kwgws/chomp2)

This notebook processes raw text files into data more suitable for topic modelling.
1. Tokenize the text into individual words.
2. Remove stopwords.
3. Lemmatize: replace inflected word forms with their common base form (lemma) where possible.
4. Save the result to a new file.

## 1. Set Paths

In [None]:
# Directory that is scanned for raw ".txt" files to process.
in_path = "./downloads"
# Directory the processed files are written to, under the same file names.
# A file that already exists here is skipped when the work list is built.
out_path = "./corpus"

# UTF-8 text file of extra stopwords, one per line, merged with NLTK's
# English stopword list.
stop_words_file = "stopwords.txt"

## 2. Load

In [None]:
import os

# Build the work list: every ".txt" file in in_path that does not yet have a
# same-named counterpart in out_path. Files that already have one are counted
# as skipped, so an interrupted run can be resumed without redoing them.
files = []
skip_count = 0

# os.listdir returns entries in arbitrary order; sort so the processing order
# (and therefore the progress log) is the same on every run.
for file in sorted(os.listdir(in_path)):

    if not file.endswith(".txt"):
        continue

    # A directory that merely happens to be named "*.txt" cannot be opened as
    # a text file later on, so leave it out of the work list.
    if not os.path.isfile(os.path.join(in_path, file)):
        continue

    if os.path.exists(os.path.join(out_path, file)):
        skip_count += 1
        continue

    files.append(file)

# Derived from the list itself so the two can never drift apart.
total_count = len(files)

# exist_ok=True creates the directory if needed without a separate
# check-then-create step.
os.makedirs(os.path.abspath(out_path), exist_ok=True)

print(f"Found {total_count} files to process ({skip_count} skipped).")

## 3. Process

This can take a very long time, especially with large data sets! We'll print out a message before each file with a note as to how far we've gotten.

In [None]:
from autocorrect import Speller
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import regex as re
from unidecode import unidecode

# Fetch the NLTK resources used below: the POS tagger model, the tokenizer
# model, the stopword list, WordNet, and the universal tagset mapping.
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("universal_tagset")
wnl = WordNetLemmatizer()

# NLTK's English stopwords plus the user-supplied ones (one per line).
# Blank lines are dropped rather than added as an empty-string entry.
stop_words = set(stopwords.words("english"))
with open(stop_words_file, "r", encoding="utf-8") as f:
    stop_words.update(w for w in (line.strip() for line in f) if w)

# Rejoins a word that was hyphenated across a line break ("exam-\nple").
# Whitespace between the hyphen and the newline is optional: the previous
# pattern required exactly one such character, so the most common case -- a
# hyphen immediately followed by the newline -- was never rejoined.
dehyphenator = re.compile(r"(?<=[A-Za-z])-\s*\n(?=[A-Za-z])")
# Collapses every run of non-alphanumeric characters into a single separator.
defuzzer = re.compile(r"([^a-zA-Z0-9]+)")

# NOTE(review): options kept exactly as in the original; confirm against the
# autocorrect documentation that they suit this corpus.
spell = Speller(fast=True, only_replacements=True)

# Universal POS tag -> WordNet POS code expected by the lemmatizer.
# Any tag not listed here is lemmatized as a noun ("n").
wordnet_pos = {"VERB": "v", "ADJ": "a", "ADV": "r"}

for i, file in enumerate(files, start=1):

    print(f"{i}/{total_count} ({((i / total_count)*100.0):.0f}%): {file}")

    # Load file, transliterating non-ASCII characters to ASCII.
    with open(os.path.join(in_path, file), "r", encoding="utf-8") as f:
        raw_text = unidecode(f.read())

    # De-fuzz: rejoin hyphenated line breaks first (the next step would
    # otherwise turn the hyphen into a space), then strip punctuation.
    cleaned_text = defuzzer.sub(" ", dehyphenator.sub("", raw_text))

    # Tokenize and remove stopwords. The lower-cased form is checked as well
    # because NLTK's list is all lower case, so capitalised stopwords such as
    # "The" would otherwise slip through; the exact-case check is kept so
    # custom entries containing capitals still match.
    tokens = [
        w
        for w in word_tokenize(cleaned_text)
        if w not in stop_words and w.lower() not in stop_words
    ]

    # Lemmatize, using each token's part of speech to pick the right lemma.
    lemmas = [
        wnl.lemmatize(word, pos=wordnet_pos.get(tag, "n"))
        for word, tag in pos_tag(tokens, tagset="universal")
    ]

    # One last check to standardize spelling.
    text = spell(" ".join(lemmas))

    # Save updated text to new file. The encoding is explicit so the output
    # does not depend on the platform default and matches how files are read.
    with open(os.path.join(out_path, file), "w", encoding="utf-8") as f:
        f.write(text)

print("\n** DONE! **")