# CHOMP v2
__Misc. Utilities__

__by Kate Gilleran__  
__Last updated November 30, 2021__  
[https://github.com/seangilleran/chomp2](https://github.com/seangilleran/chomp2)

## Check Language (With Spacy)

In [None]:
import os

import en_core_web_sm
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# Directory that is scanned for the .txt files to be checked.
path = "./corpus"


@Language.factory("language_detector")
def language_detector(nlp, name):
    """Create the language-detection pipeline component.

    Registered with spaCy under the factory name "language_detector". Both
    arguments are accepted but not used by this factory.
    """
    detector = LanguageDetector()
    return detector


# Load the en_core_web_sm pipeline and attach the detector as its last stage.
nlp = en_core_web_sm.load()
nlp.add_pipe("language_detector", last=True)

# Run every .txt file found in `path` through the pipeline, echo the detected
# language to the console and append it to lang_check.csv.
txt_names = [entry for entry in os.listdir(path) if entry.endswith(".txt")]
for doc_name in txt_names:

    doc_path = os.path.join(path, doc_name)
    with open(doc_path, "r", encoding="utf-8") as src:
        text = src.read()

    # The detector exposes a dict with "language" and "score" on the doc.
    result = nlp(text)._.language
    language, score = result["language"], result["score"]

    print(f"{language.capitalize()} ({(score * 100):.0f}%): {doc_name}")

    # One "language,score,filename" row per document.
    with open("lang_check.csv", "a", encoding="utf-8") as out:
        out.write(f"{language},{score},{doc_name}\n")

## Check Language (With Tag)

In [None]:
import json
import os

# Directory that is scanned for the .json metadata files.
path = "./meta"

# For every metadata file, append one "language,name,id" row to lang_check.csv
# for each file entry listed under each item of the collection.
for meta_name in [f for f in os.listdir(path) if f.endswith(".json")]:

    with open(os.path.join(path, meta_name), "r", encoding="utf-8") as meta_file:
        collection = json.load(meta_file)

    # Distinct names are used for the inner loop so that it no longer rebinds
    # the variable holding the metadata file name.
    rows = []
    for item in collection["items"]:
        lang = item["language"]
        for entry in item["files"]:
            rows.append(f"{lang},{entry['name']},{entry['id']}\n")

    # Open the output once per metadata file rather than once per row; skip
    # the open entirely when there is nothing to write.
    if rows:
        with open("lang_check.csv", "a", encoding="utf-8") as out:
            out.writelines(rows)

## Convert PDF to TXT

In [None]:
# TODO

## Check Words vs. Enchant Spellcheck

In [None]:
import os

from nltk.tokenize import word_tokenize
import regex as re

# Corpus directory. A forward slash is used because a backslash separator only
# works on Windows and "\c" is not a valid string escape sequence.
path = "./corpus"
words = set()

# Matches tokens that consist of ASCII letters from start to end.
az = re.compile(r"^[a-zA-Z]+$")

files = [f for f in os.listdir(path) if f.endswith(".txt")]

print(f"Loading {len(files)} files...")

# Tokenize each file and collect every purely alphabetic token; `words` is a
# set, so duplicates across files are counted once.
for file in files:
    word_count = len(words)
    with open(os.path.join(path, file), "r", encoding="utf-8") as src:
        text = src.read()
    words.update(token for token in word_tokenize(text) if az.search(token))
    print(f"Added {len(words) - word_count} words from {file}.")

print("\n** DONE! **")
print(f"Found {len(words)} unique words.")

In [None]:
import enchant

d = enchant.Dict("en_US")

# Sort so that words are checked, printed and logged in a stable order.
words = sorted(words)
print(f"Checking {len(words)} words...")

ok_count = 0
nf_count = 0

# Keep the log open for the whole run instead of reopening it for every miss.
with open("words.txt", "a", encoding="utf-8") as log:
    for word in words:
        print(f"(?) {word} ...", end="")
        if d.check(word):
            ok_count += 1
            print("OK")
        else:
            # Words the dictionary rejects are appended to words.txt.
            nf_count += 1
            log.write(f"{word}\n")
            print("Not Found")

total = ok_count + nf_count
# Guard the ratio so that an empty word list does not divide by zero.
ok_ratio = ok_count / total if total else 0.0
print("\n** DONE! **")
print(f"Could not find spelling for {nf_count} words out of {total} (corpus {ok_ratio:.0%} ok).")