In [1]:
from collections import Counter
from pathlib import Path

import polars as pl
import spacy
from tqdm.auto import tqdm

from process_docs import load_all_docbins

# Directory passed to get_ngrams / get_depgrams below as the destination
# for the parquet frequency lists they write.
output_dir = Path("../data/slim_pajama_lists")

In [2]:
# Materialize everything load_all_docbins yields into a list so the same docs
# can be iterated more than once (n-gram pass and dependency-gram pass below).
# NOTE(review): `total` is a hard-coded item count used only to size the
# progress bar; update it if the contents of the docbin directory change.
docs = list(
    tqdm(
        load_all_docbins("../data/slim_pajama_docbins/"),
        desc="Loading Docs",
        total=458_047,
    )
)

Loading Docs:   0%|          | 0/458047 [00:00<?, ?it/s]

In [3]:
def get_ngrams(docs, output_dir: "Path | str | None" = None, n: int = 3) -> None:
    """
    Count n-grams in a collection of spaCy docs.

    Only alphabetic tokens (``token.is_alpha``) are kept, in lowercased form.
    The resulting table has one column per n-gram position (``token_0`` ...
    ``token_{n-1}``) plus a ``count`` column, sorted by descending count.
    The table is displayed and, if ``output_dir`` is given, written to
    ``<output_dir>/<n>grams.parquet`` (zstd-compressed).

    Args:
        docs: Iterable of spaCy docs
        output_dir: Directory to save the parquet file (``Path`` or ``str``).
            It is created if it does not exist. If None, nothing is written.
        n: The "n" in "n-gram"; must be at least 1

    Returns:
        None

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    ngram_counter = Counter()

    # Pad each token list with (n - 1) "#" tokens so that the first n-gram of
    # a text has its first real word in the nth position. Every real token
    # then occurs exactly once in the last column, which allows that column to
    # be used as a simple token frequency list. "#" is never alphabetic, so
    # the padding rows can be easily filtered out later.
    padding = ["#"] * (n - 1)

    for doc in tqdm(docs, desc="Processing N-grams"):
        tokens = padding + [token.lower_ for token in doc if token.is_alpha]

        # Sliding window of width n; a doc with no alphabetic tokens yields
        # no n-grams because the padding alone is shorter than n.
        ngram_counter.update(
            tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)
        )

    # Build one column per n-gram position. Iterating the counter and its
    # .values() yields keys and counts in the same (insertion) order.
    data = {
        f"token_{i}": [ngram[i] for ngram in ngram_counter] for i in range(n)
    }
    data["count"] = list(ngram_counter.values())

    # Create DataFrame and sort by count
    ngram_counts = pl.DataFrame(data).sort("count", descending=True)

    # Display dataframe
    display(ngram_counts)

    if output_dir is not None:
        # Accept plain strings and make sure the target directory exists.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        ngram_counts.write_parquet(output_dir / f"{n}grams.parquet", compression="zstd")

In [4]:
# Count trigrams (n=3) over all loaded docs; the table is displayed and
# written to 3grams.parquet inside output_dir.
get_ngrams(docs, output_dir=output_dir, n=3)

Processing N-grams:   0%|          | 0/458047 [00:00<?, ?it/s]

token_0,token_1,token_2,count
str,str,str,i64
"""one""","""of""","""the""",136038
"""as""","""well""","""as""",100633
"""a""","""lot""","""of""",68301
"""the""","""united""","""states""",53923
"""part""","""of""","""the""",53684
…,…,…,…
"""now""","""it""","""demanding""",1
"""it""","""demanding""","""greater""",1
"""demanding""","""greater""","""transparency""",1
"""greater""","""transparency""","""more""",1


In [5]:
def get_depgrams(
    docs, output_dir: "Path | str | None" = None, include_root: bool = True
) -> None:
    """
    Count dependency bigrams in a collection of spaCy docs.

    A dependency bigram is the tuple
    ``(head lemma, head tag, relation, dependent lemma, dependent tag)``
    for every token whose own text and whose head's text are both alphabetic.
    The resulting table is sorted by descending count, displayed and, if
    ``output_dir`` is given, written to ``<output_dir>/depgrams.parquet``
    (zstd-compressed).

    Args:
        docs: Iterable of spaCy docs
        output_dir: Directory to save the parquet file (``Path`` or ``str``).
            It is created if it does not exist. If None, nothing is written.
        include_root: If True (default, the historical behaviour), tokens that
            are their own head are counted too. spaCy makes a sentence root
            its own head, so these show up as ``ROOT`` rows pairing a word
            with itself. Pass False to keep only true head/dependent pairs.

    Returns:
        None
    """
    depgram_counter = Counter()

    for doc in tqdm(docs, desc="Processing Dependency Grams"):
        depgram_counter.update(
            (tok.head.lemma_, tok.head.tag_, tok.dep_, tok.lemma_, tok.tag_)
            for tok in doc
            if tok.is_alpha
            and tok.head.is_alpha
            # Compare positions rather than object identity to detect
            # self-headed (root) tokens.
            and (include_root or tok.head.i != tok.i)
        )

    # Create DataFrame with one row per distinct dependency bigram
    depgram_counts = pl.DataFrame(
        [(*depgram, count) for depgram, count in depgram_counter.items()],
        schema=[
            "head_lemma",
            "head_tag",
            "relation",
            "dependent_lemma",
            "dependent_tag",
            "count",
        ],
        orient="row",
    ).sort("count", descending=True)

    # Display dataframe
    display(depgram_counts)

    if output_dir is not None:
        # Accept plain strings and make sure the target directory exists.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        depgram_counts.write_parquet(output_dir / "depgrams.parquet", compression="zstd")

In [6]:
# Count dependency bigrams over all loaded docs; the table is displayed and
# written to depgrams.parquet inside output_dir.
get_depgrams(docs, output_dir=output_dir)

Processing Depdency Grams:   0%|          | 0/458047 [00:00<?, ?it/s]

head_lemma,head_tag,relation,dependent_lemma,dependent_tag,count
str,str,str,str,str,i64
"""be""","""VBZ""","""ROOT""","""be""","""VBZ""",1487296
"""be""","""VBD""","""ROOT""","""be""","""VBD""",626819
"""be""","""VBP""","""ROOT""","""be""","""VBP""",531054
"""say""","""VBD""","""ROOT""","""say""","""VBD""",311320
"""be""","""VBZ""","""nsubj""","""it""","""PRP""",281580
…,…,…,…,…,…
"""Skake""","""NNP""","""prt""","""up""","""RP""",1
"""Skake""","""NNP""","""dobj""","""view""","""NN""",1
"""bother""","""VBN""","""nsubj""","""public""","""NN""",1
"""Skake""","""NNP""","""advcl""","""bother""","""VBN""",1
