In [1]:
import json
from collections import Counter
from pathlib import Path
from typing import Dict, Iterator, List

import pandas as pd # type: ignore

In [2]:
# Absolute path of the directory this notebook reads its input from and writes
# its results to (resolved against the current working directory).
ASSETS = (Path("..") / "assets").resolve()

# Type of one decoded JSONL record: string keys mapped to lists of strings.
Phrases = Dict[str, List[str]]

In [3]:
def read_jsonl(p: Path) -> Iterator[Phrases]:
    """Lazily yield one decoded JSON record per non-blank line of ``p``.

    Args:
        p: Path to a UTF-8 encoded JSON Lines file.

    Yields:
        The value decoded from each non-blank line by ``json.loads``.

    Raises:
        OSError: If ``p`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with p.open("r", encoding="utf-8") as lines:
        for line in lines:
            # Whitespace-only lines (e.g. a stray blank line at the end of the
            # file) carry no record and would make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)

In [4]:
# Flatten the phrase lists of every record into one list and count the records.
phrases: List[str] = []
total_documents = 0
for d in read_jsonl(ASSETS / "output.jsonl"):
    # Iterate over the values instead of star-unpacking them: list.extend
    # accepts exactly one iterable, so unpacking raises TypeError for any
    # record that has zero keys or more than one key.
    for doc_phrases in d.values():
        phrases.extend(doc_phrases)
    total_documents += 1

In [5]:
# One row per distinct phrase, ordered from most to least frequent.
# The column names are passed to the constructor so that an empty phrase list
# still yields a (zero-row) frame with both columns; assigning `df.columns`
# afterwards raises a length-mismatch ValueError on an empty frame.
df = pd.DataFrame(
    Counter(phrases).most_common(),
    columns=["phrase", "frequency"],
)

In [6]:
# Headline statistics for the phrase table.
total_phrases = df["frequency"].sum()
unique_phrases = len(df)
repeated_phrases = len(df.loc[df["frequency"] > 1, "phrase"])

# The pad widths (28/30/29) compensate for the differing label lengths so the
# first three values are right-aligned to the same column.
summary_rows = (
    ("Total documents: {:>28}", total_documents),
    ("Total phrases: {:>30}", total_phrases),
    ("Unique phrases: {:>29}", unique_phrases),
    ("Unique phrases that occur more than once: {}", repeated_phrases),
)
for template, value in summary_rows:
    print(template.format(value))

Total documents:                          313
Total phrases:                            634
Unique phrases:                           195
Unique phrases that occur more than once: 62


In [7]:
# Save the phrase/frequency table into the assets directory; index=False keeps
# the row index out of the CSV.
frequencies_path = ASSETS / "frequencies.csv"
df.to_csv(frequencies_path, index=False)