In [None]:
import os, re, html, pandas as pd, stanza, pathlib, torch
from tqdm import tqdm
from stanza.utils.conll import CoNLL

In [3]:
BASE_DIR = pathlib.Path('dataset')
FILES = [
    # BASE_DIR / 'A1.csv',
    # BASE_DIR / 'B2.csv',
    # BASE_DIR / 'C2.csv',
    BASE_DIR / 'cefr_leveled_texts.csv',
]
LANG  = "en"
dfs = []
for f in FILES:
    if os.path.exists(f):
        dfs.append(pd.read_csv(f, encoding="utf-8"))
if not dfs:
    raise FileNotFoundError("None of the CSVs were found. Check FILES paths.")
data = pd.concat(dfs, ignore_index=True)
if "text" not in data.columns:
    raise ValueError("CSV must contain a 'text' column.")
data = data.dropna(subset=["text"]).reset_index(drop=True)

In [9]:
#Optional for testing
# data = data.head(200)
levels = data.label.unique()

In [11]:
stanza.download(LANG, processors="tokenize,mwt,pos,lemma,depparse", verbose=False)
USE_GPU = False
try:
    import torch
    USE_GPU = torch.cuda.is_available()
except Exception:
    pass
nlp = stanza.Pipeline(LANG, processors="tokenize,mwt,pos,lemma,depparse",use_gpu=USE_GPU, verbose=False)

In [19]:
os.makedirs("dataset/tagged-stanza", exist_ok=True)

In [20]:
for l in levels:
    cefr = data[data["label"]==l]
    TEMPLATE = "dataset/tagged-stanza/{}_{:04d}.conllu"
    for i, txt in tqdm(enumerate(cefr["text"].astype(str), start=1), total=len(data)):
        doc = nlp(txt)
        # This writes a NEW file per document (row)
        CoNLL.write_doc2conll(doc, TEMPLATE.format(l, i))

 19%|█████████████████████████                                                                                                          | 286/1494 [03:31<14:51,  1.36it/s]
 18%|███████████████████████▊                                                                                                           | 272/1494 [01:33<06:59,  2.91it/s]
 16%|█████████████████████▏                                                                                                             | 241/1494 [04:01<20:55,  1.00s/it]
 14%|█████████████████▉                                                                                                                 | 205/1494 [02:06<13:14,  1.62it/s]
 19%|█████████████████████████▎                                                                                                         | 288/1494 [00:50<03:33,  5.65it/s]
 14%|█████████████████▋                                                                                                                 | 20

In [10]:
levels

array(['B2', 'A2', 'C1', 'B1', 'A1', 'C2'], dtype=object)