In [None]:
import os, re, html, pandas as pd, stanza, pathlib, torch
from tqdm import tqdm
from stanza.utils.conll import CoNLL

In [4]:
BASE_DIR = pathlib.Path('dataset')
FILES = [
    BASE_DIR / 'A1.csv',
    BASE_DIR / 'A2.csv',
    BASE_DIR / 'B1.csv',
    BASE_DIR / 'B2.csv',
    BASE_DIR / 'C1.csv',
    BASE_DIR / 'C2.csv',
]
LANG  = "en"
dfs = []
for f in FILES:
    if os.path.exists(f):
        dfs.append(pd.read_csv(f, encoding="utf-8"))
if not dfs:
    raise FileNotFoundError("None of the CSVs were found. Check FILES paths.")
data = pd.concat(dfs, ignore_index=True)
if "text" not in data.columns:
    raise ValueError("CSV must contain a 'text' column.")
data = data.dropna(subset=["text"]).reset_index(drop=True)

In [5]:
#Optional for testing
# data = data.head(200)
levels = data.label.unique()

In [6]:
stanza.download(LANG, processors="tokenize,mwt,pos,lemma,depparse", verbose=False)
USE_GPU = False
try:
    import torch
    USE_GPU = torch.cuda.is_available()
except Exception:
    pass
nlp = stanza.Pipeline(LANG, processors="tokenize,mwt,pos,lemma,depparse",use_gpu=USE_GPU, verbose=False)

In [19]:
os.makedirs("dataset/tagged-stanza", exist_ok=True)

In [8]:
for l in levels:
    cefr = data[data["label"]==l]
    TEMPLATE = "dataset/tagged-stanza/{}_{:04d}.conllu"
    for i, txt in tqdm(enumerate(cefr["text"].astype(str), start=1), total=len(data)):
        doc = nlp(txt)
        # This writes a NEW file per document (row)
        CoNLL.write_doc2conll(doc, TEMPLATE.format(l, i))

 19%|████████▎                                   | 282/1488 [00:45<03:14,  6.18it/s]
 18%|████████                                    | 272/1488 [01:34<07:03,  2.87it/s]
 14%|██████                                      | 205/1488 [02:05<13:06,  1.63it/s]
 19%|████████▍                                   | 286/1488 [03:27<14:33,  1.38it/s]
 16%|███████▏                                    | 241/1488 [03:59<20:39,  1.01it/s]
 14%|█████▉                                      | 202/1488 [03:37<23:05,  1.08s/it]


In [9]:
levels

array(['A1', 'A2', 'B1', 'B2', 'C1', 'C2'], dtype=object)