In [6]:
# --- Preprocessing (kept simple) ---
import re

#!pip install stopwordsiso
import stopwordsiso

#nltk.download('stopwords')

def don(text):
    """Identity preprocessor: hand ``text`` back exactly as it was received."""
    unchanged = text
    return unchanged
    
def lower(text):
    """Return the lower-cased form of ``text`` (via its ``lower()`` method)."""
    lowered = text.lower()
    return lowered

# Matches "http://..." / "https://..." links as well as bare "www...." hosts;
# each match runs up to the next whitespace character. Matching is
# case-sensitive.
URL_RE = re.compile(r"https?://\S+|www\.\S+")


def remove_urls(text):
    """Delete every URL_RE match from ``text``; surrounding spaces are kept."""
    return re.sub(URL_RE, "", text)


def replace_urls(text):
    """Swap every URL_RE match in ``text`` for the space-padded marker ``<URL>``."""
    return re.sub(URL_RE, " <URL> ", text)
    
# Any single character that is neither a word character (\w, which includes
# the underscore) nor whitespace.
PUNCT_RE = re.compile(r"[^\w\s]")


def remove_punct(text):
    """Replace each character matched by PUNCT_RE with a single space."""
    return re.sub(PUNCT_RE, " ", text)

# Tokenizer: a run of word characters, or one single character that is neither
# a word character nor whitespace (so punctuation survives as its own token).
TOKEN_PUNC = re.compile(r"\w+|[^\w\s]")

# Language codes whose stop lists are merged when ``lg="all"``.
STOPWORD_LANGS = ('bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
                  'fi', 'fr', 'hu', 'it', 'lt', 'lv', 'mt', 'nl',
                  'pl', 'pt', 'ro', 'sk', 'sl', 'sv')

# Stop-word sets that were already built, keyed by the tuple of language codes.
_STOPWORD_CACHE = {}


def remove_stopwords(text, lg="all"):
    """Drop stop words from ``text`` while keeping punctuation tokens.

    The text is split with TOKEN_PUNC, tokens found in the stop list are
    discarded, and the survivors are joined with single spaces. Tokens are
    compared verbatim (no case folding), so a capitalised word is only removed
    if that exact spelling is in the stop list.

    Parameters
    ----------
    text : str
        Text to filter.
    lg : str or iterable of str, default "all"
        ``"all"`` merges the stop lists of every code in STOPWORD_LANGS.
        Any other string is treated as one language code, and any other
        iterable as a collection of language codes.

    Returns
    -------
    str
        The remaining tokens separated by single spaces.
    """
    if lg == "all":
        langs = STOPWORD_LANGS
    elif isinstance(lg, str):
        langs = (lg,)
    else:
        langs = tuple(lg)

    # Build the lookup set once per language selection. Rebuilding it for
    # every token (as ``T not in set(stoplist)`` did) made each call pay the
    # cost of copying the whole stop list once per token.
    stop = _STOPWORD_CACHE.get(langs)
    if stop is None:
        stop = frozenset(stopwordsiso.stopwords(list(langs)))
        _STOPWORD_CACHE[langs] = stop

    return " ".join(tok for tok in TOKEN_PUNC.findall(text) if tok not in stop)

def compose(*funcs):
    """Chain text transforms into one callable.

    The returned function feeds its argument through ``funcs`` from left to
    right, then collapses every run of whitespace to a single space and strips
    leading and trailing whitespace from the result.
    """
    def pipeline(text):
        result = text
        for step in funcs:
            result = step(result)
        collapsed = re.sub(r"\s+", " ", result)
        return collapsed.strip()

    return pipeline

# Registry of named preprocessing recipes. Each key spells out the steps it
# applies, left to right:
#   DON = identity, LOW = lower-case, URLrem = delete URLs,
#   URLrep = replace URLs with <URL>, PUN = punctuation -> space,
#   RSW = remove stop words.
# Recipes built with compose() additionally collapse whitespace runs and strip
# the ends; the single-step recipes do not.
PREPROCESSORS = {
    "DON": don,
    "LOW": lower,
    "URLrem": remove_urls,
    "URLrep": replace_urls,
    "PUN": remove_punct,
    "RSW": remove_stopwords,
    "LOW+URLrem": compose(lower, remove_urls),
    "LOW+URLrep": compose(lower, replace_urls),
    "LOW+PUN": compose(lower, remove_punct),
    "LOW+URLrem+PUN": compose(lower, remove_urls, remove_punct),
    # Fixed: this recipe used remove_urls, which made it an exact duplicate of
    # "LOW+URLrem+PUN" instead of the URL-replacing variant its name promises.
    "LOW+URLrep+PUN": compose(lower, replace_urls, remove_punct),
    "LOW+URLrem+PUN+RSW": compose(lower, remove_urls, remove_punct, remove_stopwords),
    "LOW+URLrep+PUN+RSW": compose(lower, replace_urls, remove_punct, remove_stopwords),

}

# Quick sanity check of the stop-word filter on a mixed English/French sentence.
multi_string = "I am travelling to Nancy for an NLP course à l'Université de Lorraine"
print(remove_stopwords(multi_string))


I travelling Nancy NLP ' Université Lorraine


In [7]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from pathlib import Path
import numpy as np
import pandas as pd

# Seed reused by the classifier and by the cross-validation splitter.
SEED = 42
# File the results table is written to.
OUTFILE = Path("results_DiagLang.csv")

# Metrics reported for every run: plain accuracy and macro-averaged F1.
macro_f1 = make_scorer(f1_score, average="macro")
SCORING = dict(acc="accuracy", macro_f1=macro_f1)

def mean_scores(scores):
    """Average the per-fold test metrics of a cross-validation result.

    Parameters
    ----------
    scores : mapping
        Maps result keys to sequences of per-fold values. Only keys that start
        with ``"test_"`` are used; all others are ignored.

    Returns
    -------
    dict
        Metric name (the key without its leading ``"test_"``) mapped to the
        mean of its values as a plain ``float``.
    """
    prefix = "test_"
    # Slice the prefix off rather than using str.replace(): replace() also
    # deletes "test_" wherever it reappears inside the metric name
    # (e.g. "test_latest_f1" would come out as "laf1").
    return {key[len(prefix):]: float(np.mean(values))
            for key, values in scores.items() if key.startswith(prefix)}
    
# Classifier placed at the end of every evaluated pipeline.
MODEL = LogisticRegression(max_iter=2000, random_state=SEED)


def evaluate(X, y, preprocess, vectorizer):
    """Cross-validate ``vectorizer`` followed by MODEL on preprocessed texts.

    Parameters
    ----------
    X : iterable
        Raw samples; each one is passed through ``preprocess`` first.
    y : sequence
        Labels, also used to stratify the folds.
    preprocess : callable
        Applied to every sample before vectorization.
    vectorizer : estimator
        Used as the "vect" step of the pipeline.

    Returns
    -------
    dict
        Mean of each SCORING metric over the 5 shuffled, stratified folds,
        as produced by ``mean_scores``.
    """
    texts = list(map(preprocess, X))
    steps = [("vect", vectorizer), ("clf", MODEL)]
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    fold_scores = cross_validate(
        Pipeline(steps),
        texts,
        y,
        cv=splitter,
        scoring=SCORING,
        n_jobs=-1,
    )
    return mean_scores(fold_scores)

In [8]:
! unzip corpus_multi.zip
import json
with open("corpus_multi.json") as f:
    json_data = json.load(f)
    
X, y = [x[0] for x in json_data], [x[1] for x in json_data]
print("Samples:", len(X), "Labels:", sorted(set(y)))

Samples: 5984 Labels: ['bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi', 'fr', 'hu', 'it', 'lt', 'lv', 'mt', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sv']


In [None]:
# Options shared by every vectorizer: no lower-casing inside the vectorizer
# (case handling is left to the preprocessing recipe) and at most 1000 features.
_VEC_OPTS = {"lowercase": False, "max_features": 1000}

# Feature extractors under comparison. Keys read <weighting>_<unit>_<min-max n>;
# the commented-out entries are further variants that are currently disabled.
VECTORIZERS = {
    "count_word_1-1": CountVectorizer(analyzer="word", ngram_range=(1, 1), **_VEC_OPTS),
    #"count_word_1-2": CountVectorizer(analyzer="word", ngram_range=(1, 2), **_VEC_OPTS),
    "tfidf_word_1-1": TfidfVectorizer(analyzer="word", ngram_range=(1, 1), **_VEC_OPTS),
    #"tfidf_word_1-2": TfidfVectorizer(analyzer="word", ngram_range=(1, 2), **_VEC_OPTS),
    "count_char_3-5": CountVectorizer(analyzer="char", ngram_range=(3, 5), **_VEC_OPTS),
    #"count_charwb_3-5": CountVectorizer(analyzer="char_wb", ngram_range=(3, 5), **_VEC_OPTS),
    "tfidf_char_3-5": TfidfVectorizer(analyzer="char", ngram_range=(3, 5), **_VEC_OPTS),
    #"tfidf_charwb_3-5": TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), **_VEC_OPTS),
}

# Evaluate every (preprocessing, vectorizer) combination and collect one result
# row per combination.
rows = []
for prep_name, prep in PREPROCESSORS.items():
    # The preprocessed corpus depends only on the recipe, not on the
    # vectorizer, so compute it once here instead of once per vectorizer
    # (this matters most for the slow stop-word recipes).
    X_prep = [prep(t) for t in X]
    for vec_name, vec in VECTORIZERS.items():
        print(f"Prep: {prep_name:12s} | Vec: {vec_name}")
        # The texts are already preprocessed, so evaluate() is given the
        # identity step `don` and leaves them untouched.
        res = evaluate(X_prep, y, don, vec)
        print(res)
        rows.append({
            "preprocessing": prep_name,
            "vectorizer": vec_name,
            **res
        })

Prep: DON          | Vec: count_word_1-1


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.9953210786162732, 'macro_f1': 0.995342399461625}
Prep: DON          | Vec: tfidf_word_1-1


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.9886358873773062, 'macro_f1': 0.9891118793832032}
Prep: DON          | Vec: count_char_3-5


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.9944853773229058, 'macro_f1': 0.994486507350121}
Prep: DON          | Vec: tfidf_char_3-5


  return _ForkingPickler.loads(res)


{'acc': 0.9847923878816328, 'macro_f1': 0.9857163015072723}
Prep: LOW          | Vec: count_word_1-1
{'acc': 0.9949869098610516, 'macro_f1': 0.9950217483866263}
Prep: LOW          | Vec: tfidf_word_1-1
{'acc': 0.986797400412961, 'macro_f1': 0.9874340971703981}
Prep: LOW          | Vec: count_char_3-5


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.9944856567282196, 'macro_f1': 0.9945068442510328}
Prep: LOW          | Vec: tfidf_char_3-5


  return _ForkingPickler.loads(res)


{'acc': 0.9849594722592434, 'macro_f1': 0.9858673301946524}
Prep: URLrem       | Vec: count_word_1-1
{'acc': 0.9954881629938839, 'macro_f1': 0.9955067882845048}
Prep: URLrem       | Vec: tfidf_word_1-1
{'acc': 0.9884688029996955, 'macro_f1': 0.9889435147803021}
Prep: URLrem       | Vec: count_char_3-5


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.9948196857807842, 'macro_f1': 0.9948291108745428}
Prep: URLrem       | Vec: tfidf_char_3-5


  return _ForkingPickler.loads(res)


{'acc': 0.9849596119619003, 'macro_f1': 0.9858606952933242}
Prep: URLrep       | Vec: count_word_1-1
{'acc': 0.9951539942386625, 'macro_f1': 0.9951918901629229}
Prep: URLrep       | Vec: tfidf_word_1-1
{'acc': 0.9886358873773062, 'macro_f1': 0.9891118793832032}
Prep: URLrep       | Vec: count_char_3-5


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.9946526014031735, 'macro_f1': 0.9946500659610583}
Prep: URLrep       | Vec: tfidf_char_3-5


  return _ForkingPickler.loads(res)


{'acc': 0.984625443206679, 'macro_f1': 0.9855457175589917}
Prep: PUN          | Vec: count_word_1-1
{'acc': 0.9953210786162732, 'macro_f1': 0.995342399461625}
Prep: PUN          | Vec: tfidf_word_1-1
{'acc': 0.9886358873773062, 'macro_f1': 0.9891118793832032}
Prep: PUN          | Vec: count_char_3-5


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.9951538545360055, 'macro_f1': 0.9951701013635319}
Prep: PUN          | Vec: tfidf_char_3-5


  return _ForkingPickler.loads(res)


{'acc': 0.9852937807171216, 'macro_f1': 0.9861507441560459}
Prep: RSW          | Vec: count_word_1-1


In [13]:
# Rank every (preprocessing, vectorizer) combination, best macro-F1 first.
# Open question: if "RSW" improves the results, is it because of English?
df = (
    pd.DataFrame(rows)
    .sort_values(by="macro_f1", ascending=False)
)
print(df)

# Persist the ranked table without the index column.
df.to_csv(OUTFILE, index=False)
print(f"Saved to: {OUTFILE}")

   preprocessing      vectorizer       acc  macro_f1
16           RSW  count_word_1-1  0.996658  0.996659
8            URL  count_word_1-1  0.995488  0.995507
0            DON  count_word_1-1  0.995321  0.995342
12           PUN  count_word_1-1  0.995321  0.995342
14           PUN  count_char_3-5  0.995154  0.995170
4            LOW  count_word_1-1  0.994987  0.995022
10           URL  count_char_3-5  0.994820  0.994829
6            LOW  count_char_3-5  0.994486  0.994507
2            DON  count_char_3-5  0.994485  0.994487
17           RSW  tfidf_word_1-1  0.994151  0.994262
1            DON  tfidf_word_1-1  0.988636  0.989112
13           PUN  tfidf_word_1-1  0.988636  0.989112
9            URL  tfidf_word_1-1  0.988469  0.988944
5            LOW  tfidf_word_1-1  0.986797  0.987434
15           PUN  tfidf_char_3-5  0.985294  0.986151
7            LOW  tfidf_char_3-5  0.984959  0.985867
11           URL  tfidf_char_3-5  0.984960  0.985861
3            DON  tfidf_char_3-5  0.984792  0.