# NLP Lab â€“ Minimal Preprocessing Experiments (Single Task)

This notebook runs a tiny grid of preprocessing experiments on **one** supervised text classification task
and saves results to `results_single_task.csv`.


In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score


In [2]:
SEED = 42
OUTFILE = Path("results_single_task_3_classes.csv")

macro_f1 = make_scorer(f1_score, average="macro")
SCORING = {"acc": "accuracy", "macro_f1": macro_f1}

def mean_scores(scores):
    return {k.replace("test_", ""): float(np.mean(v))
            for k, v in scores.items() if k.startswith("test_")}


In [3]:
# --- Preprocessing (kept simple) ---

!pip install stopwordsiso
import stopwordsiso

def don(text): 
    return text
    
def lower(text): 
    return text.lower()

URL_RE = re.compile(r"https?://\S+|www\.\S+")
def remove_urls(text): 
    return URL_RE.sub(" ", text)
    
def replace_urls(text): 
    return URL_RE.sub(" <URL> ", text)
    
PUNCT_RE = re.compile(r"[^\w\s]")
def remove_punct(text): 
    return PUNCT_RE.sub(" ", text)

TOKEN_PUNC = re.compile(r"\w+|[^\w\s]")
def remove_stopwords(text):
    #very slow but keeps punctuation
    
    return " ".join([T for T in TOKEN_PUNC.findall(text) if T not in set(stopwordsiso.stopwords("en"))])

    
def compose(*funcs):
    def f(text):
        for fn in funcs:
            text = fn(text)
        return re.sub(r"\s+", " ", text).strip()
    return f

PREPROCESSORS = {
    "DON": don,
    "LOW": lower,
    "URLrem": remove_urls,
    "URLrep": replace_urls,
    "PUN": remove_punct,
    "RSW": remove_stopwords,
    "LOW+URLrem": compose(lower, remove_urls),
    "LOW+URLrep": compose(lower, replace_urls),
    "LOW+PUN": compose(lower, remove_punct),
    "LOW+URLrem+PUN": compose(lower, remove_urls, remove_punct),
    "LOW+URLrep+PUN": compose(lower, remove_urls, remove_punct),
    "LOW+URLrem+PUN+RSW": compose(lower, remove_urls, remove_punct, remove_stopwords),
    "LOW+URLrep+PUN+RSW": compose(lower, replace_urls, remove_punct, remove_stopwords),

}
toto = "I am travelling to Nancy for an NLP course at IDMC :https://idmc.univ-lorraine.fr/" 
for prep_name, prep in PREPROCESSORS.items():
    print(f"Prep: {prep_name}")
    print(prep(toto))


Prep: DON
I am travelling to Nancy for an NLP course at IDMC :https://idmc.univ-lorraine.fr/
Prep: LOW
i am travelling to nancy for an nlp course at idmc :https://idmc.univ-lorraine.fr/
Prep: URLrem
I am travelling to Nancy for an NLP course at IDMC : 
Prep: URLrep
I am travelling to Nancy for an NLP course at IDMC : <URL> 
Prep: PUN
I am travelling to Nancy for an NLP course at IDMC  https   idmc univ lorraine fr 
Prep: RSW
I travelling Nancy NLP IDMC : https : / / idmc . univ - lorraine . /
Prep: LOW+URLrem
i am travelling to nancy for an nlp course at idmc :
Prep: LOW+URLrep
i am travelling to nancy for an nlp course at idmc : <URL>
Prep: LOW+PUN
i am travelling to nancy for an nlp course at idmc https idmc univ lorraine fr
Prep: LOW+URLrem+PUN
i am travelling to nancy for an nlp course at idmc
Prep: LOW+URLrep+PUN
i am travelling to nancy for an nlp course at idmc
Prep: LOW+URLrem+PUN+RSW
travelling nancy nlp idmc
Prep: LOW+URLrep+PUN+RSW
travelling nancy nlp idmc URL


  import pkg_resources


In [4]:
# 20 news groups to test, 3 will be enough for our purpose
# ref : https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
newsgroups_train = fetch_20newsgroups(subset='train')
print(list(newsgroups_train.target_names))
print("")
cats = ["comp.graphics", "sci.space","sci.med"]
data = fetch_20newsgroups(
    subset="all",
    categories=cats,
    remove=("headers", "footers", "quotes"),
)
X, y = data.data, data.target
print("Samples:", len(X), "Classes:", set(y), "Labels:", data.target_names)


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

Samples: 2950 Classes: {0, 1, 2} Labels: ['comp.graphics', 'sci.med', 'sci.space']


In [5]:
MODEL = LogisticRegression(max_iter=2000, random_state=SEED)

def evaluate(X, y, preprocess, vectorizer):
    Xp = [preprocess(t) for t in X]
    pipe = Pipeline([
        ("vect", vectorizer),
        ("clf", MODEL),
    ])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    scores = cross_validate(pipe, Xp, y, cv=cv, scoring=SCORING, n_jobs=-1)
    return mean_scores(scores)


In [None]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", UserWarning)


VECTORIZERS = {
    "count_word_1-1": CountVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False),
    #"count_word_1-2": CountVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False),
    "tfidf_word_1-1": TfidfVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False),
    #"tfidf_word_1-2": TfidfVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False),
    "count_char_3-5": CountVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False),
    #"count_charwb_3-5": CountVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False),
    "tfidf_char_3-5": TfidfVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False),
    #"tfidf_charwb_3-5": TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False),
}

rows = []
for prep_name, prep in PREPROCESSORS.items():
    for vec_name, vec in VECTORIZERS.items():
        print(f"Prep: {prep_name:12s} | Vec: {vec_name}")
        res = evaluate(X, y, prep, vec)
        print(res)
        rows.append({
            "preprocessing": prep_name,
            "vectorizer": vec_name,
            **res
        })




Prep: DON          | Vec: count_word_1-1


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.8522033898305084, 'macro_f1': 0.852437380580561}
Prep: DON          | Vec: tfidf_word_1-1


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


{'acc': 0.8884745762711864, 'macro_f1': 0.8887987585640236}
Prep: DON          | Vec: count_char_3-5


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


In [None]:
df = pd.DataFrame(rows).sort_values("macro_f1", ascending=False)
print(df)

In [None]:
df.to_csv(OUTFILE, index=False)
print(f"Saved to: {OUTFILE}")


In [None]:
full_data = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)
X, y = full_data.data, full_data.target
print("Samples:", len(X), "Classes:", set(y), "Labels:", full_data.target_names)

VECTORIZERS = {
    #"count_word_1-1": CountVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False),
    #"count_word_1-2": CountVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False),
    "tfidf_word_1-1": TfidfVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False),
    #"tfidf_word_1-2": TfidfVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False),
    #"count_char_3-5": CountVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False),
    #"count_charwb_3-5": CountVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False),
    "tfidf_char_3-5": TfidfVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False),
    #"tfidf_charwb_3-5": TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False),
}

rows_full = []
for prep_name, prep in PREPROCESSORS.items():
    for vec_name, vec in VECTORIZERS.items():
        print(f"Prep: {prep_name:12s} | Vec: {vec_name}")
        res = evaluate(X, y, prep, vec)
        print(res)
        rows_full.append({
            "preprocessing": prep_name,
            "vectorizer": vec_name,
            **res
        })

In [None]:
df = pd.DataFrame(rows_full).sort_values("macro_f1", ascending=False)
print(df)