# Baseline Classifier

In [1]:
import numpy as np

# Seed NumPy's legacy global random generator so NumPy-based randomness is repeatable.
SEED = 42
np.random.seed(SEED)

### Loading the dataset

In [2]:
from datasets import load_dataset

In [3]:
# Load the dataset by its Hugging Face identifier; the result is indexed by split name
# ("train", "test") further down.
DATASET_ID = "paul-ww/ei-abstract-significance"
ds = load_dataset(DATASET_ID)

Found cached dataset parquet (/Users/paul/.cache/huggingface/datasets/paul-ww___parquet/paul-ww--ei-abstract-significance-1c087dddb8b05c98/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Label metadata of the training split, plus lookup tables in both directions
# (label name -> integer id, integer id -> label name).
class_labels = ds["train"].features["label"]
label2id = dict(zip(class_labels.names, map(class_labels.str2int, class_labels.names)))
id2label = {label_id: label_name for label_name, label_id in label2id.items()}

### Tracking with Weights & Biases

In [5]:
%env WANDB_LOG_MODEL='end'
%env WANDB_WATCH='all'

env: WANDB_LOG_MODEL='end'
env: WANDB_WATCH='all'


In [6]:
import wandb

# Authenticate with Weights & Biases, then open the run object used by the later cells.
wandb.login()
run = wandb.init(
    group="baseline",
    project="significance_classification",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpaul_ww[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Model Setup

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Baseline model: TF-IDF features fed into a logistic-regression classifier.
# The step names ("vec", "clf") must stay as they are: hyper-parameter keys of the
# form "<step>__<param>" refer to them.
pipeline_steps = [
    ("vec", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=5000)),
]
pipe = Pipeline(steps=pipeline_steps)

In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters; each key addresses a pipeline step as "<step>__<param>".
param_grid = {
    "vec__min_df": [1, 3, 5],
    "vec__stop_words": ["english", None],
    "vec__ngram_range": [(1, 3)],
    "clf__C": [0.01, 0.1, 1, 10],
}

# Exhaustive search over the grid, scored by macro-averaged F1; with refit=True the
# best candidate is refitted and exposed as `grid.best_estimator_`.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="f1_macro",
    refit=True,
    verbose=1,
)

In [9]:
import pandas as pd

def _split_to_frame(split):
    """Return `split` as a DataFrame whose "label" column holds label names instead of ids.

    The ids are translated with the split's own `features["label"].int2str`.
    """
    frame = split.to_pandas()
    frame["label"] = frame["label"].apply(split.features["label"].int2str)
    return frame


df_train = _split_to_frame(ds["train"])
df_test = _split_to_frame(ds["test"])

In [10]:
import joblib

# Run the cross-validated search with joblib's thread-based backend; n_jobs=-1 asks
# joblib to use every available CPU.
X_train, y_train = df_train["text"], df_train["label"]
with joblib.parallel_backend("threading", n_jobs=-1):
    grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [11]:
# Tabulate the per-candidate search results and display the ten best-ranked ones.
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values(by="rank_test_score").iloc[:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_vec__min_df,param_vec__ngram_range,param_vec__stop_words,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
23,2.121181,0.563551,0.082739,0.015859,10,5,"(1, 3)",,"{'clf__C': 10, 'vec__min_df': 5, 'vec__ngram_r...",0.632143,0.682658,0.623744,0.654688,0.602947,0.639236,0.027318,1
21,5.082139,0.704263,0.185801,0.066605,10,3,"(1, 3)",,"{'clf__C': 10, 'vec__min_df': 3, 'vec__ngram_r...",0.628049,0.672697,0.620026,0.632764,0.614456,0.633598,0.020547,2
22,3.523663,0.343828,0.104311,0.046033,10,5,"(1, 3)",english,"{'clf__C': 10, 'vec__min_df': 5, 'vec__ngram_r...",0.613865,0.584179,0.611725,0.607946,0.556912,0.594926,0.021776,3
20,5.614729,0.702846,0.177253,0.099533,10,3,"(1, 3)",english,"{'clf__C': 10, 'vec__min_df': 3, 'vec__ngram_r...",0.572941,0.61065,0.584342,0.631295,0.560325,0.591911,0.025757,4
19,13.393346,1.456073,0.273921,0.109515,10,1,"(1, 3)",,"{'clf__C': 10, 'vec__min_df': 1, 'vec__ngram_r...",0.580836,0.540453,0.571559,0.541956,0.545511,0.556063,0.01678,5
17,6.810039,1.236699,0.426183,0.229583,1,5,"(1, 3)",,"{'clf__C': 1, 'vec__min_df': 5, 'vec__ngram_ra...",0.476885,0.520882,0.511866,0.479108,0.473122,0.492372,0.019896,6
18,15.529131,1.237353,0.335264,0.070735,10,1,"(1, 3)",english,"{'clf__C': 10, 'vec__min_df': 1, 'vec__ngram_r...",0.472652,0.459765,0.520882,0.51395,0.469546,0.487359,0.025003,7
15,4.746362,1.028105,0.462631,0.139592,1,3,"(1, 3)",,"{'clf__C': 1, 'vec__min_df': 3, 'vec__ngram_ra...",0.465348,0.47379,0.464386,0.463726,0.466463,0.466743,0.003643,8
16,3.571849,1.255202,0.468867,0.269643,1,5,"(1, 3)",english,"{'clf__C': 1, 'vec__min_df': 5, 'vec__ngram_ra...",0.442881,0.456271,0.465348,0.470357,0.452866,0.457545,0.009629,9
14,4.488644,1.100817,0.261957,0.130177,1,3,"(1, 3)",english,"{'clf__C': 1, 'vec__min_df': 3, 'vec__ngram_ra...",0.444145,0.408377,0.446719,0.428597,0.435976,0.432763,0.01376,10


In [12]:
# The grid search was configured with refit=True, so `best_estimator_` is the pipeline
# refitted with the best-scoring parameter combination.
best_pipe = grid.best_estimator_

In [13]:
from pathlib import Path

# Serialise the fitted pipeline into a sub-directory of the W&B run directory.
# The cell's last expression shows the list of files joblib wrote.
output_path = Path(run.dir, "model_finetuned")
output_path.mkdir(parents=True, exist_ok=True)
joblib.dump(best_pipe, output_path / "baseline.joblib")

['/Users/paul/Documents/HPI/nge-browser-data-integration/classification/training/wandb/run-20230621_112410-57b9sxjo/files/model_finetuned/baseline.joblib']

#### Evaluation

In [14]:
# `predict_proba` orders its columns like `best_pipe.classes_`. The pipeline was fitted on
# label *names*, so that order is the sorted order of the names, which need not match the
# order of the dataset's integer label ids. Re-order the columns so that column i holds the
# probability of `class_labels.names[i]`; this is a no-op when both orders already agree.
# A KeyError here means a dataset label never occurred in the training data.
proba_by_class = best_pipe.predict_proba(df_test["text"])
column_of = {name: col for col, name in enumerate(best_pipe.classes_)}
y_pred_proba = proba_by_class[:, [column_of[name] for name in class_labels.names]]

In [15]:
from classification.utils import log_metrics_to_wandb

# Pass the predicted test-set probabilities, the integer ground-truth labels and the
# label metadata to the project's `log_metrics_to_wandb` helper for the active run.
y_true_num = ds["test"]["label"]
log_metrics_to_wandb(
    run=run,
    y_pred_proba=y_pred_proba,
    y_true_num=y_true_num,
    labels=class_labels.names,
    id2label=id2label,
)