# Ocena modeli dystrybucyjnych dla korpusu pełnego i wzorcowego

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from src.settings import POLISH_ANNOTATIONS_FPATH, DATA_DIR

In [2]:
RANDOM_STATE = 42

EMB_VECTOR_SIZE = 100
EMB_WINDOW = 5
VOCAB_MIN_COUNT = 5

## Zbiór danych do klasyfikacji (tekst - numer zadania), korpus wzorcowy

In [3]:
korpus_wzorcowy = pd.read_json(POLISH_ANNOTATIONS_FPATH, lines=True)
korpus_wzorcowy

Unnamed: 0,start,end,text,doc_filepath,video_filename,task_label
0,39040,41120,"Myślę, że mam inny pomysł, można?",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
1,41120,49680,"Chyba ten znak mówi, że jak ktoś będzie spacer...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
2,49680,55280,Trzeba przejść łukiem obok leżącego i o tym zn...,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
3,61280,66840,"Okrągły znak pomaga nam, mówi, że są pasy na u...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
4,66840,73200,"Jak ktoś zobaczy, ale zignoruje ten znak, to m...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
...,...,...,...,...,...,...
40350,248960,250320,[uderzanie w coś],/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13
40351,279560,291400,Kot zobaczył w akwarium rybkę. Podszedł i dał ...,/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13
40352,291400,347120,"Zjadłam, zjadłam.",/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13
40353,347120,356440,"Zając biegnie, zobaczył wiszące pranie. Wskocz...",/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13


In [4]:
target_labels = korpus_wzorcowy["task_label"].astype(str).unique().tolist()
target_labels

['15', '24', '7', '17', '8', '13']

In [5]:
korpus_wzorcowy["text"].isna().sum(), korpus_wzorcowy["task_label"].isna().sum()

(0, 0)

In [6]:
import string
import token
import numpy as np
from nltk.tokenize import word_tokenize


def preprocess(text, stopwords):
    text = text.lower()
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text, language="polish")
    tokens = [word for word in tokens if word not in stopwords]
    if not any(tokens):
        return np.nan
    return " ".join(tokens)

with open(DATA_DIR / "polish.stopwords.txt", "r") as f:
    stop_words = set(f.readlines())

korpus_wzorcowy_preprocessed = korpus_wzorcowy.copy()
korpus_wzorcowy_preprocessed["text"] = korpus_wzorcowy_preprocessed["text"].apply(preprocess, args=(stop_words,))

korpus_wzorcowy_preprocessed["text"][5]

'ktoś biega bez świadomości i może spowodować wypadek a znak nam pomaga przypomina żeby uważać'

In [7]:
korpus_wzorcowy_preprocessed.isna().sum()

start             0
end               0
text              2
doc_filepath      0
video_filename    0
task_label        0
dtype: int64

In [8]:
pl_data_preprocessed = korpus_wzorcowy_preprocessed.dropna()
pl_data_preprocessed.shape

(40353, 6)

In [9]:
wzorcowy_X_train, wzorcowy_X_test, wzorcowy_y_train, wzorcowy_y_test = train_test_split(
    pl_data_preprocessed['text'],
    pl_data_preprocessed['task_label'],
    test_size=0.2,
    stratify=pl_data_preprocessed['task_label'],
    random_state=RANDOM_STATE
)

In [10]:
wzorcowy_X_train.shape, wzorcowy_X_test.shape, wzorcowy_y_train.shape, wzorcowy_y_test.shape

((32282,), (8071,), (32282,), (8071,))

In [11]:
wzorcowy_train_tokens = wzorcowy_X_train.str.split().explode().tolist()

In [12]:
from gensim.models import Word2Vec

# sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
wzorcowy_skipgram = Word2Vec(wzorcowy_train_tokens, sg=1, vector_size=EMB_VECTOR_SIZE, window=EMB_WINDOW, min_count=VOCAB_MIN_COUNT, workers=4)

In [13]:
wzorcowy_cbow = Word2Vec(wzorcowy_train_tokens, sg=0, vector_size=EMB_VECTOR_SIZE, window=EMB_WINDOW, min_count=VOCAB_MIN_COUNT, workers=4)

In [14]:
def vectorize(sentence, w2v_model):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

wzorcowy_X_train_skipgram = np.array([vectorize(sentence, w2v_model=wzorcowy_skipgram) for sentence in wzorcowy_X_train])
wzorcowy_X_test_skipgram = np.array([vectorize(sentence, w2v_model=wzorcowy_skipgram) for sentence in wzorcowy_X_test])

In [15]:
wzorcowy_X_train_cbow = np.array([vectorize(sentence, w2v_model=wzorcowy_cbow) for sentence in wzorcowy_X_train])
wzorcowy_X_test_cbow = np.array([vectorize(sentence, w2v_model=wzorcowy_cbow) for sentence in wzorcowy_X_test])

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# wzorcowy_skipgram_clf = LogisticRegression()
wzorcowy_skipgram_clf = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
wzorcowy_skipgram_clf.fit(wzorcowy_X_train_skipgram, wzorcowy_y_train)

In [17]:
# wzorcowy_cbow_clf = LogisticRegression()
wzorcowy_cbow_clf = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
wzorcowy_cbow_clf.fit(wzorcowy_X_train_cbow, wzorcowy_y_train)

In [18]:
from pprint import pprint
from sklearn.metrics import classification_report

def evaluate_clf(clf, X_test, y_test, target_names=None):
    y_pred = clf.predict(X_test)
    return classification_report(y_test, y_pred, output_dict=False, target_names=target_names, zero_division=0, digits=4)

skipgram_clf_report = evaluate_clf(wzorcowy_skipgram_clf, wzorcowy_X_test_skipgram, wzorcowy_y_test, target_names=target_labels)
print(skipgram_clf_report)

              precision    recall  f1-score   support

          15     0.6519    0.0650    0.1183      1814
          24     0.3231    0.0184    0.0348      1143
           7     0.0000    0.0000    0.0000       997
          17     0.3252    0.8510    0.4706      2410
           8     0.3015    0.3491    0.3236      1266
          13     0.4808    0.0567    0.1014       441

    accuracy                         0.3292      8071
   macro avg     0.3471    0.2234    0.1748      8071
weighted avg     0.3629    0.3292    0.2283      8071



In [19]:
cbow_clf_report = evaluate_clf(wzorcowy_cbow_clf, wzorcowy_X_test_cbow, wzorcowy_y_test, target_names=target_labels)
print(cbow_clf_report)

              precision    recall  f1-score   support

          15     0.6667    0.0650    0.1185      1814
          24     0.3088    0.0184    0.0347      1143
           7     0.0000    0.0000    0.0000       997
          17     0.3246    0.8506    0.4699      2410
           8     0.3003    0.3476    0.3222      1266
          13     0.5000    0.0522    0.0945       441

    accuracy                         0.3286      8071
   macro avg     0.3501    0.2223    0.1733      8071
weighted avg     0.3649    0.3286    0.2276      8071

