<a href="https://colab.research.google.com/github/jacksonliang35/Polish-POS-Tagging/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This notebook predicts the POS tag of a word from its neighboring words (a fixed-size context window).

In [None]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/UIUC/pos_tagging/Code


In [None]:
# Work from the project's Code folder: the corpus files are later opened with
# paths relative to it ('../Data/...').
%cd /content/drive/MyDrive/UIUC/pos_tagging/Code

/content/drive/MyDrive/UIUC/pos_tagging/Code


In [None]:
## Imports
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

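The following function parses an XML corpus file into a list of lists (one inner list of tokens per chunk), together with a parallel list of lists holding the labels.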

In [None]:
def read_data(path, dataType='orth', labelType='pos'):
    """Parse an XML corpus into parallel per-chunk lists of tokens and labels.

    Parameters
    ----------
    path : anything accepted by ``xml.etree.ElementTree.parse``.
    dataType : 'orth' reads each token's <orth> text, 'lemma' reads the
        <base> text of the token's first <lex> element. The text is
        lower-cased in both cases.
    labelType : 'pos' keeps only the part of <lex>/<ctag> before the first
        ':', 'ctag' keeps the whole tag string.

    Returns
    -------
    (data, label) : two lists of lists with one inner list per child of the
        XML root. Children whose tag is 'ns' are skipped.
    """
    assert dataType in ('orth', 'lemma')
    assert labelType in ('pos', 'ctag')
    use_surface_form = dataType == 'orth'
    keep_pos_only = labelType == 'pos'

    all_words, all_tags = [], []
    for chunk in ET.parse(path).getroot():
        words, tags = [], []
        for tok in chunk:
            # 'ns' elements carry no word/tag of their own
            if tok.tag == 'ns':
                continue
            if use_surface_form:
                form = tok.find('orth').text
            else:
                form = tok.find('lex').find('base').text
            words.append(form.lower())
            full_tag = tok.find('lex').find('ctag').text
            tags.append(full_tag.split(":")[0] if keep_pos_only else full_tag)
        all_words.append(words)
        all_tags.append(tags)
    return all_words, all_tags

The following function converts the lists of lists into a pandas DataFrame. Each row of the DataFrame holds a window of words, and the goal is to predict the POS tag of the word at the center of the window.

In [None]:
def lol2df(data, label, wind_hs=3, labelType='pos'):
    """Convert tokenised sentences into a DataFrame of context windows.

    Every word yields one row holding the ``2*wind_hs+1`` words centred on it
    (sentence edges are padded with '_') followed by its label column(s).

    Parameters
    ----------
    data : list of list of str
        One inner list of words per sentence.
    label : list of list of str
        Labels aligned with ``data``; each may be a bare POS tag or a full
        ':'-separated tag.
    wind_hs : int
        Half-size of the window (number of neighbours on each side).
    labelType : {'pos', 'ctag'}
        'pos'  -> a single 'class' column holding the part of the label
                  before the first ':'.
        'ctag' -> columns 'c1', 'c2', ... holding the ':'-separated parts.
                  At least four columns are produced; shorter tags are padded
                  with None and more columns are added when a tag has more
                  than four parts.

    Returns
    -------
    pandas.DataFrame
        Columns 'w0'..'w{2*wind_hs}' followed by the label column(s).
    """
    assert(labelType in ['pos', 'ctag'])
    assert(len(data)==len(label))
    n_words = 2*wind_hs+1
    pad = ['_'] * wind_hs
    # Collect windows and split labels separately so that the label part can
    # be normalised to a fixed width before the frame is built.
    windows = []
    label_parts = []
    for sent, sent_labels in zip(data, label):
        padded = pad + list(sent) + pad
        for t in range(len(sent)):
            windows.append(padded[t:t+n_words])
            label_parts.append(sent_labels[t].split(':'))
    word_cols = ['w'+str(i) for i in range(n_words)]
    if labelType == 'pos':
        # Only the leading part is the POS tag; dropping the rest keeps the
        # row width equal to the number of columns even for full tags.
        label_cols = ['class']
        label_rows = [parts[:1] for parts in label_parts]
    else:
        # Tags have a variable number of parts, so size the columns to the
        # longest tag (never fewer than the historical four) and pad the rest.
        n_parts = max([4] + [len(parts) for parts in label_parts])
        label_cols = ['c'+str(i+1) for i in range(n_parts)]
        label_rows = [parts + [None] * (n_parts - len(parts)) for parts in label_parts]
    records = [words + labs for words, labs in zip(windows, label_rows)]
    return pd.DataFrame(records, columns=word_cols + label_cols)

Reading files...

In [None]:
# Read the three corpus splits (default settings: lower-cased surface forms, coarse POS labels)
trdata, trlabel = read_data('../Data/train.xml')
valdata, vallabel = read_data('../Data/validate.xml')
testdata, testlabel = read_data('../Data/test.xml')

# Train and validation windows are pooled into one frame because model
# selection is done with 5-fold CV; the test split stays separate.
pooled_frames = [lol2df(trdata, trlabel), lol2df(valdata, vallabel)]
data = pd.concat(pooled_frames, ignore_index=True)
tdata = lol2df(testdata, testlabel)

In [None]:
# Show the first ten context windows with their labels
data.head(10)

Unnamed: 0,w0,w1,w2,w3,w4,w5,w6,class
0,_,_,_,zabiję,cię,",",jeśli,fin
1,_,_,zabiję,cię,",",jeśli,umrzesz,ppron12
2,_,zabiję,cię,",",jeśli,umrzesz,!,interp
3,zabiję,cię,",",jeśli,umrzesz,!,"""",comp
4,cię,",",jeśli,umrzesz,!,"""",_,fin
5,",",jeśli,umrzesz,!,"""",_,_,interp
6,jeśli,umrzesz,!,"""",_,_,_,interp
7,_,_,_,cieszy,fakt,",",że,fin
8,_,_,cieszy,fakt,",",że,royal,subst
9,_,cieszy,fakt,",",że,royal,canin,interp


Pre-processing...

In [None]:
## Map the string POS tags to integer class ids
## (the fitted encoder is kept so the test labels can be mapped the same way)
label_enc = LabelEncoder()
label_enc.fit(data['class'])
y = label_enc.transform(data['class'])

In [None]:
## Naive word -> integer encoding of the window features
word_matrix = data.drop(columns=['class']).values
flat_words = word_matrix.ravel()
# One encoder is fitted over every cell so a word gets the same id in all
# window positions. Using LabelEncoder on features is not good practice in
# general (it imposes an arbitrary ordering on the words).
data_enc = LabelEncoder()
data_enc.fit(flat_words)
X = data_enc.transform(flat_words).reshape(word_matrix.shape)

In [None]:
y_test = label_enc.transform(tdata['class'])
X_test = tdata.drop(columns='class').values
# flatten() always returns a copy, so the in-place replacement below can never
# write through to the array backing `tdata` (ravel() may return a view).
X_test_r = X_test.flatten()

# Words in the test windows that the feature encoder never saw while fitting.
# A plain set difference is enough; no throwaway LabelEncoder is needed.
new_test_data = set(X_test_r) - set(data_enc.classes_)
unseen_mask = np.fromiter((w in new_test_data for w in X_test_r),
                          dtype=bool, count=X_test_r.size)
# The encoder cannot transform unseen words, so each one is replaced by a
# randomly drawn seen word. A locally seeded generator keeps the test features
# (and therefore the reported test scores) identical across runs.
rng = np.random.default_rng(0)
X_test_r[unseen_mask] = rng.choice(data_enc.classes_, size=int(unseen_mask.sum()))
X_test = data_enc.transform(X_test_r).reshape(X_test.shape)

The following wrapper evaluates a classifier with k-fold cross-validation (5 folds by default).

In [None]:
def testClassifiers(clf, X, y, cv=5):
    """Cross-validate ``clf`` and return its mean micro-averaged scores.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
        on every fold.
    X, y : arrays indexable by integer index arrays (features and labels).
    cv : number of folds passed to ``KFold`` (default 5).

    Returns
    -------
    (f1, precision, recall) : the mean of each micro-averaged score over the
        folds, in that order.
    """
    fold_scores = {"f1": [], "precision": [], "recall": []}
    splitter = KFold(n_splits=cv)
    for fit_idx, held_out_idx in splitter.split(X):
        ## Fit on the training part of the fold, predict the held-out part
        clf.fit(X[fit_idx], y[fit_idx])
        y_true = y[held_out_idx]
        y_hat = clf.predict(X[held_out_idx])
        ## Record this fold's metrics
        fold_scores["f1"].append(f1_score(y_true, y_hat, average="micro"))
        fold_scores["precision"].append(precision_score(y_true, y_hat, average="micro"))
        fold_scores["recall"].append(recall_score(y_true, y_hat, average="micro"))
    mean_f1 = np.mean(fold_scores["f1"])
    mean_precision = np.mean(fold_scores["precision"])
    mean_recall = np.mean(fold_scores["recall"])
    return mean_f1, mean_precision, mean_recall

In [None]:
# Validation: cross-validate each baseline (5 folds by default) on the pooled train+validation windows
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

names = ["Gaussian Naive Bayes", "Decision Tree"]
classifiers = [GaussianNB(), DecisionTreeClassifier()]
for idx, name in enumerate(names):
    clf = classifiers[idx]
    # Announce the model first: the cross-validation below can take a while
    print("Current classifier:", name)
    f, p, r = testClassifiers(clf, X, y)
    print("Average precision:", p)
    print("Average recall:", r)
    print("Average f1 score:", f)
    print("")

Current classifier: Gaussian Naive Bayes
Average precision: 0.38761566933991365
Average recall: 0.38761566933991365
Average f1 score: 0.38761566933991365

Current classifier: Decision Tree
Average precision: 0.8544324491054904
Average recall: 0.8544324491054904
Average f1 score: 0.8544324491054904



In [None]:
# Testing: refit each baseline on all pooled training windows, then score it on the test windows
names = ["Gaussian Naive Bayes", "Decision Tree"]
classifiers = [GaussianNB(), DecisionTreeClassifier()]
for idx, name in enumerate(names):
    clf = classifiers[idx]
    print("Current classifier:", name)
    clf.fit(X, y)
    y_pred = clf.predict(X_test)
    ## Micro-averaged metrics on the test set
    test_precision = precision_score(y_test, y_pred, average="micro")
    test_recall = recall_score(y_test, y_pred, average="micro")
    test_f1 = f1_score(y_test, y_pred, average="micro")
    print("Test precision:", test_precision)
    print("Test recall:", test_recall)
    print("Test f1 score:", test_f1)
    print("")

Current classifier: Gaussian Naive Bayes
Test precision: 0.3890446373804613
Test recall: 0.3890446373804613
Test f1 score: 0.38904463738046136

Current classifier: Decision Tree
Test precision: 0.8528197338141639
Test recall: 0.8528197338141639
Test f1 score: 0.8528197338141639

