<a href="https://colab.research.google.com/github/krfis/clthesis/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations & imports

In [1]:
! [ -d machamp ] || git clone --quiet https://github.com/machamp-nlp/machamp.git  # clones machamp unless a checkout already exists, so the cell can be re-run

In [2]:
# NOTE: every `!` command runs in its own temporary subshell, so this `cd` does not change the working directory seen by later cells
! cd machamp

In [3]:
! grep "requirements" machamp/README.md  # print the README line(s) that mention the requirements file

pip3 install --user -r requirements.txt


In [4]:
#! nvidia-smi  # gpu status

In [5]:
# Install the packages listed in the clone's requirements.txt; the `cd` is chained in the same `!` line because it would not persist across separate `!` commands
! cd machamp; pip3 install --user -r requirements.txt --quiet  # installs dependencies (quietly, as a --user install)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.2/594.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for jsonnet (setup.py) ... [?25l[?25hdone


In [6]:
import os

# `! export ...` only changes the environment of a throwaway subshell, so the
# kernel and every later `!` command never saw the extra PATH entry.  Updating
# os.environ modifies the kernel's own environment, which later subprocesses
# inherit.
local_bin = "/root/.local/bin"
current_path = os.environ.get("PATH", "")
if local_bin not in current_path.split(os.pathsep):  # keeps the cell idempotent on re-runs
  os.environ["PATH"] = f"{current_path}{os.pathsep}{local_bin}" if current_path else local_bin

In [7]:
! pip install conllu --quiet
! pip install conll-df --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for conll-df (setup.py) ... [?25l[?25hdone


In [8]:
import os
import random
import transformers
import pandas as pd

from conllu import parse
from conll_df import conll_df

In [9]:
#! rm -r /content/CLthesis  # first remove clone if necessary

In [10]:
! [ -d CLthesis ] || git clone --quiet https://github.com/krfis/clthesis.git CLthesis  # clones the data repo into CLthesis/ (the name the later cells use), unless it already exists

# Helper functions

In [11]:
def count_tokens(dataset):
  '''Count the tokens of a parsed conllu dataset.

  dataset : iterable of sentences, each of which is an iterable of tokens
  returns the total number of tokens over all sentences : int'''

  # One unit per (sentence, token) pair; the token values are never inspected.
  return sum(1 for sentence in dataset for _token in sentence)

In [12]:
def process(path):
  '''Read a conllu file and parse its contents.

  path : location of the UTF-8 encoded conllu file
  returns whatever conllu's parse() yields for the file text : list(TokenList)'''

  with open(path, mode="r", encoding="utf-8") as conllu_file:
    raw_text = conllu_file.read()
  # The text is fully read at this point, so parsing does not need the open handle.
  return parse(raw_text)

In [13]:
def write(content):
  '''Serialize every sentence in content and save the result to output.conllu.

  content : iterable of objects exposing serialize() (e.g. TokenList)
  The file is opened relative to the current working directory in "w" mode,
  so an existing output.conllu is overwritten.'''

  with open("output.conllu", "w", encoding="utf-8") as out_file:
    out_file.writelines(sentence.serialize() for sentence in content)

# Models

In [14]:
# old: fully qualified Hugging Face Hub IDs (organisation/model), kept for reference only
#models = {
#    "xlm-r" : "FacebookAI/xlm-roberta-base",
#    "scandibert" : "vesteinn/ScandiBERT-no-faroese",
#    "icebert" : "mideind/IceBERT",
#    "norbert" : "patrickvonplaten/norwegian-roberta-base",
#    "swebert" : "birgermoell/roberta-swedish",
#    "danbert" : "DDSC/roberta-base-danish"
#}

In [15]:
# Short alias -> model name, listed as (alias, name) pairs; insertion order is kept.
models = dict(
    [
        ("xlm-r", "xlm-roberta-base"),
        ("scandibert", "ScandiBERT-no-faroese"),
        ("icebert", "IceBERT"),
        ("norbert", "norwegian-roberta-base"),
        ("swebert", "roberta-swedish"),
        ("danbert", "roberta-base-danish"),
    ]
)

In [16]:
# Show each configured model name on its own line, in dictionary order.
for alias in models:
  print(models[alias])

xlm-roberta-base
ScandiBERT-no-faroese
IceBERT
norwegian-roberta-base
roberta-swedish
roberta-base-danish


# Pilot study 1

In [17]:
! mkdir -p machamp/data/pilot  # creates the pilot data folder inside the machamp clone; -p also creates missing parents and does not fail if the folder already exists

In [18]:
# Copy the pilot train/dev/test files into the data folder.
# The source paths are relative and expect the data repository to be checked out as `CLthesis` (this capitalisation) in the current directory.

! cp CLthesis/data/balanced/scandi-base/single-source/train-isl-60k.conllu machamp/data/pilot/  # train set
! cp CLthesis/data/balanced/scandi-base/single-source/dev-isl-6k.conllu machamp/data/pilot/  # dev set
! cp CLthesis/data/balanced/scandi-base/single-source/test-fao.conllu machamp/data/pilot/  # test set

In [19]:
! ls machamp/data/pilot  # lists the data folder to confirm the train, dev and test files were copied

dev-isl-6k.conllu  test-fao.conllu  train-isl-60k.conllu


In [None]:
#! head machamp/data/pilot/*  # inspect files

In [18]:
# Absolute locations of the pilot training and development sets.
train_path, dev_path = (
    "/content/machamp/data/pilot/train-isl-60k.conllu",
    "/content/machamp/data/pilot/dev-isl-6k.conllu",
)

In [22]:
# Load both splits with the same conll_df settings: train first, then dev.
train_df, dev_df = (
    conll_df(split_path, file_index=False)
    for split_path in (train_path, dev_path)
)

In [None]:
# Preview the first three rows of the training data frame
train_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,w,l,x,p,g,f,Gender,Type,Definite,Degree,Mood,Case,Number,Voice,type,Person,Tense
s,i,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1,Á,eiga,ADP,fs_þgf,2,case,_,_,_,_,_,Dat,_,_,_,_,_
1,2,loka,lok,ADV,no_ft_þgf_kk,4,advmod,Masc,_,_,_,_,Dat,Plur,_,_,_,_
1,3,mínútunum,mínúta,NOUN,no_ft_þgf_kvk_gr,2,conj,Fem,_,Def,_,_,Dat,Plur,_,_,_,_


In [None]:
# Preview the first three rows of the development data frame
dev_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,w,l,x,p,g,f,Gender,Type,Definite,Degree,Mood,Case,Number,Voice,type,Person,Tense
s,i,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1,Um,um,ADV,eo,3,advmod,_,_,_,_,_,_,_,_,_,_,_
1,2,40%,40%,NUM,prósenta,3,nummod,_,_,_,_,_,_,_,_,_,_,_
1,3,álvera,álver,NOUN,no_ft_ef_hk,7,nsubj,Neut,_,_,_,_,Gen,Plur,_,_,_,_


In [20]:
# Both sources use the absolute path /content/CLthesis, so the data repository must exist under exactly that name
! cp /content/CLthesis/configs/config_pilot.json machamp/configs/  # copy dataset config file to config folder
! cp /content/CLthesis/configs/params_pilot.json machamp/configs/  # copy params file to config folder

In [21]:
# Print the dataset config that the training command reads via --dataset_configs
! cat machamp/configs/config_pilot.json

{
    "pilot" : {
        "train_data_path" : "/content/machamp/data/pilot/train-isl-60k.conllu",
        "dev_data_path" : "/content/machamp/data/pilot/dev-isl-6k.conllu",
        "word_idx" : 1,
        "tasks" : {
            "upos" : {
                "task_type" : "seq",
                "column_idx" : 3,
                //"metric" : "accuracy",
                //"additional_metrics" : ["f1_micro", "f1_macro"]
            },

            "dependency" : {
                "task_type" : "dependency",
                "column_idx" : 6,
                //"metric" : "las",
                //"additional_metrics" : ["uas"]
            }
        }
    }
}

In [22]:
# Print the parameters config that the training command reads via --parameters_config
! cat machamp/configs/params_pilot.json

{
  "transformer_model": "IceBERT",
  "reset_transformer_model": false,
  "random_seed": 8446,
  "default_dec_dataset_embeds_dim": 12,
  "encoder": {
    "dropout": 0.2,
    "max_input_length": 128,
    "update_weights_encoder": true
  },
  "decoders": {
    "default_decoder": {
      "loss_weight": 1.0,
      "metric": "accuracy",
      "topn": 1,
      "layers_to_use": [-1]
    },
    //"classification": {
    //},
    "dependency": {
      "arc_representation_dim": 768,
      "tag_representation_dim": 256,
      "metric": "las",
      "additional_metrics": "uas"
    },
    //"mlm": {
    //  "metric": "perplexity"
    //},
    //"multiclas": {
    //  "metric": "multi_acc",
    //  "threshold": 0.7
    //},
    //"multiseq": {
    //  "metric": "multi_acc",
    //  "threshold": 0.7
    //},
    //"regression": {
    //  "metric": "avg_dist"
    //},
    "seq": {
      "metric": "accuracy",
      "additional_metrics": ["f1_macro", "f1_micro"]
    },
    //"seq_bio": {
    //  "metric

## Training

In [24]:
# Train the pilot model from inside the machamp clone (`ls` first prints the clone's contents).
# NOTE(review): the MaChAmp README documents `--device -1` for CPU; `--device 1` would select a second GPU -- confirm before relying on the hint below
! cd machamp; ls; python3 train.py --dataset_configs configs/config_pilot.json --parameters_config configs/params_pilot.json --name pilot --device 0  # use --device -1 for CPU

configs  docs	  logs	   predict.py  requirements.txt  scripts  train.py
data	 LICENSE  machamp  README.md   results		 TODO
2024-03-02 13:19:12.305254: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-02 13:19:12.305309: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-02 13:19:12.306656: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-02 13:19:14,499 - INFO - machamp.model.trainer - cmd: train.py --dataset_configs configs/config_pilot.json --parameters_config configs/params_pilot.json --name pilot --device 0
2024-03-02 13:19:14,539 - INFO - machamp.model.trainer - 

## Testing

In [25]:
! mkdir -p machamp/predictions  # create the folder that predict.py writes its output file into; no error if it already exists

In [26]:
! cd machamp; python3 predict.py /content/machamp/logs/pilot/2024.03.02_13.19.14/model.pt /content/machamp/data/pilot/test-fao.conllu predictions/pilot_test.out --device 0  # set device to 1 for CPU

2024-03-02 13:40:03.465785: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-02 13:40:03.465839: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-02 13:40:03.467249: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-02 13:40:05,659 - INFO - __main__ - cmd: predict.py /content/machamp/logs/pilot/2024.03.02_13.19.14/model.pt /content/machamp/data/pilot/test-fao.conllu predictions/pilot_test.out --device 0

2024-03-02 13:40:05,659 - INFO - __main__ - loading model...
2024-03-02 13:40:06,505 - INFO - __main__ - predicting on /content/machamp/data/pilot/test-fao.conllu, s