# Installations & imports

In [None]:
! git clone https://github.com/machamp-nlp/machamp.git --quiet  # clones machamp

In [None]:
# NOTE: every `!` line runs in its own temporary subshell, so this `cd` does not
# carry over to later cells; the cells below re-enter the folder themselves
# with `cd machamp; ...`.
! cd machamp

In [None]:
! cd machamp; cat README.md | grep "requirements";  # requirements file

In [None]:
#! nvidia-smi  # gpu status

In [None]:
! cd machamp; pip3 install --user -r requirements.txt --quiet  # installs dependencies

In [None]:
import os

# `! export ...` only changes a throwaway subshell, so the notebook kernel (and
# every later `!` command it spawns) never sees the new entry. Updating
# os.environ changes the kernel's own environment, which child shells inherit.
_local_bin = "/root/.local/bin"
if _local_bin not in os.environ.get("PATH", "").split(os.pathsep):
  # guard keeps PATH from growing when the cell is re-run
  os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + _local_bin

In [None]:
! pip install conllu --quiet
! pip install conll-df --quiet

  Preparing metadata (setup.py) ... done
  Building wheel for conll-df (setup.py) ... done


In [None]:
import os
import random
import transformers
import pandas as pd

from conllu import parse
from conll_df import conll_df

In [None]:
#! rm -r /content/CLthesis  # first remove clone if necessary

In [None]:
# The target folder is named explicitly: without it git would create `clthesis/`
# (taken from the URL), while every later cell reads from `CLthesis/`.
! git clone --quiet https://github.com/krfis/clthesis.git CLthesis  # clones github repo with data

# Helper functions

In [None]:
def count_tokens(dataset):
  '''Counts the tokens of a conllu dataset.

  dataset : iterable of sentences, each sentence itself an iterable of tokens
  returns the total number of tokens over all sentences : int'''

  # one tick per (sentence, token) pair; an empty dataset sums to 0
  return sum(1 for sentence in dataset for _ in sentence)

In [None]:
def process(path):
  '''Parses a conllu file.

  path : location of the conllu file, read as UTF-8 text
  returns whatever conllu's parse() produces for the file's text : list(TokenList)'''

  with open(path, "r", encoding="utf-8") as dataset:
    raw_text = dataset.read()  # not named `input`, which would shadow the builtin
  # the whole text is already in memory, so parsing happens after the file is closed
  return parse(raw_text)

In [None]:
def write(content, path="output.conllu"):
  '''Writes the serialized form of every item in content to a file.

  content : iterable of objects offering serialize() (e.g. TokenLists)
  path : destination file, overwritten if it exists; defaults to "output.conllu"
         so existing write(content) calls behave as before
  returns nothing'''

  with open(path, "w", encoding="utf-8") as output:
    # serialized sentences are written back to back, in iteration order
    output.writelines(sent.serialize() for sent in content)

# Models

In [None]:
# Short nickname -> model identifier string.
# (The identifiers look like Hugging Face Hub repository IDs; confirm before relying on that.)
models = {
    "xlm-r": "FacebookAI/xlm-roberta-base",
    "scandibert": "vesteinn/ScandiBERT-no-faroese",
    "icebert": "mideind/IceBERT",
    "norbert": "patrickvonplaten/norwegian-roberta-base",
    "swebert": "birgermoell/roberta-swedish",
    "danbert": "DDSC/roberta-base-danish",
}

In [None]:
# show every model identifier, one per line, in the order the dict defines them
for model_id in models.values():
  print(model_id)

# Pilot study 1

In [None]:
! mkdir -p machamp/data/pilot  # creates new folder for data

In [None]:
# copy files to data folder

! cp CLthesis/data/balanced/scandi-base/single-source/train-isl-60k.conllu machamp/data/pilot/  # train set
! cp CLthesis/data/balanced/scandi-base/single-source/dev-isl-6k.conllu machamp/data/pilot/  # dev set
! cp CLthesis/data/balanced/scandi-base/single-source/test-fao.conllu machamp/data/pilot/  # test set

In [None]:
! ls machamp/data/pilot  # contents of data folder

In [None]:
#! head machamp/data/pilot/*  # inspect files

In [None]:
train_path = "/content/machamp/data/pilot/train-isl-60k.conllu"
dev_path = "/content/machamp/data/pilot/dev-isl-6k.conllu"

In [None]:
train_df = conll_df(train_path, file_index=False)
dev_df = conll_df(dev_path, file_index=False)

In [None]:
train_df.head(3)

In [None]:
dev_df.head(3)

In [None]:
! cp /content/CLthesis/configs/config_pilot.json machamp/configs/  # copy dataset config file to config folder
! cp /content/CLthesis/configs/params_pilot.json machamp/configs/  # copy params file to config folder

In [None]:
! cat machamp/configs/config_pilot.json

In [None]:
! cat machamp/configs/params_pilot.json

## Training

In [None]:
! cd machamp; ls; python3 train.py --dataset_configs configs/config_pilot.json --parameters_config configs/params_pilot.json --name pilot --device 0  # set device to -1 for CPU

## Testing

In [None]:
! mkdir -p machamp/predictions  # create folder

In [None]:
# NOTE: the timestamped folder under logs/pilot/ appears to belong to one specific
# training run; point the model path at the folder your own run produced.
! cd machamp; python3 predict.py /content/machamp/logs/pilot/2024.03.02_13.19.14/model.pt /content/machamp/data/pilot/test-fao.conllu predictions/pilot_test.out --device 0  # set device to -1 for CPU