In [6]:
import os
import sys
import argparse
import copy
from pathlib import Path 

import pandas as pd

from datasets import load_metric
from datasets import Dataset

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler

from tqdm.auto import tqdm
import time

## Training Script

In [None]:
%%shell
# Training pipeline: check GPU visibility, preprocess the OLID tweets, then
# fine-tune the pretrained model named by `model_name`.
#
# All paths are relative — assumes the notebook's working directory is the
# repository root (TODO confirm).

# Make `conda activate` available in this non-interactive shell, then switch
# to the project environment.
source ~/anaconda3/etc/profile.d/conda.sh
conda activate ./573_gpu

# Stop at the first failing step so fine-tuning never runs on stale or missing
# preprocessed data. Enabled after activation so the conda hooks are unaffected.
set -e

# Selects the config file configs/<model_name>.json.
model_name='xlm-roberta-large'

# Test whether condor is using GPU
python3 src/test_gpu.py

# Preprocess tweets
python3 src/preprocess_olid.py \
    --file data/olid-training-v1.0.tsv \
    --train_ids data/eng_train_ids.txt \
    --val_ids data/eng_val_ids.txt \
    --split_punctuation \
    --remove_apostraphes \
    --remove_hashtags

# Finetune pretrained model on training data
python3 src/finetune_pretrained.py \
    --train_data data/clean_train_english.tsv \
    --val_data data/clean_val_english.tsv \
    --config "configs/${model_name}.json"

## Inference & Evaluation

In [None]:
%%shell
# Inference & evaluation pipeline: generate predictions on the validation set
# with the fine-tuned model, then score them.
#
# Each shell cell runs in its own subprocess, so neither the conda environment
# nor shell variables set in another cell are visible here. Both are
# re-established below; without this, ${model_name} expands to an empty string
# and the paths become "configs/.json" and "models/".
source ~/anaconda3/etc/profile.d/conda.sh
conda activate ./573_gpu

# Stop at the first failing step so evaluation never scores a stale
# predictions file left over from an earlier run.
set -e

# Must match the model name used by the training cell.
model_name='xlm-roberta-large'

# Run finetuned model predictions and generate output
python3 src/finetune_predict.py \
    --val_data data/clean_val_english.tsv \
    --config "configs/${model_name}.json" \
    --model_path "models/${model_name}" \
    --val_output_csv outputs/D4/D4_english_preds.csv

# Evaluation script
python3 src/eval.py \
    --val_output_csv outputs/D4/D4_english_preds.csv \
    --output_path results/D4_english_scores.out