In [None]:
!pip install --upgrade pip
!pip install sentencepiece
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
[0mLooking in indexes: https://pypi.org/simple, https://us-pyt

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch

## Parameters

In [None]:
# Name of the pretrained checkpoint. The same identifier is passed to
# AutoTokenizer.from_pretrained and AutoModelForSequenceClassification.from_pretrained below.
MODEL = "cardiffnlp/twitter-xlm-roberta-base"

## Data

In [None]:
# Seed for the per-language shuffle, so the train/validation split is identical on every run.
SPLIT_SEED = 42
# Share of each language's rows assigned to the training split; the remainder is validation.
TRAIN_FRACTION = 0.8

train_raw = pd.read_csv('train.csv')
# This notebook fine-tunes on the Chinese rows only.
train_raw = train_raw[train_raw['language'] == 'Chinese']

# Shuffle and split each language separately so every language keeps the same
# train/validation ratio. The pieces are collected in lists and concatenated once.
languages = train_raw['language'].unique()
train_parts = []
validation_parts = []
for language in languages:
    shuffled = train_raw[train_raw['language'] == language].sample(frac=1, random_state=SPLIT_SEED)
    ind = round(len(shuffled) * TRAIN_FRACTION)
    train_parts.append(shuffled[:ind])
    validation_parts.append(shuffled[ind:])

# Fall back to an empty frame (same columns) when the language filter matched no rows;
# pd.concat raises on an empty list.
train = pd.concat(train_parts) if train_parts else train_raw.iloc[:0].copy()
validation = pd.concat(validation_parts) if validation_parts else train_raw.iloc[:0].copy()

test = pd.read_csv('test.csv')
# Rows whose label is exactly 0.0 are dropped.
# NOTE(review): presumably 0.0 marks unlabelled rows — confirm against the dataset description.
test = test[test['label'] != 0.0]

print('Train data ' + str(len(train)) + ' total')
print('Validation data ' + str(len(validation)) + ' total')
print('Test data ' + str(len(test)) + ' total')

Train data 1277 total
Validation data 319 total
Test data 3881 total


In [None]:
def _write_lines(path, values):
    """Write one value per line to `path` as UTF-8 text, without CSV quoting.

    The files are read back by splitting on line breaks, so each record must occupy
    exactly one line and must not be altered by quoting. Line breaks inside a value
    are therefore replaced by a space, and missing values are written as an empty line.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        for value in values:
            cell = '' if pd.isna(value) else str(value)
            cell = cell.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
            handle.write(cell + '\n')


# Dump the label and text columns of every split into "<split>_labels.txt" and
# "<split>_text.txt". `files` lists the written names in the order they are produced.
files = []
for split_name, frame in (('test', test), ('train', train), ('val', validation)):
    for suffix, column in (('labels', 'label'), ('text', 'text')):
        file_name = f'{split_name}_{suffix}.txt'
        _write_lines(file_name, frame[column])
        files.append(file_name)

In [None]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file `path`.

    The empty entry that follows the final line break is dropped, so a file with
    N records yields exactly N items. The file handle is closed before returning.
    """
    with open(path, encoding='utf-8') as handle:
        lines = handle.read().split('\n')
    if lines[-1] == '':
        lines.pop()
    return lines


# dataset_dict[split] holds two parallel lists: 'text' (str) and 'labels' (float).
dataset_dict = {}
for split_name in ['train', 'val', 'test']:
    texts = _read_lines(f"{split_name}_text.txt")
    labels = [float(x) for x in _read_lines(f"{split_name}_labels.txt")]
    # Fail loudly instead of silently pairing texts with the wrong labels.
    if len(texts) != len(labels):
        raise ValueError(
            f"{split_name}: {len(texts)} text lines but {len(labels)} labels; "
            "the text file probably contains records that span several lines"
        )
    dataset_dict[split_name] = {'text': texts, 'labels': labels}

In [None]:
# Load the tokenizer that belongs to the MODEL checkpoint; use_fast=True requests the
# fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
# Upper bound on tokens per example. Without an explicit max_length the tokenizer warns
# "no maximum length is provided ... Default to no truncation", so truncation=True alone
# does nothing and an over-long text would reach the model untruncated.
# NOTE(review): 512 is the usual limit for XLM-R base checkpoints — confirm against
# model.config.max_position_embeddings.
MAX_LENGTH = 512


def _encode(texts):
    """Tokenize a list of strings, truncating to MAX_LENGTH tokens and padding to the longest item."""
    return tokenizer(texts, truncation=True, padding=True, max_length=MAX_LENGTH)


train_encodings = _encode(dataset_dict['train']['text'])
val_encodings = _encode(dataset_dict['val']['text'])
test_encodings = _encode(dataset_dict['test']['text'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer output with one label per example.

    `encodings` is a mapping from field name to a per-example sequence of values;
    `labels` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors: every encoding field plus 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with that split's label list.
train_dataset, val_dataset, test_dataset = (
    MyDataset(split_encodings, dataset_dict[split_name]['labels'])
    for split_name, split_encodings in (
        ('train', train_encodings),
        ('val', val_encodings),
        ('test', test_encodings),
    )
)

## Fine-tuning

In [None]:
# Hyperparameters.
# LR used to be 0.001 but was never passed to TrainingArguments, so training actually ran
# with the library default. The constant now holds that default value (5e-5) and is
# wired in explicitly, which keeps the optimisation unchanged.
LR = 5e-5
EPOCHS = 15
BATCH_SIZE = 64
# Log, evaluate and checkpoint every this many optimisation steps.
EVAL_STEPS = 100

training_args = TrainingArguments(
    output_dir='./results',                   # checkpoints are written here
    num_train_epochs=EPOCHS,                  # total number of training epochs
    learning_rate=LR,                         # peak learning rate after warmup
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # batch size per device during evaluation
    warmup_steps=100,                         # warmup steps of the learning-rate scheduler
    weight_decay=0.01,                        # strength of weight decay
    logging_dir='./logs',                     # directory for storing logs
    logging_steps=EVAL_STEPS,                 # log the training loss at every evaluation
    load_best_model_at_end=True,              # reload the best checkpoint after training
    evaluation_strategy='steps',
    eval_steps=EVAL_STEPS,                    # evaluate every EVAL_STEPS steps
    # Checkpoint on the same schedule as evaluation. Without this, saving falls back to
    # the library default interval, which is longer than the 300-step run shown in the
    # training log, so there would be no checkpoint for load_best_model_at_end to restore.
    save_strategy='steps',
    save_steps=EVAL_STEPS,
    save_total_limit=5,                       # keep at most 5 checkpoints on disk
)

# num_labels=1 gives the classification head a single output, i.e. one predicted score
# per text (Transformers treats a single label as a regression problem).
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.den

In [None]:
# Gather everything the Trainer needs, then build it and start fine-tuning.
trainer_inputs = {
    'model': model,                   # the instantiated 🤗 Transformers model to be trained
    'args': training_args,            # training arguments, defined above
    'train_dataset': train_dataset,   # training dataset
    'eval_dataset': val_dataset,      # evaluation dataset
}
trainer = Trainer(**trainer_inputs)

trainer.train()

***** Running training *****
  Num examples = 1277
  Num Epochs = 15
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 300
  Number of trainable parameters = 278044417


Step,Training Loss,Validation Loss
100,1.8445,0.455566


***** Running Evaluation *****
  Num examples = 319
  Batch size = 64


In [None]:
# Write the model currently held by the trainer to ./results/best_model.
# NOTE(review): this is the best checkpoint only if load_best_model_at_end actually
# restored one after training — confirm that checkpoints are saved during the run.
trainer.save_model("./results/best_model") # persist to disk

## Evaluate on Test set

In [None]:
# Run the fine-tuned model on the test split. The result unpacks into three values:
# the raw model outputs, the labels stored in test_dataset, and a third value that is
# discarded here (presumably the metrics — confirm against the Trainer.predict docs).
test_preds_raw, test_labels , _ = trainer.predict(test_dataset)

In [None]:
# Ground-truth file used for scoring.
# NOTE(review): absolute Google Drive path — Drive must be mounted before this cell runs.
GROUNDTRUTH_PATH = '/content/drive/MyDrive/SemEval2023/groundtruth.csv'

test = pd.read_csv(GROUNDTRUTH_PATH)

# Flatten the raw model output to one value per row before storing it as a column
# (presumably shape (N, 1), because the model has a single output — TODO confirm).
flat_preds = np.asarray(test_preds_raw).reshape(-1)
# The predictions were produced from the filtered test split; they can only be attached
# if the ground-truth file has exactly one row per prediction.
if len(flat_preds) != len(test):
    raise ValueError(
        f"{len(flat_preds)} predictions cannot be attached to {len(test)} ground-truth rows"
    )

test['predictions'] = flat_preds
test['chinese'] = flat_preds
test

In [None]:
# Language groups used for the summary rows printed below as "Seen Languages" and
# "Unseen Languages". Defined once so the per-language panels and the group
# correlations cannot drift apart.
SEEN_LANGUAGES = ['English', 'Spanish', 'Portuguese', 'Italian', 'French', 'Chinese']
UNSEEN_LANGUAGES = ['Hindi', 'Dutch', 'Korean', 'Arabic']
languages = SEEN_LANGUAGES + UNSEEN_LANGUAGES

# Both axes of every panel are clipped to this window.
AXIS_RANGE = (1, 5)

f = plt.figure(figsize=(16, 10))

# One scatter panel (label vs. prediction) and one Pearson correlation per language.
corr = []
for position, language in enumerate(languages, start=1):
    subset = test[test['language'] == language]
    r = np.corrcoef(subset['label'], subset['predictions'])[0, 1]
    print(language + ' ' + str(r))
    corr.append(r)

    ax = plt.subplot(2, 5, position)
    ax.scatter(subset['label'], subset['predictions'])
    ax.set_title(language)
    ax.set_xlabel('label')
    ax.set_ylabel('prediction')
    ax.set_xlim(AXIS_RANGE)
    ax.set_ylim(AXIS_RANGE)
    # Dashed diagonal marks perfect agreement (prediction == label).
    ax.plot(AXIS_RANGE, AXIS_RANGE, linestyle='--', color='k', lw=3, scalex=False, scaley=False)

f.tight_layout()

# Pearson correlation over all rows, then over the seen and the unseen language groups.
s_df = test[test['language'].isin(SEEN_LANGUAGES)]
u_df = test[test['language'].isin(UNSEEN_LANGUAGES)]

overall = []
for group_name, frame in (('Overall', test), ('Seen Languages', s_df), ('Unseen Languages', u_df)):
    r = frame['predictions'].corr(frame['label'])
    overall.append(r)
    print(group_name, r)

# " & "-separated rows with five decimals (LaTeX table cell format).
print(' & '.join('{:0.5f}'.format(value) for value in corr))
print(' & '.join('{:0.5f}'.format(value) for value in overall))

In [None]:
# Write the ground-truth frame, including the prediction columns added above, to test.csv.
# NOTE(review): this overwrites the 'test.csv' that the data-loading cell reads, so a
# later Restart & Run All starts from a different test file — confirm this is intended
# or write to a separate file.
test.to_csv('test.csv', index=False)