# Installs and imports

In [None]:
!pip install --upgrade pip
!pip install sentencepiece
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
[0mLooking in indexes: https://pypi.org/simple, https://us-pyt

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForSequenceClassification

# Data

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import jieba
import nltk
import re

In [None]:
# Load the train/test splits from the working directory.
train = pd.read_csv('train.csv')

# Keep only the test rows whose label is non-zero. The explicit .copy() makes
# `test` an independent frame, so adding columns to it later cannot trigger
# pandas' SettingWithCopyWarning.
test = pd.read_csv('test.csv')
test = test[test['label'] != 0.0].copy()

print('Train data ' + str(len(train)) + ' total')
print('Test data ' + str(len(test)) + ' total')

Train data 9491 total
Test data 3881 total


# Model

In [None]:
# Use the GPU when the runtime has one (Runtime -> Change runtime Type -> GPU)
# and fall back to the CPU otherwise, instead of failing on model.to('cuda').
CUDA = torch.cuda.is_available()
BATCH_SIZE = 32
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)
if CUDA:
  model = model.to('cuda')
# The model is only used for inference, so put it in eval mode. Binding the
# return value to `_` keeps the module repr out of the cell output.
_ = model.eval()

Downloading (…)lve/main/config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this mode

## Encode

In [None]:
def encode(text, cuda=True, max_length=512):
  """Embed text with the module-level `tokenizer` and `model`.

  The model's first output (per-token hidden states) is max-pooled over the
  token axis, ignoring padding positions, and the pooled vectors are then
  averaged over the batch axis. A single string and a list of strings
  therefore both yield one vector.

  Args:
    text: A string, or a list of strings, to embed.
    cuda: If True, move the tokenized inputs to the GPU before the forward
      pass; the model must already live on the same device.
    max_length: Token limit used for truncation. Without it `truncation=True`
      is a no-op, because the tokenizer reports no predefined maximum length.
      512 is assumed to be the model's limit -- TODO confirm against
      config.max_position_embeddings.

  Returns:
    A 1-D numpy array holding the pooled embedding.
  """
  encoded_input = tokenizer(text, return_tensors='pt', padding=True,
                            truncation=True, max_length=max_length)
  if cuda:
    encoded_input = encoded_input.to('cuda')

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    hidden = model(**encoded_input)[0]

  # Padding positions must never win the max-pool, so push them to -inf first.
  # For a single string there is no padding and the result is unchanged.
  pad_mask = encoded_input['attention_mask'].unsqueeze(-1) == 0
  hidden = hidden.masked_fill(pad_mask, float('-inf'))

  embeddings = hidden.max(dim=1).values.cpu().numpy()
  embeddings = np.mean(embeddings, axis=0)

  return embeddings

In [None]:
def get_embedding_matrix(df, cuda=None):
  """Encode every entry of `df['text']` into one row of an embedding matrix.

  Args:
    df: DataFrame with a 'text' column.
    cuda: Forwarded to `encode`. None (the default) follows the notebook-level
      CUDA flag instead of always assuming a GPU is present.

  Returns:
    A numpy array with one row per row of `df` (in the same order) and
    `config.hidden_size` columns.
  """
  if cuda is None:
    cuda = CUDA
  # Take the width from the loaded model config rather than hard-coding 768.
  embeddings = np.zeros((len(df), config.hidden_size))
  # Iterate the column directly; df.iloc[i] would build a full row Series
  # on every step just to read one field.
  for row, text in enumerate(df['text']):
    embeddings[row, :] = encode(text, cuda=cuda)
  return embeddings

In [None]:
# Shuffle with a fixed seed so the fold assignment, and every score derived
# from it, is reproducible across kernel restarts.
SEED = 42
train = shuffle(train, random_state=SEED)
# Round-robin assignment of the shuffled rows to 5 cross-validation folds.
train['fold'] = np.arange(0,len(train)) % 5

In [None]:
# Embed both splits; each matrix has one row per row of the frame passed in.
x_train, x_test = get_embedding_matrix(train), get_embedding_matrix(test)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Choose the Ridge regularization strength by k-fold cross-validation, scoring
# each candidate by the Pearson correlation between predictions and labels.
cor_coef_train = []
cor_coef_validate = []
alpha = [40, 45, 50, 55, 60]

# Read the folds from the 'fold' column so the loop always agrees with the
# fold assignment, instead of repeating the fold count as a literal.
fold_ids = sorted(train['fold'].unique())
labels = train['label'].values

for a in alpha: # optimize model over regularization coefficient
  fold_corr_train = []
  fold_corr_validate = []

  for k in fold_ids: # k-fold
    # Build the hold-out mask once per fold instead of re-filtering the frame
    # for every use.
    is_validate = (train['fold'] == k).values
    trainfeatures, trainlabels = x_train[~is_validate], labels[~is_validate]
    validatefeatures, validatelabels = x_train[is_validate], labels[is_validate]

    clf = Ridge(alpha = a)
    clf.fit(trainfeatures, trainlabels)

    # train results
    fold_corr_train.append(np.corrcoef(trainlabels, clf.predict(trainfeatures))[0,1])

    # validation results
    fold_corr_validate.append(np.corrcoef(validatelabels, clf.predict(validatefeatures))[0,1])

  # Average the per-fold correlations for this alpha.
  cor_coef_train.append(np.mean(fold_corr_train))
  cor_coef_validate.append(np.mean(fold_corr_validate))

plt.plot(alpha, cor_coef_validate)
plt.xlabel('Ridge alpha')
plt.ylabel('Mean validation correlation')
plt.title(str(len(fold_ids)) + '-fold cross-validation');

In [None]:
# Refit on the full training set with the alpha whose mean validation
# correlation was highest.
best_alpha = alpha[np.argmax(cor_coef_validate)]
clf = Ridge(alpha=best_alpha)
clf.fit(x_train, train['label'])


In [None]:
# Evaluate the final model on the test set: one scatter plot and Pearson
# correlation per language, then correlations overall and for the
# seen / unseen language groups.
SEEN_LANGUAGES = ['English', 'Spanish', 'Portuguese', 'Italian', 'French', 'Chinese']
UNSEEN_LANGUAGES = ['Hindi', 'Dutch', 'Korean', 'Arabic']
languages = SEEN_LANGUAGES + UNSEEN_LANGUAGES

# assign() returns a new frame, so this never writes into a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
test = test.assign(predictions=clf.predict(x_test))

corr = []
fig, axes = plt.subplots(2, 5, figsize=(16, 10))

for ax, language in zip(axes.flat, languages):
  subset = test[test['language'] == language]
  language_corr = np.corrcoef(subset['label'], subset['predictions'])[0,1]
  corr.append(language_corr)
  print(language + ' ' + str(language_corr))

  ax.scatter(subset['label'], subset['predictions'])
  ax.set_title(language)
  ax.set_xlabel('label')
  ax.set_ylabel('prediction')
  ax.set_xlim(1, 5)
  ax.set_ylim(1, 5)
  # Dashed diagonal: where points would fall if prediction equalled label.
  ax.plot([1, 5], [1, 5], linestyle='--', color='k', lw=3, scalex=False, scaley=False)

fig.tight_layout()

# Same correlation for the whole test set and for each language group.
overall = []
groups = [
  ('Overall', test),
  ('Seen Languages', test[test['language'].isin(SEEN_LANGUAGES)]),
  ('Unseen Languages', test[test['language'].isin(UNSEEN_LANGUAGES)]),
]
for group_name, group_df in groups:
  group_corr = group_df['predictions'].corr(group_df['label'])
  overall.append(group_corr)
  print(group_name, group_corr)

# ' & '-separated rows with five decimals, one for the per-language scores
# and one for the group scores.
print(' & '.join('{:0.5f}'.format(c) for c in corr))
print(' & '.join('{:0.5f}'.format(c) for c in overall))