Each set of experiments on 10 seeds ran for about 10-30 minutes on a V100 GPU (Google Colab). The list of packages that were part of the Google Colab environment at the time of the experiments can be found in `google_colab_requirements.txt`.

In [None]:
# Colab setup cell: install the Hugging Face libraries imported below (no versions are pinned on this line).
!pip install transformers datasets evaluate accelerate



# Import libraries

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Optional: uncomment and edit to change the working directory, e.g. to the folder
# containing agg_tweet_ratings.json, which later cells open by relative path.
# %cd /content/drive/MyDrive/your_path_here

In [None]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import statistics

In [None]:
from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate

from transformers import set_seed

In [None]:
import torch
from torch import cuda

# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Seeds for the repeated runs: the experiment loops below call train_DistilBERT once per seed.
random_seeds = [128, 101, 77, 34, 255, 67, 195, 3, 222, 234]

Prepare evaluation metrics

In [None]:
# Load the metric objects once; compute_accuracy and compute_f1 below reuse them.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load('f1')

def compute_accuracy(eval_pred):
    """Return the result of the ``accuracy`` metric for a ``(scores, labels)`` pair.

    ``eval_pred`` unpacks into per-class scores (one row per example) and the
    reference labels. The predicted class of each example is the arg-max of
    its scores along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

def compute_f1(eval_pred):
    """Return the result of the ``f1`` metric for a ``(scores, labels)`` pair.

    ``eval_pred`` unpacks into per-class scores (one row per example) and the
    reference labels. The predicted class of each example is the arg-max of
    its scores along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1.compute(predictions=predicted_classes, references=references)


Define the tokenizer and data collator

In [None]:
# Tokenizer for the same distilbert-base-uncased checkpoint that train_DistilBERT fine-tunes.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples``, truncating to at most 512 tokens.

    Used as the callback for ``Dataset.map(..., batched=True)`` further down.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)


# Collator built from the same tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Classifying Relevance

Load in the dataset

In [None]:
# Load every aggregated tweet rating; the records sit under the top-level "data" key.
# The encoding is given explicitly so non-ASCII tweet text is decoded identically on
# every platform instead of depending on the locale default.
with open('agg_tweet_ratings.json', encoding='utf-8') as f:
    agg_tweet_ratings = json.load(f)['data']
agg_tweet_ratings_df = pd.DataFrame.from_dict(agg_tweet_ratings)

In [None]:
def train_DistilBERT(seed):
    """Fine-tune DistilBERT as a binary relevance classifier and score it on a held-out split.

    Reads the module-level ``tokenized_xmp_dataset`` (it must provide ``text``
    and ``label`` columns in addition to the tokenizer outputs), holds out 20%
    of it as a test set, trains on the remaining 80% for three epochs and
    evaluates the trained model on the held-out part.

    Args:
        seed: Integer passed to ``set_seed`` and used to seed the train/test split.

    Returns:
        The ``'f1'`` value reported by ``compute_f1`` over the complete test
        split (one metric computation over all test predictions).
    """
    # set the seed
    set_seed(seed)

    # split the dataset
    # Dataset.shuffle returns a new dataset instead of shuffling in place, so no
    # separate shuffle call is made here; train_test_split shuffles on its own
    # and is seeded directly so the split is tied to ``seed``.
    xmp_dataset_split = tokenized_xmp_dataset.train_test_split(test_size=0.2, seed=seed)

    # Prepare for training
    id2label = {0: "NOT_RELEVANT", 1: "RELEVANT"}
    label2id = {"NOT_RELEVANT": 0, "RELEVANT": 1}

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

    # No evaluation strategy is passed (the library default is "no"), and
    # load_best_model_at_end is left off: with checkpoint saving disabled there
    # is no best checkpoint that could be reloaded.
    training_args = TrainingArguments(
        output_dir="xmp_relevance_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="no",
        # push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=xmp_dataset_split["train"],
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_f1,
    )

    #train the model
    trainer.train()

    # evaluate on the test set
    model.eval()  # training leaves dropout enabled; switch to inference mode
    model.to('cpu')
    inference_batch_size = 16
    # Materialise the columns once instead of once per batch.
    test_texts = list(xmp_dataset_split['test']['text'])
    test_labels = list(xmp_dataset_split['test']['label'])
    batch_logits = []
    # Stepping by the batch size never produces an empty trailing batch, even
    # when the test size is an exact multiple of the batch size.
    for start in range(0, len(test_texts), inference_batch_size):
        texts = test_texts[start:start + inference_batch_size]
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            batch_logits.append(model(**inputs).logits)
    logits = torch.cat(batch_logits).numpy()
    assert len(logits) == len(test_labels)

    # F1 is not additive across batches, so it is computed once over all
    # predictions rather than averaged from per-batch values.
    return compute_f1((logits, test_labels))['f1']

## Cleaned uncased tweet text

Process the dataset

In [None]:
# Model input: lower-cased cleaned tweet text. Target: relevance rating cast to int.
agg_tweet_ratings_df['text'] = agg_tweet_ratings_df['clean_text'].map(lambda tweet_text: tweet_text.lower())
agg_tweet_ratings_df['label'] = agg_tweet_ratings_df['relevance_rating'].map(int)
xmp_dataset = Dataset.from_pandas(agg_tweet_ratings_df.loc[:, ['text', 'label']])
# Display one randomly drawn example as a sanity check.
xmp_dataset.shuffle().select(range(3))[0]

{'text': 'immigrants face enormous barriers to getting health care including policy decisions that exclude them from medicaid. the act by and would help right that wrong. ',
 'label': 1}

In [None]:
# Tokenize all examples in batches; train_DistilBERT reads this module-level variable.
tokenized_xmp_dataset = xmp_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/7032 [00:00<?, ? examples/s]

Train and evaluate

In [None]:
# One full train/evaluate cycle per seed; collect the returned test scores.
# The shared random_seeds list is used so every experiment runs on the same seeds.
evaluation_scores_with_seeds = []
for seed in tqdm(random_seeds):
    evaluation_scores_with_seeds.append(train_DistilBERT(seed))

  0%|          | 0/10 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.359
1000,0.2052


 10%|█         | 1/10 [02:54<26:06, 174.10s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3774
1000,0.2155


 20%|██        | 2/10 [05:31<21:55, 164.40s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3618
1000,0.2052


 30%|███       | 3/10 [08:05<18:35, 159.36s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3545
1000,0.1869


 40%|████      | 4/10 [10:33<15:29, 154.87s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3513
1000,0.2093


 50%|█████     | 5/10 [13:09<12:57, 155.41s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3495
1000,0.199


 60%|██████    | 6/10 [15:44<10:20, 155.20s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3373
1000,0.202


 70%|███████   | 7/10 [18:21<07:47, 155.88s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3467
1000,0.2093


 80%|████████  | 8/10 [20:58<05:12, 156.28s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3399
1000,0.2033


 90%|█████████ | 9/10 [23:33<02:35, 155.99s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3549
1000,0.2096


100%|██████████| 10/10 [26:10<00:00, 157.06s/it]


In [None]:
# Mean and sample standard deviation of the per-seed test scores, rounded to 3 decimals.
round(sum(evaluation_scores_with_seeds)/len(evaluation_scores_with_seeds), 3), round(statistics.stdev(evaluation_scores_with_seeds), 3)

(0.87, 0.01)

## Cleaned uncased tweet text + sentiment features

Process the dataset

In [None]:
# Model input: lower-cased cleaned tweet text with the sentiment description appended.
# Target: relevance rating cast to int.
agg_tweet_ratings_df['text'] = agg_tweet_ratings_df['clean_text_with_sentiment'].map(lambda tweet_text: tweet_text.lower())
agg_tweet_ratings_df['label'] = agg_tweet_ratings_df['relevance_rating'].map(int)
xmp_dataset = Dataset.from_pandas(agg_tweet_ratings_df.loc[:, ['text', 'label']])
# Display one randomly drawn example as a sanity check.
xmp_dataset.shuffle().select(range(3))[0]

{'text': 'these 8 women opened a salon that is spreading hope in one of the world’s largest refugee settlements.…  this tweet is mostly positive and hateful and contains mostly joy.',
 'label': 1}

In [None]:
# Re-tokenize with the sentiment-augmented text; this rebinds the module-level
# variable that train_DistilBERT reads.
tokenized_xmp_dataset = xmp_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/7032 [00:00<?, ? examples/s]

Train and evaluate

In [None]:
# One full train/evaluate cycle per seed; collect the returned test scores.
evaluation_scores_with_seeds = []
for seed in tqdm(random_seeds):
    evaluation_scores_with_seeds.append(train_DistilBERT(seed))

  0%|          | 0/10 [00:00<?, ?it/s]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3702
1000,0.2176


 10%|█         | 1/10 [03:04<27:39, 184.42s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3814
1000,0.2221


 20%|██        | 2/10 [06:08<24:32, 184.08s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3646
1000,0.2187


 30%|███       | 3/10 [09:07<21:11, 181.71s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3582
1000,0.1925


 40%|████      | 4/10 [12:15<18:26, 184.41s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3533
1000,0.215


 50%|█████     | 5/10 [15:19<15:20, 184.03s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3571
1000,0.2172


 60%|██████    | 6/10 [18:21<12:13, 183.40s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3422
1000,0.2079


 70%|███████   | 7/10 [21:22<09:07, 182.60s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3542
1000,0.2188


 80%|████████  | 8/10 [24:25<06:05, 182.83s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3437
1000,0.2133


 90%|█████████ | 9/10 [27:24<03:01, 181.66s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3587
1000,0.215


100%|██████████| 10/10 [30:22<00:00, 182.25s/it]


In [None]:
# Mean and sample standard deviation of the per-seed test scores, rounded to 3 decimals.
round(sum(evaluation_scores_with_seeds)/len(evaluation_scores_with_seeds), 3), round(statistics.stdev(evaluation_scores_with_seeds), 3)

(0.873, 0.008)

# Classifying Xenophobia

Load in the dataset

In [None]:
# Keep only the tweets whose relevance_rating is truthy; these form the
# dataset for the second classification task.
# The encoding is given explicitly so non-ASCII tweet text is decoded identically on
# every platform instead of depending on the locale default.
with open('agg_tweet_ratings.json', encoding='utf-8') as f:
    agg_tweet_ratings = [tweet for tweet in json.load(f)['data'] if tweet['relevance_rating']]
agg_tweet_ratings_df = pd.DataFrame.from_dict(agg_tweet_ratings)
len(agg_tweet_ratings_df)

3791

In [None]:
def train_DistilBERT(seed):
    """Fine-tune DistilBERT as a three-class (ANTI / NEUTRAL / PRO) classifier and score it.

    Reads the module-level ``tokenized_xmp_dataset`` (it must provide ``text``
    and ``label`` columns in addition to the tokenizer outputs), holds out 20%
    of it as a test set, trains on the remaining 80% for three epochs and
    evaluates the trained model on the held-out part.

    Args:
        seed: Integer passed to ``set_seed`` and used to seed the train/test split.

    Returns:
        The ``'accuracy'`` value reported by ``compute_accuracy`` over the
        complete test split (one metric computation over all test predictions).
    """
    # set the seed
    set_seed(seed)

    # split the dataset
    # Dataset.shuffle returns a new dataset instead of shuffling in place, so no
    # separate shuffle call is made here; train_test_split shuffles on its own
    # and is seeded directly so the split is tied to ``seed``.
    xmp_dataset_split = tokenized_xmp_dataset.train_test_split(test_size=0.2, seed=seed)

    # Prepare for training
    id2label = {0: "ANTI", 1: "NEUTRAL", 2: "PRO"}
    label2id = {"ANTI": 0, "NEUTRAL": 1, "PRO": 2}

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert/distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
    )

    # No evaluation strategy is passed (the library default is "no"), and
    # load_best_model_at_end is left off: with checkpoint saving disabled there
    # is no best checkpoint that could be reloaded.
    training_args = TrainingArguments(
        output_dir="xmp_xm_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="no",
        # push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=xmp_dataset_split["train"],
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_accuracy,
    )

    #train the model
    trainer.train()

    # evaluate on the test set
    model.eval()  # training leaves dropout enabled; switch to inference mode
    model.to('cpu')
    inference_batch_size = 16
    # Materialise the columns once instead of once per batch.
    test_texts = list(xmp_dataset_split['test']['text'])
    test_labels = list(xmp_dataset_split['test']['label'])
    batch_logits = []
    # Stepping by the batch size never produces an empty trailing batch, even
    # when the test size is an exact multiple of the batch size.
    for start in range(0, len(test_texts), inference_batch_size):
        texts = test_texts[start:start + inference_batch_size]
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            batch_logits.append(model(**inputs).logits)
    logits = torch.cat(batch_logits).numpy()
    assert len(logits) == len(test_labels)

    # Averaging per-batch accuracies would over-weight the smaller final batch,
    # so accuracy is computed once over all predictions.
    return compute_accuracy((logits, test_labels))['accuracy']

## Cleaned uncased tweet text

Process the dataset

In [None]:
def colapse_to_three_categories(r):
    """Map a signed rating onto one of three class ids.

    Returns 0 for a negative rating, 2 for a positive rating and 1 for
    anything else (including exactly zero).
    """
    if r > 0:
        return 2
    if r < 0:
        return 0
    return 1

In [None]:
# Model input: lower-cased cleaned tweet text. Target: xm_rating collapsed to three class ids.
agg_tweet_ratings_df['text'] = agg_tweet_ratings_df['clean_text'].map(lambda tweet_text: tweet_text.lower())
agg_tweet_ratings_df['label'] = agg_tweet_ratings_df['xm_rating'].map(colapse_to_three_categories)
xmp_dataset = Dataset.from_pandas(agg_tweet_ratings_df.loc[:, ['text', 'label']])
# Display one randomly drawn example as a sanity check.
xmp_dataset.shuffle().select(range(3))[0]

{'text': 'ice arrested more than 2,000 during a 39-day enforcement surge targeting those with criminal arrests and/or convict… ',
 'label': 0}

In [None]:
# Tokenize all examples in batches; train_DistilBERT reads this module-level variable.
tokenized_xmp_dataset = xmp_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3791 [00:00<?, ? examples/s]

Train and evaluate

In [None]:
# One full train/evaluate cycle per seed; collect the returned test scores.
evaluation_scores_with_seeds = []
for seed in tqdm(random_seeds):
    evaluation_scores_with_seeds.append(train_DistilBERT(seed))

  0%|          | 0/10 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3723


 10%|█         | 1/10 [01:26<12:59, 86.64s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3616


 20%|██        | 2/10 [02:36<10:13, 76.70s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3488


 30%|███       | 3/10 [03:39<08:14, 70.64s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3645


 40%|████      | 4/10 [04:43<06:48, 68.09s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.369


 50%|█████     | 5/10 [05:48<05:34, 66.97s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3734


 60%|██████    | 6/10 [06:52<04:23, 65.89s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3507


 70%|███████   | 7/10 [07:56<03:15, 65.13s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3604


 80%|████████  | 8/10 [09:00<02:09, 64.74s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3573


 90%|█████████ | 9/10 [10:03<01:04, 64.22s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3716


100%|██████████| 10/10 [11:07<00:00, 66.72s/it]


In [None]:
# Unrounded mean and sample standard deviation of the per-seed test scores.
sum(evaluation_scores_with_seeds)/len(evaluation_scores_with_seeds), statistics.stdev(evaluation_scores_with_seeds)

(0.893359375, 0.009529689205057729)

In [None]:
# The same two statistics, rounded to 3 decimals.
round(sum(evaluation_scores_with_seeds)/len(evaluation_scores_with_seeds), 3), round(statistics.stdev(evaluation_scores_with_seeds), 3)

(0.893, 0.01)

## Cleaned uncased tweet text + sentiment features

Process the dataset

In [None]:
# Model input: lower-cased cleaned tweet text with the sentiment description appended.
# Target: xm_rating collapsed to three class ids.
agg_tweet_ratings_df['text'] = agg_tweet_ratings_df['clean_text_with_sentiment'].map(lambda tweet_text: tweet_text.lower())
agg_tweet_ratings_df['label'] = agg_tweet_ratings_df['xm_rating'].map(colapse_to_three_categories)
xmp_dataset = Dataset.from_pandas(agg_tweet_ratings_df.loc[:, ['text', 'label']])
# Display one randomly drawn example as a sanity check.
xmp_dataset.shuffle().select(range(3))[0]

{'text': '"since march 21, when a series of travel restrictions began along the u.s. border with mexico and canada, cbp has been immediately expelling most migrants they encounter at the border back to either mexico or canada."  this tweet is mostly neutral and hateful.',
 'label': 2}

In [None]:
# Re-tokenize with the sentiment-augmented text; this rebinds the module-level
# variable that train_DistilBERT reads.
tokenized_xmp_dataset = xmp_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3791 [00:00<?, ? examples/s]

Train and evaluate

In [None]:
# One full train/evaluate cycle per seed; collect the returned test scores.
evaluation_scores_with_seeds = []
for seed in tqdm(random_seeds):
    evaluation_scores_with_seeds.append(train_DistilBERT(seed))

  0%|          | 0/10 [00:00<?, ?it/s]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.386


 10%|█         | 1/10 [01:16<11:30, 76.70s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3755


 20%|██        | 2/10 [02:46<11:15, 84.38s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3734


 30%|███       | 3/10 [04:02<09:24, 80.58s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3926


 40%|████      | 4/10 [05:18<07:52, 78.80s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3779


 50%|█████     | 5/10 [06:35<06:30, 78.05s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3878


 60%|██████    | 6/10 [07:50<05:08, 77.16s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3778


 70%|███████   | 7/10 [09:05<03:49, 76.38s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3763


 80%|████████  | 8/10 [10:19<02:31, 75.57s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3821


 90%|█████████ | 9/10 [11:34<01:15, 75.31s/it]Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3851


100%|██████████| 10/10 [12:48<00:00, 76.86s/it]


In [None]:
# Unrounded mean and sample standard deviation of the per-seed test scores.
sum(evaluation_scores_with_seeds)/len(evaluation_scores_with_seeds), statistics.stdev(evaluation_scores_with_seeds)

(0.8934895833333332, 0.009008389234786113)

In [None]:
# The same two statistics, rounded to 3 decimals.
round(sum(evaluation_scores_with_seeds)/len(evaluation_scores_with_seeds), 3), round(statistics.stdev(evaluation_scores_with_seeds), 3)

(0.893, 0.009)