# Text as Data Q6
Isaac Tabb

03/01/23

### Step 0: Import our datasets

First we will import our three dataset csv files.

In [None]:
import io

import pandas as pd
from google.colab import files

# Opens Colab's upload dialog; the result is indexed by file name below
uploaded = files.upload()

# Parse each uploaded CSV (raw bytes) into its own DataFrame
train_df, valid_df, test_df = (
    pd.read_csv(io.BytesIO(uploaded[csv_name]))
    for csv_name in ('training_set.csv', 'validation_set.csv', 'test_set.csv')
)

Saving test_set.csv to test_set.csv
Saving training_set.csv to training_set.csv
Saving validation_set.csv to validation_set.csv


In [None]:
# Convert every split into a list of per-row dictionaries (one dict per tweet)
train_dct = train_df.to_dict('records')
valid_dct = valid_df.to_dict('records')
test_dct = test_df.to_dict('records')

# For each split, pull the tweet text and the team label into two parallel lists
train_tweets = [row['text'] for row in train_dct]
train_labels = [row['team'] for row in train_dct]

valid_tweets = [row['text'] for row in valid_dct]
valid_labels = [row['team'] for row in valid_dct]

test_tweets = [row['text'] for row in test_dct]
test_labels = [row['team'] for row in test_dct]

The method that worked the best on my validation set was the DistilBERT end-to-end trained model using learning_rate=1e-5, batch_size=32, and epochs=10.

We will now test that model on the test set (woohoo!).

Let's begin by importing the transformers and datasets libraries.

In [None]:
# Install the Hugging Face libraries used in the rest of the notebook.
# The install log recorded below resolved transformers 4.26.1 and datasets 2.10.1.
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m115.4 MB/s[0m eta [36m0:00:00[0m
Co

We will now set up our tokenizer and model.

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

# class index -> team name; the reverse mapping is derived from it so the two cannot drift apart
id2label = dict(enumerate(["MiamiHeat", "LosAngelesLakers", "BostonCeltics", "DenverNuggets"]))
label2id = {team: idx for idx, team in id2label.items()}

# pretrained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# pretrained DistilBERT with a sequence-classification head, one output per team
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Now we will set up our datasets.

In [None]:
from datasets import Dataset


def _encode_split(records, max_length=157):
  """Tokenize one dataset split into the column dict passed to `Dataset.from_dict`.

  Args:
    records: list of row dicts, each with a 'text' (tweet) and a 'team' (label name) entry.
    max_length: length every sequence is padded or truncated to.
      NOTE(review): 157 is kept from the original code; its origin is not shown here
      (presumably the longest tokenized tweet) — confirm.

  Returns:
    dict with 'input_ids', 'attention_mask' and 'labels' lists, one entry per record.

  Raises:
    KeyError: if a record's team is not a key of `label2id`.
  """
  encoded = {'input_ids': [], 'attention_mask': [], 'labels': []}
  for record in records:
    # Call the tokenizer itself rather than tokenizer.encode: encode returns only the
    # input ids, so the padded positions would carry no attention mask and the model
    # would attend to [PAD] tokens as if they were real text.
    tokens = tokenizer(record['text'], padding='max_length', max_length=max_length, truncation=True)
    encoded['input_ids'].append(tokens['input_ids'])
    encoded['attention_mask'].append(tokens['attention_mask'])
    encoded['labels'].append(label2id[record['team']])
  return encoded


# tokenize the training, validation and test sets with the same settings
final_train_dct = _encode_split(train_dct)
final_valid_dct = _encode_split(valid_dct)
final_test_dct = _encode_split(test_dct)

# setup the datasets using the Dataset functionality
train_dataset = Dataset.from_dict(final_train_dct)
valid_dataset = Dataset.from_dict(final_valid_dct)
test_dataset = Dataset.from_dict(final_test_dct)

Let's set up our training arguments from our best model.

In [None]:
# Hyperparameters of the configuration that performed best on the validation set
best_hyperparams = {
    "learning_rate": 1e-5,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "num_train_epochs": 10,
    "weight_decay": 0,
}

training_args = TrainingArguments(
    output_dir="DistilBERT NBA Tweets Model",
    evaluation_strategy="epoch",   # evaluate on the validation set once per epoch
    **best_hyperparams,
)

We will initialize our data collator.

In [None]:
from transformers import DataCollatorWithPadding
# Collator handed to the Trainer; it assembles batches using this tokenizer.
# NOTE(review): every example is already padded to a fixed max_length when it is
# tokenized, so this presumably adds no further padding — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Now we set up our trainer.

In [None]:
# Bundle the model, hyperparameters, data splits, tokenizer and collator into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,  # validation split; the held-out test split is only used for prediction later
    tokenizer=tokenizer,
    data_collator=data_collator
)

And we will train!

In [None]:
# Fine-tune the model on train_dataset with the arguments configured above.
# As the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()

***** Running training *****
  Num examples = 6000
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1880
  Number of trainable parameters = 66956548


Epoch,Training Loss,Validation Loss
1,No log,0.823067
2,No log,0.779551
3,0.826100,0.735227
4,0.826100,0.700758
5,0.826100,0.678185
6,0.683600,0.679762
7,0.683600,0.678408
8,0.593500,0.679963
9,0.593500,0.683652
10,0.593500,0.683133


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
Saving model checkpoint to DistilBERT NBA Tweets Model/checkpoint-500
Configuration saved in DistilBERT NBA Tweets Model/checkpoint-500/config.json
Model weights saved in DistilBERT NBA Tweets Model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in DistilBERT NBA Tweets Model/checkpoint-500/tokenizer_config.json
Special tokens file saved in DistilBERT NBA Tweets Model/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
Saving model checkpoint to DistilBERT NBA Tweets Model/checkpoint-1000
Configuration saved in DistilBERT NBA Tweets Model/checkpoint-1000/config.json
Model weights saved in DistilBERT NBA Tweets Model/checkpoint-1000/pyto

TrainOutput(global_step=1880, training_loss=0.6694244465929396, metrics={'train_runtime': 878.5633, 'train_samples_per_second': 68.293, 'train_steps_per_second': 2.14, 'total_flos': 2437280082720000.0, 'train_loss': 0.6694244465929396, 'epoch': 10.0})

We will now use our trained model to make predictions on the test set.

In [None]:
# Run the fine-tuned model on the held-out test split. The result unpacks into
# (predictions, label_ids, metrics); predictions is iterated row by row and argmaxed below.
# NOTE(review): each row is presumably the four per-class scores (logits) — confirm.
predictions, label_ids, metrics = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 32


We will save all of the predicted labels using the argmax function.

In [None]:
import numpy as np

# For every test example, take the index of the largest score and map it back to its team name
labels_predicted = [id2label[np.argmax(scores)] for scores in predictions]

And here are our final evaluation metrics!

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute all four scores first: overall accuracy plus macro-averaged
# precision, recall and F1 over the team labels
macro_avg = {'average': 'macro'}
accuracy = accuracy_score(test_labels, labels_predicted)
precision = precision_score(test_labels, labels_predicted, **macro_avg)
recall = recall_score(test_labels, labels_predicted, **macro_avg)
f1 = f1_score(test_labels, labels_predicted, **macro_avg)

# Report them, one per line, rounded to three decimals
print(f"{accuracy=:.3f}")
print(f"{precision=:.3f}")
print(f"{recall=:.3f}")
print(f"{f1=:.3f}")

accuracy=0.769
precision=0.628
recall=0.545
f1=0.577


Now let's construct our confusion matrix.

In [None]:
# The outside dictionary keys are the actual labels, the inside keys are the predicted labels
conf_mat = {"MiamiHeat": {"MiamiHeat": 0, "BostonCeltics": 0, "DenverNuggets": 0, "LosAngelesLakers": 0},
            "BostonCeltics": {"MiamiHeat": 0, "BostonCeltics": 0, "DenverNuggets": 0, "LosAngelesLakers": 0},
            "DenverNuggets": {"MiamiHeat": 0, "BostonCeltics": 0, "DenverNuggets": 0, "LosAngelesLakers": 0},
            "LosAngelesLakers": {"MiamiHeat": 0, "BostonCeltics": 0, "DenverNuggets": 0, "LosAngelesLakers": 0}}

We will iterate through the true test labels and the predicted labels to fill in the values of the confusion matrix.

In [None]:
# The counts are initialised in a separate cell, so zero them here first:
# otherwise re-running this cell alone would add every test tweet a second time.
for predicted_counts in conf_mat.values():
  for team in predicted_counts:
    predicted_counts[team] = 0

# Each test tweet must have exactly one prediction
assert len(test_labels) == len(labels_predicted), "test labels and predictions differ in length"

# Tally every test tweet under conf_mat[actual team name][predicted team name]
for actual, pred in zip(test_labels, labels_predicted):
  conf_mat[actual][pred] += 1

And our final confusion matrix!

In [None]:
# Show the raw nested dict: outer keys are actual teams, inner keys are predicted teams
print(conf_mat)

### Looking at errors

First, let's look at mislabeled Miami Heat tweets.

In [None]:
# For each wrong prediction of interest, print up to three Miami Heat test tweets
# that the model assigned to that team instead
for wrong_team, heading in (
    ('BostonCeltics', "Miami Heat classified as Boston Celtics"),
    ('LosAngelesLakers', "Miami Heat classified as Los Angeles Lakers"),
):
  print(heading)
  shown = 0
  for idx, actual in enumerate(test_labels):
    if actual == 'MiamiHeat' and labels_predicted[idx] == wrong_team:
      print(test_tweets[idx])
      shown += 1
    # stop scanning once three examples have been printed
    if shown == 3:
      break
  print()
  print()


Miami Heat classified as Boston Celtics
I’ve always said crowder is good when there’s no pressure. It’s game 4 tonight in a 2-1 series he’s a no show.  
This guy..... I mean he’s not wrong ! @celtics @MiamiHEAT   https://t.co/UiJjKSgOzZ
Relying on Kobe  Tatum to carry them made Celtics lose 3 of  series games.      https://t.co/ElsLeTrqdC


Miami Heat classified as Los Angeles Lakers
1 remaining from Lakers to the championship         👇⬇️Click the link for details⬇️👇 https://t.co/vZ9rPE0Ejm
16 offensive rebounds by the Lakers. SIXTEEN!! I don’t care how much bigger they are, there’s no excuse. You’re simply not being physical. Embarrassing   
LEBRON'S FINALS : LEBRON IN LOS ANGELES (1.5) ➡️ https://t.co/Ny1ortsWJv                         https://t.co/Ud2OmQZ0BP




Let's look at mislabeled Boston Celtics tweets.

In [None]:
# For each wrong prediction of interest, print up to three Boston Celtics test tweets
# that the model assigned to that team instead
for wrong_team, heading in (
    ('MiamiHeat', "Boston Celtics classified as Miami Heat"),
    ('LosAngelesLakers', "Boston Celtics classified as Los Angeles Lakers"),
):
  print(heading)
  shown = 0
  for idx, actual in enumerate(test_labels):
    if actual == 'BostonCeltics' and labels_predicted[idx] == wrong_team:
      print(test_tweets[idx])
      shown += 1
    # stop scanning once three examples have been printed
    if shown == 3:
      break
  print()
  print()

Boston Celtics classified as Miami Heat
Silver Talks 2020-21 Season https://t.co/JU3YiE2D8z   
Let’s Go !!!!      https://t.co/Ix1WW6MSsh
In a do or die game win or go home our  are on pace for just over 60pts....EMBARRASSING @TonyMassarotti @adamjones985 @bigjimmurray something has gotta change...if you are a Celts fan in denial call 617-779-0985 it will be a good roast tomorrow


Boston Celtics classified as Los Angeles Lakers
Watching the game to see what Jeff van gundy says today           https://t.co/jjcRrwiQ2E
Doc Rivers se despede do Los Angeles Clippers. _ 📸 Getty Images  🗞 Henrique Cesar _    /21      https://t.co/oRKFz7TAV0 https://t.co/lvKZUQCrUv
Stopping Anthony Davis is the key to Stopping Lakers https://t.co/KiDxHU7VTi   




Let's look at mislabeled Denver Nuggets tweets.

In [None]:
# For each wrong prediction of interest, print up to three Denver Nuggets test tweets
# that the model assigned to that team instead
for wrong_team, heading in (
    ('MiamiHeat', "Denver Nuggets classified as Miami Heat"),
    ('LosAngelesLakers', "Denver Nuggets classified as Los Angeles Lakers"),
):
  print(heading)
  shown = 0
  for idx, actual in enumerate(test_labels):
    if actual == 'DenverNuggets' and labels_predicted[idx] == wrong_team:
      print(test_tweets[idx])
      shown += 1
    # stop scanning once three examples have been printed
    if shown == 3:
      break
  print()
  print()

Denver Nuggets classified as Miami Heat
Current mood to all these bull shit refs call phantom fouls.     https://t.co/hGnlxKpiHo
The  have taking control of the first half leaving the  dumbfounded. It's Halftime. Score 63 - 53 https://t.co/wSUgftjMLz
From my source 6.5 point TEASER 🙈🤷🏻‍♂️ DOLPHINS +9.5 OVER 42.5  BEARDS 🧔🏻&gt; MUSTACHES 👴🏼  CAPS MAKE IT SEEM SERIOUS SO DO THE HIEROGLYPHICS 💵      


Denver Nuggets classified as Los Angeles Lakers
What is your prediction for the Game 4? Comment your prediction!             https://t.co/MV9N0HEpVD
They really want the lakers to win this one lmao  
Halfway home: Lakers top Heat 124-114 for 2-0 Finals lead                 https://t.co/FKjlpriQG6




And finally, let's look at mislabeled Los Angeles Lakers tweets.

In [None]:
# For each wrong prediction of interest, print up to three Los Angeles Lakers test tweets
# that the model assigned to that team instead
for wrong_team, heading in (
    ('MiamiHeat', "Los Angeles Lakers classified as Miami Heat"),
    ('DenverNuggets', "Los Angeles Lakers classified as Denver Nuggets"),
):
  print(heading)
  shown = 0
  for idx, actual in enumerate(test_labels):
    if actual == 'LosAngelesLakers' and labels_predicted[idx] == wrong_team:
      print(test_tweets[idx])
      shown += 1
    # stop scanning once three examples have been printed
    if shown == 3:
      break
  print()
  print()

Los Angeles Lakers classified as Miami Heat
Chris Paul explains why he picks the Lakers to win the NBA Finals vs the Heat    https://t.co/Ea6i0zqswx
Lakers have not missed a shot in the 3rd quarter so far!   
"It’s night and day" how much better LeBron James now compared to 2011 https://t.co/q0M6bi9YyN     


Los Angeles Lakers classified as Denver Nuggets
LAKESHOW!!!! 💜💛 * *                 🏀 https://t.co/uxrnH3g6Ef
@hannah_kulik The 2020 NBA champions.  
   🏀  📺 Game 4   27  22 End Of Q1


