In [1]:
# Install if necessary
print('Installing packages')
!pip install datasets==1.18.3 
!pip install transformers[sentencepiece]==4.16.2 
!pip install tweet-preprocessor
!pip install pyspellchecker
!pip install optuna

Installing packages
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==1.18.3
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 6.6 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 50.9 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 40.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 23.5 MB/s 
Installing collected packages: xxhash, multiprocess, huggingface-hub, datasets
Successfully installed datasets-1.18.3 huggingface-hub-0.11.1 multiprocess-0.70.14 xxhash-3.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/c

In [2]:
!gdown 1U6CU4VonapD40kAkK3_X6LirG1_f8nim
!gdown 1gBqHkxh5Bg4O_VIEzrLO-mG9tkZ8uD_F

Downloading...
From: https://drive.google.com/uc?id=1U6CU4VonapD40kAkK3_X6LirG1_f8nim
To: /content/H1_Offensive_Language_Identification_test.csv
100% 132k/132k [00:00<00:00, 71.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1gBqHkxh5Bg4O_VIEzrLO-mG9tkZ8uD_F
To: /content/H1_Offensive_Language_Identification_train.csv
100% 1.83M/1.83M [00:00<00:00, 157MB/s]


In [3]:
model_checkpoint = "cardiffnlp/twitter-roberta-base-offensive"

batch_size = 8

<h2>Text Preprocessing and Dataset Preperation

In [4]:
import pandas as pd
tweets_df_train = pd.read_csv("/content/H1_Offensive_Language_Identification_train.csv")
tweets_df_test = pd.read_csv("/content/H1_Offensive_Language_Identification_test.csv")

In [5]:
import preprocessor as p
import numpy as np

# Remove numbers, emojis and &'s
p.set_options(p.OPT.NUMBER, p.OPT.EMOJI)

tweets_df_train_cleaned = (tweets_df_train
                           .assign(tweet=tweets_df_train["tweet"].apply(p.clean).str.replace("&", "and ").str[:512],
                                   label=np.where(tweets_df_train.label == "OFF", 0, 1) # Change OFF to 1 and NOT to 0
                 )
             )


tweets_df_test_cleaned = (tweets_df_test
             .assign(
                 tweet=tweets_df_test["tweet"].apply(p.clean).str.replace("&", "and ").str[:512],
                 )
             )

In [6]:
import re
import nltk
import string
import pandas as pd
from textblob import Word
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.remove("not")
stop_words.remove("no")
pattern = re.compile(r'\b(' + r'|'.join(stop_words) + r')\b\s*')

html = re.compile('<.*?>')
spell = SpellChecker()
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
def preprocess(text, stop_word_pattern = pattern, html_pattern = html):
    text = re.sub("([@][A-Za-z0-9_]+)|(\w+:\/\/\S+)", "", text)
    text = re.sub("\s+"," ",text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    text = re.sub(html_pattern, '', text)
    text = re.sub(r"URL", '', text)
    text = re.sub("\s+"," ",text)
    return text.lower()

In [8]:
cleaned = [preprocess(sent) for sent in tweets_df_train_cleaned["tweet"]]
tweets_df_train_cleaned["tweet"] = cleaned

In [9]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(tweets_df_train_cleaned, test_size=0.065, random_state=42)
print(df_train.shape, df_test.shape)

train_data = Dataset.from_pandas(df_train)
test_data = Dataset.from_pandas(df_test)
print(train_data)

(12379, 3) (861, 3)
Dataset({
    features: ['tweet', 'label', 'id', '__index_level_0__'],
    num_rows: 12379
})


In [10]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [11]:
def preprocess_function(examples):
    return tokenizer(examples["tweet"], truncation=True)

In [12]:
train_dataset = train_data.map(preprocess_function, batched=True)
test_dataset = test_data.map(preprocess_function, batched=True)



  0%|          | 0/13 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
from datasets import load_metric
# Load metric
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

Downloading:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [14]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

metric_name = "f1"
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
)

<h2>Training with user defined parameters

In [16]:
# Load model from checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# Set the device automatically (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model.to(device)

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [17]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

OSError: ignored

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub(f"{model_name}-finetuned")