In [1]:
# Install if necessary
print('Installing packages')
!pip install datasets==1.18.3 
!pip install transformers[sentencepiece]==4.16.2 
!pip install tweet-preprocessor
!pip install pyspellchecker
!pip install optuna

Installing packages
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==1.18.3
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 6.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 67.9 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 74.4 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.9 MB/s 
Installing collected packages: xxhash, multiprocess, huggingface-hub, datasets
Successfully installed datasets-1.18.3 huggingface-hub-0.11.1 multiprocess-0.70.14 xxhash-3.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-w

In [2]:
!gdown 1U6CU4VonapD40kAkK3_X6LirG1_f8nim
!gdown 1gBqHkxh5Bg4O_VIEzrLO-mG9tkZ8uD_F

Downloading...
From: https://drive.google.com/uc?id=1U6CU4VonapD40kAkK3_X6LirG1_f8nim
To: /content/H1_Offensive_Language_Identification_test.csv
100% 132k/132k [00:00<00:00, 105MB/s]
Downloading...
From: https://drive.google.com/uc?id=1gBqHkxh5Bg4O_VIEzrLO-mG9tkZ8uD_F
To: /content/H1_Offensive_Language_Identification_train.csv
100% 1.83M/1.83M [00:00<00:00, 167MB/s]


<h2>Text Preprocessing and Dataset Preperation

In [29]:
import pandas as pd
tweets_df_train = pd.read_csv("/content/H1_Offensive_Language_Identification_train.csv")
tweets_df_test = pd.read_csv("/content/H1_Offensive_Language_Identification_test.csv")

In [30]:
import preprocessor as p
import numpy as np

# Remove numbers, emojis and &'s
p.set_options(p.OPT.NUMBER, p.OPT.EMOJI)

tweets_df_train_cleaned = (tweets_df_train
                           .assign(tweet=tweets_df_train["tweet"].apply(p.clean).str.replace("&", "and ").str[:512],
                                   label=np.where(tweets_df_train.label == "OFF", 0, 1) # Change OFF to 1 and NOT to 0
                 )
             )


tweets_df_test_cleaned = (tweets_df_test
             .assign(
                 tweet=tweets_df_test["tweet"].apply(p.clean).str.replace("&", "and ").str[:512],
                 )
             )

In [5]:
import re
import nltk
import string
import pandas as pd
from textblob import Word
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.remove("not")
stop_words.remove("no")
pattern = re.compile(r'\b(' + r'|'.join(stop_words) + r')\b\s*')

html = re.compile('<.*?>')
spell = SpellChecker()
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [6]:
def preprocess(text, stop_word_pattern = pattern, html_pattern = html):
    text = re.sub("([@][A-Za-z0-9_]+)|(\w+:\/\/\S+)", "", text)
    text = re.sub("\s+"," ",text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    text = re.sub(html_pattern, '', text)
    text = re.sub(r"URL", '', text)
    text = re.sub("\s+"," ",text)
    return text.lower()

In [31]:
cleaned = [preprocess(sent) for sent in tweets_df_train_cleaned["tweet"]]
tweets_df_train_cleaned["tweet"] = cleaned

In [32]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(tweets_df_train_cleaned, test_size=0.065, random_state=42)
print(df_train.shape, df_test.shape)

(12379, 3) (861, 3)


<h2>Ensemble

In [9]:
m_name1 = "fahad1247/hateBERT-finetuned"
m_name2 = "fahad1247/english-abusive-MuRIL-finetuned"
m_name3 = "fahad1247/distilroberta-finetuned-tweets-hate-speech-finetuned"
m_name4 = "fahad1247/twitter-roberta-base-hate-finetuned"
m_name5 = "fahad1247/twitter-roberta-base-offensive-finetuned"

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load Best model from checkpoint
model1 = AutoModelForSequenceClassification.from_pretrained(m_name1, num_labels=2)
model2 = AutoModelForSequenceClassification.from_pretrained(m_name2, num_labels=2)
model3 = AutoModelForSequenceClassification.from_pretrained(m_name3, num_labels=2)
model4 = AutoModelForSequenceClassification.from_pretrained(m_name4, num_labels=2)
model5 = AutoModelForSequenceClassification.from_pretrained(m_name5, num_labels=2)


# Set the device automatically (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model1.to(device)
model2.to(device)
model3.to(device)
model4.to(device)
model5.to(device)

Downloading:   0%|          | 0.00/733 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/740 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/906M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/807 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/906 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/931 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [11]:
from transformers import AutoTokenizer
tokenizer1 = AutoTokenizer.from_pretrained(m_name1)
tokenizer2 = AutoTokenizer.from_pretrained(m_name2)
tokenizer3 = AutoTokenizer.from_pretrained(m_name3)
tokenizer4 = AutoTokenizer.from_pretrained(m_name4)
tokenizer5 = AutoTokenizer.from_pretrained(m_name5)

Downloading:   0%|          | 0.00/523 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/695k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/552 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

<h2>Training

In [12]:
def helper(df):
  X = []


  # Iteratively evaluate the model and compute metrics
  model1.eval()
  model2.eval()
  model3.eval()
  model4.eval()
  model5.eval()

  for tweet in df.tweet:
      row = []
      input1 = tokenizer1(tweet, padding=True, truncation=True, return_tensors="pt").to(device) # Move the tensor to the GPU
      input2 = tokenizer2(tweet, padding=True, truncation=True, return_tensors="pt").to(device) # Move the tensor to the GPU
      input3 = tokenizer3(tweet, padding=True, truncation=True, return_tensors="pt").to(device) # Move the tensor to the GPU
      input4 = tokenizer4(tweet, padding=True, truncation=True, return_tensors="pt").to(device) # Move the tensor to the GPU
      input5 = tokenizer5(tweet, padding=True, truncation=True, return_tensors="pt").to(device) # Move the tensor to the GPU

      with torch.no_grad():
          # Inference model and get logits
          output1 = model1(**input1)
          output2 = model2(**input1)
          output3 = model3(**input1)
          output4 = model4(**input1)
          output5 = model5(**input1)


      logits1 = output1.logits
      logits2 = output2.logits
      logits3 = output3.logits
      logits4 = output4.logits
      logits5 = output5.logits

      prediction1 = torch.argmax(logits1, dim=-1)
      prediction2 = torch.argmax(logits2, dim=-1)
      prediction3 = torch.argmax(logits3, dim=-1)
      prediction4 = torch.argmax(logits4, dim=-1)
      prediction5 = torch.argmax(logits5, dim=-1)


      row.extend(prediction1.tolist())
      row.extend(prediction2.tolist())
      row.extend(prediction3.tolist())
      row.extend(prediction4.tolist())
      row.extend(prediction5.tolist())
      X.append(row)
  return X

In [13]:
X_train = helper(df_train)
X_test = helper(df_test)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(C = 0.05, gamma='auto', kernel = 'poly'))

In [33]:
from sklearn.metrics import f1_score
y_true = df_train.label.to_list()
clf.fit(X_train, y_true)
y_pred = clf.predict(X_train)
print("Macro F1 Score : ", f1_score(y_true, y_pred, average='macro'))

Macro F1 Score :  0.85103098555713


In [34]:
y_true = df_test.label.to_list()
y_pred = clf.predict(X_test)
print("Macro F1 Score : ", f1_score(y_true, y_pred, average='macro'))

Macro F1 Score :  0.764808362369338


<h2>Test set

In [35]:
cleaned = [preprocess(sent) for sent in tweets_df_test_cleaned["tweet"]]
tweets_df_test_cleaned["tweet"] = cleaned

In [36]:
X = helper(tweets_df_test_cleaned)
y_pred = clf.predict(X)

In [37]:
tweets_df_test["label"] = y_pred
tweets_df_test.to_csv("final.csv")

In [39]:
y_true = tweets_df_test_cleaned.label.to_list() # To be Provided
print("Macro F1 Score : ", f1_score(y_true, y_pred, average='macro'))