In [1]:
# Install if necessary
print('Installing packages')
!pip install datasets==1.18.3 
!pip install transformers[sentencepiece]==4.16.2 
!pip install tweet-preprocessor
!pip install pyspellchecker
!pip install optuna
!pip install autokeras

Installing packages
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==1.18.3
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 32.8 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 58.4 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 69.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.2 MB/s 
Installing collected packages: xxhash, multiprocess, huggingface-hub, datasets
Successfully installed datasets-1.18.3 huggingface-hub-0.11.1 multiprocess-0.70.14 xxhash-3.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/

In [2]:
!gdown 1U6CU4VonapD40kAkK3_X6LirG1_f8nim
!gdown 1gBqHkxh5Bg4O_VIEzrLO-mG9tkZ8uD_F

Downloading...
From: https://drive.google.com/uc?id=1U6CU4VonapD40kAkK3_X6LirG1_f8nim
To: /content/H1_Offensive_Language_Identification_test.csv
100% 132k/132k [00:00<00:00, 89.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1gBqHkxh5Bg4O_VIEzrLO-mG9tkZ8uD_F
To: /content/H1_Offensive_Language_Identification_train.csv
100% 1.83M/1.83M [00:00<00:00, 147MB/s]


<h2>Text Preprocessing and Dataset Preperation

In [2]:
import pandas as pd
tweets_df_train = pd.read_csv("/content/H1_Offensive_Language_Identification_train.csv")
tweets_df_test = pd.read_csv("/content/H1_Offensive_Language_Identification_test.csv")

In [3]:
import preprocessor as p
import numpy as np

# Remove numbers, emojis and &'s
p.set_options(p.OPT.NUMBER, p.OPT.EMOJI)

tweets_df_train_cleaned = (tweets_df_train
                           .assign(tweet=tweets_df_train["tweet"].apply(p.clean).str.replace("&", "and ").str[:512],
                                   label=np.where(tweets_df_train.label == "OFF", 0, 1) # Change OFF to 1 and NOT to 0
                 )
             )


tweets_df_test_cleaned = (tweets_df_test
             .assign(
                 tweet=tweets_df_test["tweet"].apply(p.clean).str.replace("&", "and ").str[:512],
                 )
             )

In [4]:
import re
import nltk
import string
import pandas as pd
from textblob import Word
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.remove("not")
stop_words.remove("no")
pattern = re.compile(r'\b(' + r'|'.join(stop_words) + r')\b\s*')

html = re.compile('<.*?>')
spell = SpellChecker()
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
def preprocess(text, stop_word_pattern = pattern, html_pattern = html):
    text = re.sub("([@][A-Za-z0-9_]+)|(\w+:\/\/\S+)", "", text)
    text = re.sub("\s+"," ",text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    text = re.sub(html_pattern, '', text)
    text = re.sub(r"URL", '', text)
    text = re.sub("\s+"," ",text)
    return text.lower()

In [6]:
cleaned = [preprocess(sent) for sent in tweets_df_train_cleaned["tweet"]]
tweets_df_train_cleaned["tweet"] = cleaned

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(tweets_df_train_cleaned, test_size=0.065, random_state=42)
print(df_train.shape, df_test.shape)

(12379, 3) (861, 3)


<h2>Inference on Models

In [25]:
# model = "fahad1247/hateBERT-finetuned"
# model = "fahad1247/english-abusive-MuRIL-finetuned"
# model = "fahad1247/distilroberta-finetuned-tweets-hate-speech-finetuned"
model = "fahad1247/twitter-roberta-base-hate-finetuned"
# model = "fahad1247/twitter-roberta-base-offensive-finetuned"

In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load Best model from checkpoint
best_model = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2)

# Set the device automatically (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
best_model.to(device)

Downloading:   0%|          | 0.00/906 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [27]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model)

Downloading:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

<h2>Dev set

In [28]:
from sklearn.metrics import classification_report, f1_score
y_true = df_test.label.to_list()
y_pred = []

# Iteratively evaluate the model and compute metrics
best_model.eval()
for tweet in df_test.tweet:
    inputs = tokenizer(tweet, padding=True, truncation=True, return_tensors="pt").to(device) # Move the tensor to the GPU
    with torch.no_grad():
        # Inference model and get logits
        outputs = best_model(**inputs)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    y_pred.extend(predictions.tolist())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [29]:
from sklearn.metrics import f1_score
print("Macrfo F1 Score : ", f1_score(y_true, y_pred, average='macro'))

Macrfo F1 Score :  0.7933988410178886


<h2>Test set

In [None]:
cleaned = [preprocess(sent) for sent in tweets_df_test_cleaned["tweet"]]
tweets_df_test_cleaned["tweet"] = cleaned

In [None]:
from sklearn.metrics import classification_report, f1_score
y_true = [] # Provide the labels of test set
y_pred = []

# Iteratively evaluate the model and compute metrics
best_model.eval()
for tweet in tweets_df_test_cleaned.tweet:
    inputs = tokenizer(tweet, padding=True, truncation=True, return_tensors="pt").to(device) # Move the tensor to the GPU
    with torch.no_grad():
        # Inference model and get logits
        outputs = best_model(**inputs)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    y_pred.extend(predictions.tolist())

In [None]:
tweets_df_test["label"] = y_pred
tweets_df_test.to_csv("final.csv")

In [None]:
print("Macrfo F1 Score : ", f1_score(y_true, y_pred, average='macro'))

<h2>Inference of Model from Neural Architecture Search

In [30]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.5.4-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.5.4


In [36]:
!gdown 1ohP4ySu8ud3qHVLrnZDNAdoCXhEWt7HH

Downloading...
From: https://drive.google.com/uc?id=1ohP4ySu8ud3qHVLrnZDNAdoCXhEWt7HH
To: /content/best_model.zip
100% 406M/406M [00:02<00:00, 137MB/s] 


In [37]:
!unzip /content/best_model.zip

Archive:  /content/best_model.zip
   creating: best_model/assets/
  inflating: best_model/keras_metadata.pb  
  inflating: best_model/saved_model.pb  
   creating: best_model/variables/
  inflating: best_model/variables/variables.data-00000-of-00001  
  inflating: best_model/variables/variables.index  


In [39]:
from tensorflow.keras.models import load_model
import tensorflow as tf
import autokeras as ak

# load the model from disk
loaded_model = load_model("best_model",
                          custom_objects = ak.CUSTOM_OBJECTS)

Downloading data from https://github.com/keras-team/autokeras/releases/download/1.0.13/bert_vocab.txt


<h2>Dev Test

In [40]:
x_test = np.array(df_test.tweet)
y_test = df_test.label.to_list()
# use loaded model for future predictions on new samples
y_hat = loaded_model.predict(x_test)



In [41]:
print("Macrfo F1 Score : ", f1_score(y_true, y_pred, average='macro'))

Macrfo F1 Score :  0.7933988410178886
