# This notebook extracts embeddings for tweets using AraBERT, then classifies the embeddings using an SVM classifier

In [1]:
!pip install bert-for-tf2

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m677.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py-params>=0.9.6 (from bert-for-tf2)
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting params-flow>=0.8.0 (from bert-for-tf2)
  Downloading params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30510 sha256=2d592da02ae9ebab10702b2d0973ae09550ac543201e38ca9816928b4d7cfe44
  Stored in directory: /root/.cache/pip/wheels/d8/da/50/126d7b8416d9a0e6bf876935c2219a71e72a6529c25e150c56
  Building wheel for params-flow (s

In [2]:
!pip install transformers
!pip install arabert

Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyArabic (from arabert)
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting farasapy (from arabert)
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Collecting emoji==1.4.2 (from arabert)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=186459 sha256=b9963fb9a199dd4d318cfdcb1b5f438a972a04

In [3]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [44]:
# Read the train and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Separate each split into its text column ("comment") and target column ("label").
train_texts, train_labels = train["comment"], train["label"]
test_texts, test_labels = test["comment"], test["label"]


In [3]:
# Hugging Face checkpoint used for both the encoder and its tokenizer
# (name suggests the Twitter variant of AraBERT v0.2 — confirm on the model card).
model_name = "aubmindlab/bert-base-arabertv02-twitter"

# The encoder and the tokenizer are loaded independently from the same checkpoint.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def get_embeddings(samples, device=None):
    """Return one embedding vector per input text.

    Each sample is tokenized and run through the module-level ``model`` on its
    own. The last-layer hidden state at position 0 (the ``[CLS]`` position for
    BERT-style tokenizers) is kept as that sample's embedding.

    Args:
        samples: Iterable of strings to embed.
        device: Optional torch device or device string to run the model on.
            When omitted, ``'cuda'`` is used if a GPU is available and
            ``'cpu'`` otherwise, so the function also works on CPU-only hosts.

    Returns:
        A 2D NumPy array with one row per sample. For empty input an array of
        shape ``(0, hidden_size)`` is returned instead of raising.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Move the model to the chosen device and make sure dropout is disabled,
    # so repeated calls give the same embeddings.
    model.to(device)
    model.eval()

    embeddings = []

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        for sample in samples:
            # Tokenize the text and move the tensors to the model's device.
            tokenized_text = tokenizer(
                sample, padding=True, truncation=True, return_tensors="pt"
            ).to(device)

            outputs = model(**tokenized_text)

            # NumPy cannot read GPU tensors, so copy to CPU before converting.
            sample_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(sample_embedding)

    if not embeddings:
        # np.vstack raises on an empty list; return an empty 2D array instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    return np.vstack(embeddings)  # Stack the (1, hidden) rows into a 2D array


In [46]:
# Build plain Python lists of the texts directly from the DataFrames.
# Deriving them from `train`/`test` (rather than rebinding train_texts from a
# Series to a list) keeps this cell idempotent: re-running it no longer fails
# with "'list' object has no attribute 'to_list'".
train_texts = train["comment"].to_list()
test_texts = test["comment"].to_list()

In [None]:
# Labels are used as-is for the classifier targets.
y_train, y_test = train_labels, test_labels

# Feature matrices: one embedding row per text, train split first, then test.
X_train, X_test = (get_embeddings(texts) for texts in (train_texts, test_texts))

In [None]:
# Fit a linear-kernel SVM on the training embeddings (fit returns the estimator).
svm_classifier = SVC(kernel='linear').fit(X_train, y_train)

# Predicted labels for the held-out test embeddings.
y_pred = svm_classifier.predict(X_test)

In [2]:
# Text summary of the test-set predictions compared against the true labels.
print(classification_report(y_true=y_test, y_pred=y_pred))