<a href="https://colab.research.google.com/github/jonaidsharif/Stress-Detection-from-Social-Media-Articles/blob/main/Using_transformers_for_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class posEmbed(nn.Module):
    """Sum of a token embedding and a learned positional embedding, then dropout."""

    def __init__(self, v, d, max_length, dropout=.01):
        """
        - v: int
            - vocab size
        - d: int
            - embeddings dims
        - max_length: int
            - maximum sentence length used for pos embeddings
        - dropout: float
            - dropout rate (probability of zeroing an element during training)
        """
        super().__init__()

        # both layers are lookup tables that are indexed with integer ids
        # in the forward method below
        self.token_embed = nn.Embedding(v, d)
        self.pos_embed = nn.Embedding(max_length, d)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        - x: torch.Tensor
            - integer token ids of shape [batch_size, seq_len];
              seq_len must not exceed max_length

        Returns a tensor of shape [batch_size, seq_len, d].
        """
        batch_size, seq_len = x.shape
        # Position ids are created directly on the input's device, so the
        # module works wherever the model and its inputs live and does not
        # depend on a module-level `device` variable.
        pos = torch.arange(seq_len, device=x.device).expand(
            batch_size, seq_len
        )
        emb = self.token_embed(x) + self.pos_embed(pos)
        return self.dropout(emb)

In [None]:
class mhSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with an output projection."""

    def __init__(self, d, n_heads):
        """
        - d: int
            - embedding dims
        - n_heads: int
            - number of attention heads; d must be divisible by it
        """
        super().__init__()
        self.d = d
        self.n_heads = n_heads
        self.head_d = self.d // self.n_heads

        assert (self.n_heads*self.head_d == self.d), \
            'embed dims must be divisible by N heads'

        self.q_proj = nn.Linear(self.d, self.d, bias=False)
        self.v_proj = nn.Linear(self.d, self.d, bias=False)
        self.k_proj = nn.Linear(self.d, self.d, bias=False)

        # mixes the concatenated heads back into a single representation
        self.fc_out = nn.Linear(self.n_heads * self.head_d, self.d)

    def forward(self, x):
        """
        - x: torch.Tensor
            - input tensor of shape [batch_size, seq_len, embed_dims]

        Returns a tensor of shape [batch_size, seq_len, embed_dims].
        """

        batch_size, seq_len = x.shape[0], x.shape[1]
        per_head = (batch_size, seq_len, self.n_heads, self.head_d)

        # Split the projected features into heads and move the head axis
        # forward:
        #   Q, V -> [batch, heads, seq, head_d]
        #   K    -> [batch, heads, head_d, seq] (already transposed for Q @ K^T)
        Q = self.q_proj(x).reshape(per_head).permute(0, 2, 1, 3)
        K = self.k_proj(x).reshape(per_head).permute(0, 2, 3, 1)
        V = self.v_proj(x).reshape(per_head).permute(0, 2, 1, 3)

        # batched matrix multiplication per head -> [batch, heads, seq, seq]
        att_scores = torch.einsum('bijk,bikl -> bijl', Q, K)

        # Each dot product runs over head_d components, so the scores are
        # scaled by sqrt(head_d) (not sqrt(d)) before the softmax.
        att_matrix = torch.softmax(att_scores / self.head_d ** .5, dim=-1)
        att_out = torch.einsum('bijk,bikl -> bijl', att_matrix, V)

        # reverse transformation of multihead -> tensor of sequences of tokens
        concat_out = att_out.permute(0, 2, 1, 3).reshape(
            batch_size, seq_len, self.d
        )

        # apply the output projection that was defined in __init__
        return self.fc_out(concat_out)

In [None]:
class TransformerEncoder(nn.Module):
    """
    One encoder block: self-attention followed by a feed-forward network,
    each added back to its input and then layer-normalized.
    """

    def __init__(self, d, n_heads, gamma=1, dropout=.1, *args, **kwargs) -> None:
        """
        - d: int
            - embedding dimensions
        - n_heads: int
            - number of heads in multihead attention
        - gamma: int
            - width multiplier: the feed-forward hidden layer has d*gamma units
        - dropout: float
            - dropout rate applied to the attention output
        """
        super().__init__(*args, **kwargs)

        hidden_d = d * gamma

        self.att = mhSelfAttention(d, n_heads)

        # layer normalization with learnable scale and shift:
        # https://arxiv.org/pdf/1607.06450
        self.norm1 = nn.LayerNorm(d)
        self.norm2 = nn.LayerNorm(d)

        # GELU is used instead of ReLU, as in many recent models,
        # e.g. Gemma: https://arxiv.org/pdf/2403.08295
        self.ffn = nn.Sequential(
            nn.Linear(d, hidden_d),
            nn.GELU(),
            nn.Linear(hidden_d, d),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # attention sub-layer: residual connection, then normalization
        attended = self.dropout(self.att(x))
        normed = self.norm1(x + attended)
        # feed-forward sub-layer: residual connection, then normalization
        return self.norm2(normed + self.ffn(normed))

In [None]:
class attBinaryClassifier(nn.Module):
    """
    Binary text classifier: embeddings -> one encoder block -> max-pooling
    over the sequence -> a single output logit.
    """

    def __init__(self, vocab_size, max_seq_len, embed_dims,
                 n_heads, gamma, dropout, *args, **kwargs) -> None:
        """
        - vocab_size: int
            - possible number of tokens
        - max_seq_len: int
            - maximum number of tokens (size of the positional embedding)
        - embed_dims: int
            - embedding dimensions
        - n_heads: int
            - number of heads in MH attention
        - gamma: int
            - width multiplier of the encoder's feed-forward hidden layer
        - dropout: float
            - dropout rate passed to the encoder only; the embedding layer
            keeps its own default
        """

        super().__init__(*args, **kwargs)

        self.embedder = posEmbed(
            v=vocab_size, d=embed_dims, max_length=max_seq_len
        )
        self.encoder = TransformerEncoder(
            d=embed_dims, n_heads=n_heads, gamma=gamma, dropout=dropout
        )
        self.fc = nn.Linear(embed_dims, 1)

    def forward(self, x):
        encodings = self.encoder(self.embedder(x))
        # collapse the sequence axis by keeping the largest value per feature
        pooled = encodings.max(dim=1).values
        return self.fc(pooled)

In [None]:
from google.colab import drive
# mount Google Drive at /content/drive; the dataset path used below lives there
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import polars as pl

# The file is ';'-delimited and some of its rows carry more fields than the
# header, so the delimiter is given explicitly and over-long rows are
# truncated instead of failing the read.
ds_path = '/content/drive/MyDrive/Machine Learning/😰Stress Detection from Social Media Articles/Reddit_Combi.csv'
read_options = {
    'separator': ';',
    'truncate_ragged_lines': True,
}
df = pl.read_csv(ds_path, **read_options)
df.head(2)

title,body,Body_Title,label,Unnamed: 4_level_0
str,str,str,i64,str
"""Envy to other is swallowing me""","""Im from developingcountry, Ind…","""Envy to other is swallowing me…",1,
"""Nothin outta the ordinary. Par…","""Um hello ....well many can rel…","""Nothin outta the ordinary. Par…",1,


In [None]:
xFeature, yFeature = 'Body_Title', 'label'

# Drop rows where the text or the label is missing. This has to be done at
# the frame level: dropping values inside `with_columns` shortens a single
# column, which cannot be put back into the frame once anything is dropped.
# Both columns are non-float (str / i64), so there are no NaNs to handle.
df = df.drop_nulls(subset=[xFeature, yFeature])

# get rid of empty or extremely short posts
df = df.filter(pl.col(xFeature).str.len_chars() > 1)

# get rid of duplicates (note: every copy of a repeated post is removed)
df = df.filter(~(pl.col(xFeature).is_duplicated()))

# removing any links: `str.replace_all` substitutes the regex inside each
# string, whereas `Expr.replace` only swaps values that equal its first
# argument as a whole and therefore never touched the URLs
df = df.with_columns(
    pl.col(xFeature).str.replace_all(r'(https?://[^\s]+)', ' ')
)
X, y = df.select(xFeature).to_numpy(), df.select(yFeature).to_numpy()
X.shape, y.shape

((3123, 1), (3123, 1))

In [None]:
# class imbalance: share of records whose label is below 1 (i.e. labeled 0)

f'Just {round(y[y < 1].shape[0] / y.shape[0] * 100, 2)}% of records labeled as 0'

'Just 12.1% of records labeled as 0'

In [None]:
# many posts are very long: this is computationally costly for the Encoder, and truncating them to a fixed length may discard useful(?) information

import plotly.figure_factory as ff

# one series (the length of every post) with one matching label
dist_label = ['Dist of lengths']
dist_data = [list(map(len, X[:, 0]))]

fig = ff.create_distplot(hist_data=dist_data, group_labels=dist_label)
fig.show()

In [None]:
from torch.utils.data import Dataset
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

textual_params = {
    'vocab_size': 2**15,
    'embedding_dims': 2**9,
    'max_seq': 256
}

# WordPiece tokenizer; anything it cannot represent maps to [UNK]
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# normalization applied to the raw text, in this order:
# NFD - Unicode normalizer (e.g schön -> scho\u0308n)
# Lowercase - tokens to lowercase
# StripAccents - get rid of accents in tokens. Used in pair with NFD
normalization_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer.normalizer = normalizers.Sequence(normalization_steps)

# BERT-style pre-tokenization of the normalized text
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

# training the WordPiece tokenizer on the posts
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=textual_params['vocab_size'],
)
tokenizer.train_from_iterator(X, trainer=trainer)

In [None]:
# tokenization example: the same sentence as word pieces and as vocab ids

sample_sentence = "Fox jumps over a lazy dog"
encoding = tokenizer.encode(sample_sentence)
print(f'Tokenized string: {encoding.tokens}\n Token IDs: {encoding.ids}')

Tokenized string: ['fo', '##x', 'jumps', 'over', 'a', 'lazy', 'dog']
 Token IDs: [637, 112, 13349, 363, 41, 1661, 1918]


In [None]:
from torch.utils.data import Dataset
import torch

def pad_trunc_sequence(seq: list, max_len=None, pad_id=None) -> list:
    """
    Return a copy of `seq` that is exactly `max_len` items long.

    - seq: list
        - token ids; never modified in place
    - max_len: int or None
        - target length; defaults to textual_params['max_seq']
    - pad_id: int or None
        - id appended to short sequences; defaults to the id the tokenizer
        assigns to '[PAD]'

    Longer sequences are cut at `max_len`, shorter ones are right-padded.
    """
    if max_len is None:
        max_len = textual_params['max_seq']

    if len(seq) >= max_len:
        return seq[:max_len]

    if pad_id is None:
        # looked up once per call rather than once per padded position
        pad_id = tokenizer.encode('[PAD]').ids[0]

    # build a new list so the caller's sequence is left untouched
    return seq + [pad_id] * (max_len - len(seq))

class postsDataset(Dataset):
    """
    Dataset of tokenized, fixed-length posts paired with their labels.

    Every post is tokenized and padded/truncated once, at construction time.
    """

    def __init__(self, P, L):
        """
        - P: sequence of rows
            - the text of a post is read from the first element of each row
        - L: sequence of labels aligned with P
            - each entry must provide `.tolist()` (e.g. a row of a numpy array)

        Rows that fail to convert are reported and skipped.
        """
        posts = []
        labels = []
        for index, post in enumerate(P):
            # Convert both halves of the sample before storing either one, so
            # a failure can never leave posts and labels out of step.
            try:
                token_ids = pad_trunc_sequence(tokenizer.encode(post[0]).ids)
                label = L[index].tolist()
            except Exception as err:
                # best effort: report the offending row and carry on
                print(f'Raised an err ({err}) on sequence: {post}')
                continue
            posts.append(token_ids)
            labels.append(label)

        self.labels = torch.tensor(labels, dtype=torch.long)
        self.posts = torch.tensor(posts, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # (token ids, label) for the requested sample
        return (self.posts[index], self.labels[index])

In [None]:
# initialize the dataset (every post is tokenized and padded/truncated here)

ds = postsDataset(X, y)

# get the second item: a (token ids, label) pair
ds[1]

(tensor([14865,  5485,   146,  9570,    17, 19747,    17,   421,   276,    17,
           288,   276,    17,    11,    46, 18917,   712,   169,   834,  6763,
            16,  1735,  2208,  5048,  2272,   204,   569,    12,   666,   181,
          5752,  2257,    17,    17,    17,    17,   567,   620,   222,  2037,
           379,   802,    17,   439,   692,    49,    80,    53,  3102,    49,
            80,    53,    41, 19844,    17,  1492,  6545,  1097,   553,    49,
           221,   506,   585,   140,   237,   913,   337,   980,    17,    49,
            80,    53,   181,   151,   464,   140,   389,    41,    81,  6993,
          6876,    82,   314,    49,    80,    53,   202,   179, 20568,    17,
            49,    80,    53,  5277,   551,    17,   262,   346,  9661,   169,
           278,   306,   368,    49,    80,    53,   516,   488,   367,   306,
           330,   285,   314,    49,  2383,   551,   302,   814,    49,   275,
          2230,     8,   306,    80,   294,   516,  

In [None]:
from torch.utils.data import random_split

# 75% of the samples go to training, whatever is left goes to testing
n_samples = len(ds)
train_sample_size = int(.75 * n_samples)
test_sample_size = n_samples - train_sample_size
split_sizes = [train_sample_size, test_sample_size]
train, test = random_split(ds, split_sizes)

print(f'Train size: {len(train)}\n Test size: {len(test)}')

Train size: 2342
 Test size: 781


In [None]:
from torch.utils.data import DataLoader

batch_size = 2**4
loader_options = {'batch_size': batch_size, 'num_workers': 0}

# only the training data is reshuffled between epochs
train_dl = DataLoader(train, shuffle=True, **loader_options)
test_dl = DataLoader(test, shuffle=False, **loader_options)

# double checking if dataloaders match expected sizes of datasets:
# the number of batches is the number of full batches, plus at most one
for loader, subset in ((train_dl, train), (test_dl, test)):
    full_batches = len(subset) // batch_size
    assert len(loader) in (full_batches, full_batches + 1)

In [None]:
# hyper-parameters of the classifier
clf_params = {
    'embed_dims': 2**7,
    'n_heads': 2**3,
    'gamma': 3,
    'max_seq_len': 2**8,
    'vocab_size': 2**15,
    'dropout_rate': .01
}

# arguments are passed by name so the mapping to the constructor is explicit
classifier = attBinaryClassifier(
    vocab_size=clf_params['vocab_size'],
    max_seq_len=clf_params['max_seq_len'],
    embed_dims=clf_params['embed_dims'],
    n_heads=clf_params['n_heads'],
    gamma=clf_params['gamma'],
    dropout=clf_params['dropout_rate'],
)
classifier.to(device)

attBinaryClassifier(
  (embedder): posEmbed(
    (token_embed): Embedding(32768, 128)
    (pos_embed): Embedding(256, 128)
    (dropout): Dropout(p=0.01, inplace=False)
  )
  (encoder): TransformerEncoder(
    (att): mhSelfAttention(
      (q_proj): Linear(in_features=128, out_features=128, bias=False)
      (v_proj): Linear(in_features=128, out_features=128, bias=False)
      (k_proj): Linear(in_features=128, out_features=128, bias=False)
      (fc_out): Linear(in_features=128, out_features=128, bias=True)
    )
    (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (ffn): Sequential(
      (0): Linear(in_features=128, out_features=384, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=384, out_features=128, bias=True)
    )
    (dropout): Dropout(p=0.01, inplace=False)
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
# 'verbose' is the reporting period of the training loop, in epochs
trainParams = {
    'lr': .001,
    'max_epochs': 10,
    'verbose': 1
}

# the classifier outputs raw logits, so the sigmoid is left to the loss
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
optim = torch.optim.Adam(
    params=classifier.parameters(),
    lr=trainParams['lr'],
)

In [None]:
def bin_acc(preds, y):
    """
    Accuracy for binary classification.

    `preds` holds raw logits: they are passed through a sigmoid and rounded
    to 0/1 before being compared with the labels `y`. Returns the fraction
    of matching entries as a 0-d tensor.
    """
    predicted_labels = preds.sigmoid().round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [None]:
from tqdm import tqdm
import numpy as np

for epoch in range(trainParams['max_epochs']):
    losses, accuracies = [], []
    for encPosts, labels in tqdm(train_dl):
        optim.zero_grad()

        # flatten the labels so they line up with the 1-D logits
        labels = labels.flatten()
        targets = labels.float().to(device)
        out = classifier(encPosts.to(device)).squeeze(1)
        loss = criterion(out, targets)

        loss.backward()
        # prevent exploding gradients by clipping their norm to 1
        nn.utils.clip_grad_norm_(classifier.parameters(), 1)
        optim.step()

        # per-batch statistics for the epoch report
        losses.append(loss.item())
        accuracies.append(bin_acc(out.detach().cpu(), labels))

    is_report_epoch = epoch % trainParams['verbose'] == 0
    if is_report_epoch:
        print(f'Epoch {epoch} :: av. bin. cross entropy:', np.array(losses).mean(), ':: av. accuracy:', np.array(accuracies).mean())

100%|██████████| 147/147 [01:08<00:00,  2.13it/s]


Epoch 0 :: av. bin. cross entropy: 0.3709443273897074 :: av. accuracy: 0.875


100%|██████████| 147/147 [00:58<00:00,  2.53it/s]


Epoch 1 :: av. bin. cross entropy: 0.2745596881525046 :: av. accuracy: 0.89540815


100%|██████████| 147/147 [01:04<00:00,  2.29it/s]


Epoch 2 :: av. bin. cross entropy: 0.19153281661434746 :: av. accuracy: 0.9264456


100%|██████████| 147/147 [00:57<00:00,  2.55it/s]


Epoch 3 :: av. bin. cross entropy: 0.13038150299669934 :: av. accuracy: 0.95535713


100%|██████████| 147/147 [00:58<00:00,  2.51it/s]


Epoch 4 :: av. bin. cross entropy: 0.07422773813873175 :: av. accuracy: 0.9737812


100%|██████████| 147/147 [00:57<00:00,  2.56it/s]


Epoch 5 :: av. bin. cross entropy: 0.025038253316389664 :: av. accuracy: 0.9957483


100%|██████████| 147/147 [00:59<00:00,  2.49it/s]


Epoch 6 :: av. bin. cross entropy: 0.007879052247034803 :: av. accuracy: 1.0


100%|██████████| 147/147 [00:57<00:00,  2.57it/s]


Epoch 7 :: av. bin. cross entropy: 0.0028681592925904687 :: av. accuracy: 1.0


100%|██████████| 147/147 [00:59<00:00,  2.46it/s]


Epoch 8 :: av. bin. cross entropy: 0.0014337978738979404 :: av. accuracy: 1.0


100%|██████████| 147/147 [00:57<00:00,  2.57it/s]

Epoch 9 :: av. bin. cross entropy: 0.0009450614802795937 :: av. accuracy: 1.0





In [None]:
accuratePred = 0

# Dropout has to be switched off while measuring accuracy, otherwise the
# predictions are randomly perturbed. The previous mode is remembered and put
# back afterwards so later training cells behave as before.
was_training = classifier.training
classifier.eval()
try:
    with torch.no_grad():
        for (encPosts, labels) in test_dl:
            labels = labels.flatten()
            out = classifier(encPosts.to(device)).squeeze(1)
            rounded_preds = torch.round(torch.sigmoid(out))
            # count the hits as a plain int to avoid float32 accumulation
            accuratePred += (rounded_preds.cpu() == labels).sum().item()
finally:
    classifier.train(was_training)

print(f'Accuracy on the test set: {accuratePred / len(test)}')

Accuracy on the test set: 0.9142125248908997


In [None]:
def inference(publication: str):
    """
    Predict the class of a single post.

    - publication: str
        - raw text; it is tokenized and padded/truncated before being fed
        to the classifier

    Returns a one-element tensor holding the rounded sigmoid output
    (0. or 1.). The classifier runs in eval mode for the prediction and is
    returned to whatever mode it was in before the call, even on error.
    """
    was_training = classifier.training
    classifier.eval()
    try:
        tokenized_seq = torch.tensor(
            [pad_trunc_sequence(tokenizer.encode(publication).ids)]
        )
        with torch.no_grad():
            out = classifier(tokenized_seq.to(device)).squeeze(1)
            prediction = torch.round(torch.sigmoid(out))
    finally:
        # restore the caller's mode instead of forcing train mode
        classifier.train(was_training)
    return prediction

In [None]:
import numpy as np

# pick one post at random and show the model's prediction for it
candidate_rows = np.arange(0, X.shape[0])
row_idx = np.random.choice(candidate_rows)
sample = X[row_idx][0]
print('Sample:',sample,'\n')
print(f'Predicted class: {inference(sample).item()}')

Sample: I just want to be passionate about something again It's been so fucking long. I try to find a hobby, a goal, a fucking anything, but I just can't. Best case scenario I put a lot of effort into it for a week or two before I finally give up faking it. I mean I literally force myself to do things hoping I'll grow passionate about them, or at least enjoy them enough to look forward to doing them. But I can't, and I don't know how to make it better. 40 years old, my youngest kids are starting to get to the point where they don't need me as much anymore which is good for them. I fear the day tho when I don't even have them to give me some purpose. Living out the rest of my days as a functional vegetable. Pretty sure that's where I'm headed. /rant 

Predicted class: 1.0
