In [12]:
import torch 
import lightning as ln
import torch.nn as nn 
import torch.nn.functional as Func 
import pytorch_lightning as pl 
import torchmetrics
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader

import os
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [7]:
# Location of the tweet CSV, resolved relative to the notebook's working directory.
train_path = 'cyberbullying_dataset.csv'
# Read the whole file into a DataFrame; later cells use its `tweet_text` and
# `cyberbullying_type` columns.
dataset = pd.read_csv(train_path)

In [8]:
# Class balance: number of tweets per label value.
print(dataset['cyberbullying_type'].value_counts())
print('#' * 20)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line after the report.
dataset.info()

cyberbullying_type
5    7998
0    7992
2    7973
1    7961
3    7945
4    7823
Name: count, dtype: int64
####################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_text          47692 non-null  object
 1   cyberbullying_type  47692 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 745.3+ KB
None


In [9]:
# Encoder that maps the distinct label values to consecutive integer codes.
le = LabelEncoder()

# NOTE(review): the recorded value_counts()/info() output shows this column already
# holds int64 labels 0-5, so this fit_transform looks like an identity mapping here —
# confirm whether the encoding step is still needed.
dataset['cyberbullying_type'] = le.fit_transform(dataset['cyberbullying_type'])
# Original label values, ordered so that position i is the value encoded as i.
classes  = le.classes_

print(classes)

[0 1 2 3 4 5]


In [10]:
# torchtext's built-in 'basic_english' tokenizer, kept at module level for ad-hoc use.
# TweetDataset builds its own instance of the same tokenizer in its constructor.
tokenizer = get_tokenizer('basic_english')



<function torchtext.data.utils._basic_english_normalize(line)>

In [11]:
# Accessing the data

class _FrameRows(Dataset):
    """Map-style dataset over a DataFrame that yields each row as a plain dict.

    A DataFrame cannot be handed to ``DataLoader`` directly: integer indexing on
    a DataFrame is a column lookup, not a row lookup. This wrapper indexes rows
    by position instead.
    """

    def __init__(self, frame, transform=None):
        self.frame = frame
        self.transform = transform

    def __len__(self):
        return len(self.frame)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        sample = self.frame.iloc[index].to_dict()
        # Apply the optional transform exactly once, on the dict form of the row.
        if self.transform:
            sample = self.transform(sample)
        return sample


class TweetDataset(Dataset):
    """Tweet classification data: CSV loading, train/val split, vocabulary and loaders.

    Call :meth:`setup` before indexing the dataset or requesting a DataLoader.

    Args:
        csv_file: Path to a CSV with ``tweet_text`` and ``cyberbullying_type`` columns.
        batch_size: Batch size used by every DataLoader this object creates.
        transform: Optional callable applied to each row dict before it is returned.

    Attributes set by ``setup``:
        data: The full DataFrame read from ``csv_file``.
        train_data / val_data / test_data: DataFrame splits.
        vocab: torchtext vocabulary built from the tokenized training tweets.
        pad_index: Vocabulary index of the padding token used by ``collate_fn``.
        input_size: Vocabulary size (number of embedding rows a model needs).

    NOTE(review): ``setup`` and the ``*_dataloader`` methods use the same names as
    Lightning's ``LightningDataModule`` hooks; the base class is kept as
    ``Dataset`` here so the public interface of this class does not change.
    """

    TEXT_COLUMN = 'tweet_text'
    LABEL_COLUMN = 'cyberbullying_type'
    UNK_TOKEN = '<UNK>'
    PAD_TOKEN = '<PAD>'

    def __init__(self, csv_file, batch_size=32, transform=None):
        super().__init__()
        self.csv_file = csv_file
        self.batch_size = batch_size
        self.transform = transform
        self.tokenizer = get_tokenizer('basic_english')

        # Everything below is populated by setup().
        self.vocab = None
        self.pad_index = None
        self.input_size = None
        self.data = None
        self.train_data = None
        self.val_data = None
        self.test_data = None

    def setup(self, stage=None):
        """Read the CSV, split it 80/20, and build the vocabulary from the training split.

        Args:
            stage: Accepted for call compatibility; not used.
        """
        data = pd.read_csv(self.csv_file)
        self.data = data

        self.train_data, self.val_data = train_test_split(data, test_size=0.2, random_state=42)
        # NOTE(review): the test split is the same object as the validation split, so
        # test metrics are not computed on held-out data — confirm this is intended.
        self.test_data = self.val_data

        # build_vocab_from_iterator expects an iterable of token lists; feeding it raw
        # strings would iterate characters and produce a character-level vocabulary.
        token_lists = (self.tokenizer(text) for text in self.train_data[self.TEXT_COLUMN])
        self.vocab = build_vocab_from_iterator(
            token_lists, specials=[self.UNK_TOKEN, self.PAD_TOKEN]
        )
        self.vocab.set_default_index(self.vocab[self.UNK_TOKEN])

        self.pad_index = self.vocab[self.PAD_TOKEN]
        self.input_size = len(self.vocab)

    def __len__(self):
        """Number of rows in the full dataset; 0 until ``setup`` has run."""
        return 0 if self.data is None else len(self.data)

    def __getitem__(self, index):
        """Return the row at position ``index`` of the full dataset as a dict.

        Raises:
            RuntimeError: If ``setup`` has not been called yet.
        """
        if self.data is None:
            raise RuntimeError('TweetDataset.setup() must be called before indexing.')
        return _FrameRows(self.data, self.transform)[index]

    def _make_loader(self, frame, shuffle):
        """Build a DataLoader over one DataFrame split."""
        if frame is None:
            raise RuntimeError('TweetDataset.setup() must be called before creating a DataLoader.')
        return DataLoader(
            _FrameRows(frame, self.transform),
            batch_size=self.batch_size,
            shuffle=shuffle,
            collate_fn=self.collate_fn,
        )

    def train_dataloader(self):
        """Shuffled DataLoader over the training split."""
        return self._make_loader(self.train_data, shuffle=True)

    def val_dataloader(self):
        """Unshuffled DataLoader over the validation split."""
        return self._make_loader(self.val_data, shuffle=False)

    def test_dataloader(self):
        """Unshuffled DataLoader over the test split."""
        return self._make_loader(self.test_data, shuffle=False)

    def collate_fn(self, batch):
        """Turn a list of row dicts into ``(token_ids, labels)`` tensors.

        Tweets have different token counts, so each sequence is right-padded with
        ``pad_index`` to the longest sequence in the batch.

        Returns:
            A ``(batch, max_len)`` long tensor of token ids and a ``(batch,)`` long
            tensor of labels.
        """
        sequences = []
        for row in batch:
            token_ids = self.vocab(self.tokenizer(row[self.TEXT_COLUMN]))
            # A tweet that tokenizes to nothing still contributes one padding token so
            # that no batch ends up with a zero-length sequence dimension.
            sequences.append(torch.tensor(token_ids or [self.pad_index], dtype=torch.long))

        text = torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=self.pad_index
        )
        label = torch.tensor([int(row[self.LABEL_COLUMN]) for row in batch], dtype=torch.long)

        return text, label


In [14]:
class TextClassifier(pl.LightningModule):
    """Mean-of-embeddings text classifier.

    Token ids are embedded, averaged over the sequence dimension, and passed
    through a two-layer MLP that produces one logit per class.

    Args:
        input_size: Number of rows in the embedding table (the vocabulary size).
        num_classes: Number of output classes.
        lr: Learning rate for the Adam optimizer.
    """

    def __init__(self, input_size, num_classes=6, lr=0.001):
        super().__init__()
        # Stores input_size / num_classes / lr under self.hparams.
        self.save_hyperparameters()
        self.embedding = nn.Embedding(input_size, 64)
        self.fc1 = nn.Linear(64, 64)
        self.fc2 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()

        # One metric object per stage so their running states never mix.
        # Current torchmetrics requires the task (and, for multiclass, the class count).
        self.train_acc = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)
        self.val_acc = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)
        self.test_acc = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, num_classes)`` logits."""
        x = self.embedding(x)
        # Average the token embeddings over the sequence dimension.
        x = x.mean(dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

    def _shared_step(self, batch, stage, metric):
        """Compute the loss for one batch, update ``metric`` and log both under ``stage``."""
        x, y = batch
        logits = self(x)
        loss = Func.cross_entropy(logits, y)
        preds = logits.argmax(dim=1)
        metric(preds, y)
        self.log(f'{stage}_loss', loss)
        self.log(f'{stage}_acc', metric)
        return loss

    def training_step(self, batch, batch_id):
        """Return the cross-entropy loss for optimisation; logs train_loss / train_acc."""
        return self._shared_step(batch, 'train', self.train_acc)

    def validation_step(self, batch, batch_id):
        """Log val_loss / val_acc for one validation batch."""
        self._shared_step(batch, 'val', self.val_acc)

    def test_step(self, batch, batch_idx):
        """Log test_loss / test_acc for one test batch."""
        self._shared_step(batch, 'test', self.test_acc)

    def configure_optimizers(self):
        """Adam over all parameters, using the learning rate saved in hparams."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        