In [1]:
!pip install transformers
!pip install datasets



In [2]:
from transformers import RobertaConfig, RobertaModel
from transformers import RobertaTokenizer
import torch
import torch.nn as nn
from datasets import load_dataset
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

In [3]:
# loading tweets
dataset = load_dataset("tweets_hate_speech_detection")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1430.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=827.0, style=ProgressStyle(description_…

Using custom data configuration default



Downloading and preparing dataset tweets_hate_speech_detection/default (download: 2.96 MiB, generated: 3.04 MiB, post-processed: Unknown size, total: 6.00 MiB) to /root/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c32a982d8b2d6233065d820ac655454174f8aaa8faddc74979cf793486acd3b0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1276746.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset tweets_hate_speech_detection downloaded and prepared to /root/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c32a982d8b2d6233065d820ac655454174f8aaa8faddc74979cf793486acd3b0. Subsequent calls will reuse this data.


In [4]:
from sklearn.model_selection import train_test_split

# convertind dataset to the pandas dataframe + train_test_split 
def data_preprocessing(dataset):
    data = {'tweets': [], 'labels': []}
    for record in dataset['train']:
        data['tweets'].append(record['tweet'])
        data['labels'].append(record['label'])
    
    dataframe = pd.DataFrame(data=data)
    dataframe.reset_index(drop=True, inplace=True)
    train_tweets, test_tweets, train_labels, test_labels = train_test_split(dataframe['tweets'], 
                                                                            dataframe['labels'],
                                                                            test_size=0.15,
                                                                            random_state=42,
                                                                            shuffle=True)
    train_tweets = train_tweets.reset_index(drop=True)
    test_tweets = test_tweets.reset_index(drop=True)
    train_labels = train_labels.reset_index(drop=True)
    test_labels = test_labels.reset_index(drop=True)

    return (
        train_tweets,
        test_tweets, 
        train_labels, 
        test_labels
    )

train_tweets, test_tweets, train_labels, test_labels = data_preprocessing(dataset)
print(train_tweets.shape)
print(test_tweets.shape)

(27167,)
(4795,)


In [5]:
import re

class Dataset():
    def __init__(self, tweet, label):
        self.tweet = tweet
        self.label = label
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-large', add_prefix_space=True)
        self.max_len = 140

    def __len__(self):
        return len(self.tweet)
      
    def _process_tweet(self, tweet):

        cleaned_tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet) # remove all @mentions
        cleaned_tweet = re.sub(r'[^A-Za-z0-9, ]', '', cleaned_tweet)

        encoding = self.tokenizer.encode(cleaned_tweet)
        mask = [1] * len(encoding)

        padding_length = self.max_len - len(encoding)
        if padding_length > 0:
            encoding = encoding + ([1] * padding_length)
            mask = mask + ([0] * padding_length)
        return encoding, mask

    def __getitem__(self, item):
        encoding, mask = self._process_tweet(self.tweet[item])
        return { 'tweet_ids': torch.tensor(encoding, dtype=torch.long),
                 'mask': torch.tensor(mask, dtype=torch.long),
                 'label_ids': torch.tensor(self.label[item], dtype=torch.float64) 
        }

In [6]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

        config = RobertaConfig.from_pretrained('roberta-large')    
        self.model = RobertaModel.from_pretrained('roberta-large', config=config)

        # for param in self.model.parameters():
        #     param.requires_grad = False

        self.l0 = nn.Linear(config.hidden_size, 1)
        self.act = nn.Sigmoid()
        # torch.nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, ids, mask):
        # return the hidden states from the BERT backbone
        out = self.model(ids, attention_mask=mask)
        logits = self.l0(out[-1])
        act = self.act(logits)

        return act

In [7]:
train_dataset = Dataset(train_tweets, train_labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [8]:
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=20,
    num_workers=2
)

In [9]:
validation_dataset = Dataset(test_tweets, test_labels)

In [10]:
test_data_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=20,
    num_workers=2
)

In [11]:
model = Model()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




In [12]:
def train_model(model, train_iter, val_iter, optim, loss, num_epochs, batch_size=20):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    for epoch in range(num_epochs):
      
        model.train()
        train_epoch_loss = []
        train_epoch_acc = []
        val_epoch_loss = []
        val_epoch_acc = []
        
        for idx, batch in enumerate(tqdm(train_iter)):
            tweet_ids = batch['tweet_ids'].to(device)
            mask = batch['mask'].to(device)
            labels = batch['label_ids'].to(dtype=float).to(device)

            optim.zero_grad()
            prediction = model(tweet_ids, mask).squeeze().to(dtype=float)
            # prediction = torch.softmax(prediction, dim=0)

            output = (prediction>0.2).float()
            print(prediction)
            print(labels)

            loss_train = loss(prediction, labels)
            loss_train.backward()

            num_corrects = (labels == output).float().sum()

            acc = 100.0 * num_corrects / batch_size
            print(acc)
            train_epoch_loss.append(loss_train.item())
            train_epoch_acc.append(acc.item())

            optim.step()

        print(f'Train Epoch: {epoch}, Training Loss: {np.mean(train_epoch_loss):.4f}, Training Accuracy: {np.mean(train_epoch_acc): .2f}%')
            
        model.eval()

        with torch.no_grad():
            _predictions = []
            _labels = []
            for idx, batch in enumerate(tqdm(val_iter)):

                tweet_ids = batch['tweet_ids'].to(device)
                mask = batch['mask'].to(device)
                labels = batch['label_ids'].to(dtype=float).to(device)

                prediction = model(tweet_ids, mask).squeeze().to(dtype=float)
                loss_val = loss(prediction, labels)

                _predictions += prediction
                _labels += labels

                output = (prediction>0.2).float()
                num_corrects = (labels == output).float().sum()

                acc = 100.0 * num_corrects / batch_size
                val_epoch_loss.append(loss_val.item())
                val_epoch_acc.append(acc.item())
            
            np.save('current.npy', {'predictions': _predictions, 'labels': _labels}) 

            print(f'Vadlidation Epoch: {epoch}, Vadlidation Loss: {np.mean(val_epoch_loss):.4f}, Vadlidation Accuracy: {np.mean(val_epoch_acc): .2f}%')



device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
num_epochs = 1
lr = 1e-4

model.to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)
loss = nn.BCELoss()

train_model(model, train_data_loader, test_data_loader, optim, loss, num_epochs)

HBox(children=(FloatProgress(value=0.0, max=1359.0), HTML(value='')))

tensor([0.4498, 0.4477, 0.4508, 0.5085, 0.5817, 0.4514, 0.4496, 0.4529, 0.4446,
        0.4477, 0.4529, 0.4800, 0.4482, 0.4581, 0.4504, 0.4522, 0.4517, 0.4519,
        0.4475, 0.4541], dtype=torch.float64, grad_fn=<CopyBackwards>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0.], dtype=torch.float64)
tensor(5.)


RuntimeError: ignored

In [None]:
loss = nn.CrossEntropyLoss()

for record in train_data_loader:
    print(record['label_ids'])
    prediction = model(record['tweet_ids'], record['mask']).squeeze().to(dtype=float)
    print(prediction)
    record = record['label_ids'].to(dtype=float)
    loss_train = loss(prediction, record)
    print(loss_train)

tensor([0., 0.], dtype=torch.float64)
tensor([0.6005, 0.5993], dtype=torch.float64, grad_fn=<CopyBackwards>)
tensor(0.9160, dtype=torch.float64, grad_fn=<BinaryCrossEntropyBackward>)
tensor([0., 0.], dtype=torch.float64)
tensor([0.6042, 0.5977], dtype=torch.float64, grad_fn=<CopyBackwards>)
tensor(0.9188, dtype=torch.float64, grad_fn=<BinaryCrossEntropyBackward>)
tensor([0., 0.], dtype=torch.float64)
tensor([0.5975, 0.6019], dtype=torch.float64, grad_fn=<CopyBackwards>)
tensor(0.9155, dtype=torch.float64, grad_fn=<BinaryCrossEntropyBackward>)
tensor([0., 0.], dtype=torch.float64)
tensor([0.6051, 0.6033], dtype=torch.float64, grad_fn=<CopyBackwards>)
tensor(0.9268, dtype=torch.float64, grad_fn=<BinaryCrossEntropyBackward>)
tensor([0., 0.], dtype=torch.float64)
tensor([0.5979, 0.5971], dtype=torch.float64, grad_fn=<CopyBackwards>)
tensor(0.9101, dtype=torch.float64, grad_fn=<BinaryCrossEntropyBackward>)
tensor([0., 0.], dtype=torch.float64)
tensor([0.5992, 0.5954], dtype=torch.float64, g

KeyboardInterrupt: ignored

In [21]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
loss = nn.BCELoss()

def evaluation(model, test_data_loader):
    model.eval()
    val_epoch_loss = []
    val_epoch_acc = []
    predictions = []
    targets = []
    with torch.no_grad():
        for idx, batch in enumerate(tqdm(test_data_loader)):

            batch_size = len(batch['tweet_ids'])

            tweet_ids = batch['tweet_ids'].to(device)
            mask = batch['mask'].to(device)
            labels = batch['label_ids'].to(dtype=float).to(device)

            prediction = model(tweet_ids, mask).squeeze().to(dtype=float)
            loss_val = loss(prediction, labels)

            predictions += prediction
            targets += labels

            output = (prediction>0.2).float()
            num_corrects = (labels == output).float().sum()

            acc = 100.0 * num_corrects / batch_size
            val_epoch_loss.append(loss_val.item())
            val_epoch_acc.append(acc.item())
    return {'predictions': predictions, 'targets': targets, 'val_epoch_loss': val_epoch_loss, 'val_epoch_acc': val_epoch_acc}
        # print(f'Vadlidation Epoch: {epoch}, Vadlidation Loss: {np.mean(val_epoch_loss):.4f}, Vadlidation Accuracy: {np.mean(val_epoch_acc): .2f}%')


In [23]:
result = evaluation(model, test_data_loader)

HBox(children=(FloatProgress(value=0.0, max=240.0), HTML(value='')))

KeyboardInterrupt: ignored

In [None]:
data = {'tweets': [], 'labels': []}
for record in dataset['train']:
    data['tweets'].append(record['tweet'])
    data['labels'].append(record['label'])

dataframe = pd.DataFrame(data=data)

maxx = 0
tweet = ''
for t in dataframe['tweets']:
    t = re.sub(r'@[A-Za-z0-9]+', '', t) # remove all @mentions
    t = re.sub(r'[^A-Za-z0-9, ]', '', t)
    if len(t) > maxx:
        maxx = len(t)
        tweet = t
print(maxx)
print(tweet)

139
we live in a world where people complain amp post things to justify their laziness instead of getting up amp doing something to change it  


In [24]:
print(result)

{'predictions': [tensor(0.4407, dtype=torch.float64), tensor(0.4343, dtype=torch.float64), tensor(0.4374, dtype=torch.float64), tensor(0.4416, dtype=torch.float64), tensor(0.4407, dtype=torch.float64), tensor(0.4357, dtype=torch.float64), tensor(0.4440, dtype=torch.float64), tensor(0.4378, dtype=torch.float64), tensor(0.4400, dtype=torch.float64), tensor(0.4376, dtype=torch.float64), tensor(0.4338, dtype=torch.float64), tensor(0.4369, dtype=torch.float64), tensor(0.4316, dtype=torch.float64), tensor(0.4380, dtype=torch.float64), tensor(0.4404, dtype=torch.float64), tensor(0.4350, dtype=torch.float64), tensor(0.4367, dtype=torch.float64), tensor(0.4376, dtype=torch.float64), tensor(0.4359, dtype=torch.float64), tensor(0.4357, dtype=torch.float64), tensor(0.4347, dtype=torch.float64), tensor(0.4390, dtype=torch.float64), tensor(0.4354, dtype=torch.float64), tensor(0.4427, dtype=torch.float64), tensor(0.4379, dtype=torch.float64), tensor(0.4368, dtype=torch.float64), tensor(0.4344, dtype=

In [25]:
np.save('my_file.npy', result) 

In [1]:
!pip install numpy==1.16.1
import numpy as np
from sklearn.metrics import precision_recall_curve

result = np.load('my_file.npy')



In [2]:
predictions = [ i.item() for i in result.item(0)['predictions'] ]
labels = [ i.item() for i in result.item(0)['targets'] ]

In [3]:
print(len(labels))
print(len(predictions))

4795
4795


In [70]:
precision, recall, thresholds = precision_recall_curve(labels, updated_predictions)