In [None]:
!pip install transformers
!pip install pytorch-transformers

import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [None]:
import torch
import torch.cuda
from google.colab import drive
# Mount Google Drive; the dataset paths used later live under /content/drive.
drive.mount('/content/drive')

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using {device.type}")

Mounted at /content/drive
Using cuda


In [None]:
import pandas as pd
import torch
import numpy as np

from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader
import warnings
# Suppress every FutureWarning raised from this point on.
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# String label -> class id, read by preprocess() for the `task_1` column.
labels = dict(NOT=0, HOF=1)

# String label -> class id, read by pre2() for the `subtask_a` column.
labels2 = dict(NOT=0, OFF=1)

def _clean_tweets(tweets):
    """Return a lower-cased, normalised copy of a Series of tweet strings.

    The substitutions run in an order that lets every pattern see the text it
    was written for: URLs and e-mail addresses are removed while their
    punctuation is still present, and only then are mentions, punctuation,
    digits and non-ASCII characters handled.
    """
    steps = (
        (r'https://t.co/[a-zA-Z0-9]+', ''),                                   # t.co links
        (r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', ''),                             # e-mail addresses
        (r'(@\S+)', r' '),                                                    # @mentions
        (r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', ''),           # IP addresses
        (r'[^\w\s]', ''),                                                     # special characters
        (r'\d', ''),                                                          # digits
        (r'[^\x00-\x7F]+', r' '),                                             # non-ASCII runs
    )
    cleaned = tweets.str.lower()
    for pattern, replacement in steps:
        # regex=True is explicit: the default of Series.str.replace differs
        # between pandas versions, and these are all regular expressions.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    # The former '@[a-zA-Z0-9-_]+' -> '@USER' substitution is omitted: it ran
    # after the mention removal, so no '@' token was ever left for it to match.
    return cleaned


def preprocess(data, columns):
    """Clean and tokenize the tweets in ``data`` and return one row per tweet.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``tweet`` column (text) and a ``task_1`` column whose values
        are keys of the module-level ``labels`` mapping. The frame is read
        only; it is not modified.
    columns : list of str
        Column names (and order) of the returned frame.

    Returns
    -------
    pandas.DataFrame
        Rows carry ``index`` (running position starting at 0), ``Tweet``
        (cleaned text), ``Token_id`` (the tokenizer output for that text) and
        ``Class`` (integer label). Columns named in ``columns`` come first.

    Raises
    ------
    KeyError
        If a ``task_1`` value is not a key of ``labels``.

    Notes
    -----
    Uses the module-level ``tokenizer`` and ``labels`` objects.
    """
    cleaned = _clean_tweets(data['tweet'])

    # Collect plain records and build the frame once; appending to a
    # DataFrame row by row copies the whole frame on every iteration.
    records = [
        {
            "index": position,
            "Class": labels[task_label],
            "Tweet": text,
            'Token_id': tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors='pt'),
        }
        for position, (text, task_label) in enumerate(zip(cleaned, data['task_1']))
    ]

    # Requested columns first; record keys missing from `columns` are kept at
    # the end so that no produced field is silently dropped.
    ordered = list(columns) + [
        key for key in ("index", "Class", "Tweet", "Token_id") if key not in columns
    ]
    return pd.DataFrame(records, columns=ordered)

def pre2(data, columns):
    """Tokenize the tweets in ``data`` as-is and return one row per tweet.

    Unlike ``preprocess`` no text cleaning is applied; the ``tweet`` text is
    passed to the tokenizer unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``tweet`` column (text) and a ``subtask_a`` column whose
        values are keys of the module-level ``labels2`` mapping.
    columns : list of str
        Column names (and order) of the returned frame.

    Returns
    -------
    pandas.DataFrame
        Rows carry ``index`` (running position starting at 0), ``Tweet``,
        ``Token_id`` (the tokenizer output for that text) and ``Class``
        (integer label). Columns named in ``columns`` come first.

    Raises
    ------
    KeyError
        If a ``subtask_a`` value is not a key of ``labels2``.

    Notes
    -----
    Uses the module-level ``tokenizer`` and ``labels2`` objects.
    """
    # Collect plain records and build the frame once; appending to a
    # DataFrame row by row copies the whole frame on every iteration.
    records = [
        {
            "index": position,
            "Class": labels2[subtask_label],
            "Tweet": text,
            'Token_id': tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors='pt'),
        }
        for position, (text, subtask_label) in enumerate(zip(data['tweet'], data['subtask_a']))
    ]

    # Requested columns first; record keys missing from `columns` are kept at
    # the end so that no produced field is silently dropped.
    ordered = list(columns) + [
        key for key in ("index", "Class", "Tweet", "Token_id") if key not in columns
    ]
    return pd.DataFrame(records, columns=ordered)

In [None]:

# Column layout of the frames produced by the preprocessing helpers.
columns = ['index', 'Tweet', 'Token_id', 'Class']

# Fast BERT tokenizer for the bert-base-uncased checkpoint; the preprocessing
# helpers read it as a module-level name.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# HASOC English dataset, stored as a tab-separated file on the mounted drive.
data1 = pd.read_csv(
    '/content/drive/MyDrive/deepNN-labs/data/HASOCData/english_dataset.tsv',
    sep='\t',
)
data = preprocess(data1, columns)

# The HASOC test file is currently not loaded:
#data2 = pd.read_table('/content/drive/MyDrive/deepNN-labs/data/HASOCData/hasoc2019_en_test-2919.tsv')
#dataT = preprocess(data2, columns)


In [None]:
# OLID train.txt, stored as a tab-separated file on the mounted drive.
tex = pd.read_csv(
    '/content/drive/MyDrive/deepNN-labs/data/olid_pre/train.txt',
    sep='\t',
)

# NOTE: this rebinds `data`, so the frame built from the HASOC file in the
# previous cell is replaced by the OLID frame from here on.
data = pre2(tex, columns)

In [None]:


# Load bert-base-uncased with a two-label sequence-classification head and
# place it on the selected device in one step.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
).to(device)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# 80/20 train/validation split over the rows of `data`.
train_samp = int(len(data) * 0.8)
val_samp = len(data) - train_samp

# A dedicated generator object is used for the shuffle instead of the global RNG.
# NOTE(review): the generator is never explicitly seeded - confirm the split is
# reproducible in the way you intend.
train, val = torch.utils.data.random_split(
    dataset=data,
    lengths=[train_samp, val_samp],
    generator=torch.Generator(),
)

# NOTE(review): `train` and `val` wrap the DataFrame itself, and indexing a
# DataFrame with an integer looks up a column label, so iterating these two
# loaders is expected to fail. Only `train.indices` / `val.indices` appear to
# be used further down - confirm before relying on these loaders.
dataloader = DataLoader(dataset=train, shuffle=True, batch_size=32)
val_loader = DataLoader(dataset=val, shuffle=True, batch_size=32)

In [None]:
from torch.utils.data import Dataset

from torch.nn.utils.rnn import pad_sequence

class TweetDataset(Dataset):
    """Map-style dataset pairing pre-tokenized tweets with integer class labels.

    Parameters
    ----------
    tweets : sequence
        One tokenizer output per tweet. ``collate_fn`` reads the
        ``'input_ids'`` and ``'attention_mask'`` entries of each item, which
        must be tensors.
    labels : sequence of int
        Class id of each tweet, aligned with ``tweets``.
    """

    def __init__(self, tweets, labels):
        self.tweets = tweets
        self.labels = labels

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return ``(tokenized_tweet, label)`` with the label as a long tensor."""
        tweet = self.tweets[idx]
        label = self.labels[idx]
        return tweet, torch.tensor(label, dtype=torch.long)

    @staticmethod
    def collate_fn(batch):
        """Pad a list of ``(tokenized_tweet, label)`` pairs into batch tensors.

        Returns a dict with ``input_ids`` and ``attention_mask`` (both padded
        with 0 to the longest sequence in the batch, batch dimension first)
        and ``labels`` (the stacked label tensors).
        """
        tweets, labels = zip(*batch)

        def as_sequence(tensor):
            # Drop only the leading batch dimension. A bare squeeze() would
            # also collapse a length-1 sequence to a 0-d tensor, which
            # pad_sequence cannot handle.
            return tensor.squeeze(0) if tensor.dim() > 1 else tensor

        input_ids = [as_sequence(t['input_ids']) for t in tweets]
        attention_mask = [as_sequence(t['attention_mask']) for t in tweets]

        # Pad every sequence to the length of the longest one in this batch.
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
        attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.stack(labels)
        }




In [None]:
# Training pipeline: rows selected by the random split -> dataset -> shuffled loader.
train_df = data.iloc[train.indices]
train_dataset = TweetDataset(
    tweets=train_df['Token_id'].tolist(),
    labels=train_df['Class'].tolist(),
)
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=TweetDataset.collate_fn,
    batch_size=32,
    shuffle=True,
)

# Validation pipeline: same construction, but batches keep their order.
val_df = data.iloc[val.indices]
val_dataset = TweetDataset(
    tweets=val_df['Token_id'].tolist(),
    labels=val_df['Class'].tolist(),
)
val_dataloader = DataLoader(
    val_dataset,
    collate_fn=TweetDataset.collate_fn,
    batch_size=32,
    shuffle=False,
)

# Number of full passes over the training loader.
num_epochs = 10

# Use the PyTorch AdamW: the implementation that used to be exported by
# `transformers` is deprecated by Hugging Face in favour of this one.
from torch.optim import AdamW

# Setup the optimizer. `eps` and `weight_decay` are pinned to the defaults of
# the former transformers optimizer (1e-6 and 0.0) because torch's own
# defaults differ (1e-8 and 0.01); the hyper-parameters therefore stay as before.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring class equals the true label.

    ``preds`` holds one row of class scores per example (argmax is taken
    along axis 1); ``labels`` is an array of the true class ids and is
    flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).reshape(-1)
    expected = labels.reshape(-1)
    hits = np.sum(predicted == expected)
    return hits / len(expected)


# Setup the training loop
# Fine-tune for `num_epochs` epochs and evaluate on the validation loader
# after every epoch.
for epoch in range(num_epochs):
    # ========================
    #   Training
    # ========================
    total_train_loss = 0.0
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Deliberately not called `labels`: that module-level name is the
        # label mapping read by preprocess(), and rebinding it to a tensor
        # would break re-running the preprocessing cell.
        batch_labels = batch['labels'].to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # ========================
    #   Validation
    # ========================
    model.eval()

    total_eval_loss = 0.0
    correct_weighted = 0.0   # sum over batches of (batch accuracy * batch size)
    examples_seen = 0

    for batch in val_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)

        # Accumulate the validation loss of THIS batch (outputs.loss), not the
        # loss left over from the last training step.
        total_eval_loss += outputs.loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = batch_labels.to('cpu').numpy()

        # Weight each batch accuracy by its size so that a shorter final
        # batch does not count as much as a full one.
        correct_weighted += flat_accuracy(logits, label_ids) * len(label_ids)
        examples_seen += len(label_ids)

    avg_val_loss = total_eval_loss / len(val_dataloader)
    print("  Epoch {0}/{1} - train loss: {2:.4f}, val loss: {3:.4f}".format(
        epoch + 1, num_epochs, avg_train_loss, avg_val_loss))

    # Report the final accuracy for this validation run.
    avg_val_accuracy = correct_weighted / examples_seen
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    




  Accuracy: 0.81
  Accuracy: 0.81
  Accuracy: 0.79
  Accuracy: 0.77
  Accuracy: 0.79
  Accuracy: 0.75
  Accuracy: 0.78
  Accuracy: 0.78
  Accuracy: 0.77
  Accuracy: 0.78
