In [78]:
import os
import shutil
import re
import string
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
import transformers
from transformers import AutoModel, BertTokenizerFast, AdamW, BertModel, BertConfig, AutoModelForSequenceClassification, TrainingArguments
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
tf.get_logger().setLevel('ERROR')
device =  torch.device("cpu")

In [49]:
batch_size = 32
seed = 123

In [50]:
df = pd.read_csv('comments with acceptreject.csv')

In [51]:
data = df.iloc[:,1:3]

In [58]:
def set_seed(seed=None):
    if seed is None:
        seed = 1
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = False
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'


In [62]:
MAX_LEN = 50
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')




In [63]:
train_X, test_X, train_Y, test_Y = train_test_split(data['text_comments'], data['label'], train_size = 0.7, random_state=seed)

In [64]:
train_tokens = tokenizer(list(train_X), padding = True, truncation=True)
test_tokens = tokenizer(list(test_X), padding = True, truncation=True)

In [67]:
class TokenData(Dataset):
    def __init__(self, train = False):
        if train:
            self.text_data = train_X
            self.tokens = train_tokens
            self.labels = list(train_Y)
        else:
            self.text_data = test_X
            self.tokens = test_tokens
            self.labels = list(test_Y)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [68]:
batch_size = 40
train_dataset = TokenData(train = True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = TokenData(train = False)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [77]:
bert_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Pre-trained model
optimizer = AdamW(bert_model.parameters(), lr=1e-5) # Optimization function
loss_fn = torch.nn.CrossEntropyLoss() # Loss function

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [73]:
num_epochs = 3
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
bert_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [74]:
for epoch in range(num_epochs):
    print("Epoch: ",(epoch + 1))
    # TRAINING BLOCK STARTS
    bert_model.train()
    for i,batch in enumerate(train_loader):    
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Setting the gradients to zero
        optimizer.zero_grad()
        
        # Passing the data to the model
        outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        
        # The logits will be used for measuring the loss
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        # Calculating the gradient for the loss function
        loss.backward()
        
        # Optimizing the parameters of the bert model
        optimizer.step()

        # Calculating the running loss for logging purposes
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss / batch_size

        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))
    # Logging epoch-wise training loss
    print(f"\nTraining epoch {epoch + 1} loss: ",train_last_loss)
    # TRAINING BLOCK ENDS 

    # TESTING BLOCK STARTS
    bert_model.eval()
    correct = 0
    test_pred = []
    for i, batch in enumerate(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # We don't need gradients for testing
        with torch.no_grad():
            outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        
        # Logits act as predictions
        logits = outputs.logits
        
        # Calculating total batch loss using the logits and labels
        loss = loss_fn(logits, batch['labels'])
        test_batch_loss = loss.item()
        
        # Calculating the mean batch loss
        test_last_loss = test_batch_loss / batch_size
        print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))
        
        # Comparing the predicted target with the labels in the batch
        correct += (logits.argmax(1) == batch['labels']).sum().item()
        print("Testing accuracy: ",correct/((i + 1) * batch_size))
    
    print(f"\nTesting epoch {epoch + 1} last loss: ",test_last_loss)

Epoch:  1
Training batch 1 last loss: 0.018407671153545378
Training batch 2 last loss: 0.015554276108741761
Training batch 3 last loss: 0.017709970474243164
Training batch 4 last loss: 0.016583089530467988
Training batch 5 last loss: 0.01867441236972809

Training epoch 1 loss:  0.01867441236972809
Testing batch 1 loss: 0.016036444902420045
Testing accuracy:  0.725
Testing batch 2 loss: 0.01789584308862686
Testing accuracy:  0.625

Testing epoch 1 last loss:  0.01789584308862686
Epoch:  2
Training batch 1 last loss: 0.015916526317596436
Training batch 2 last loss: 0.01715259104967117
Training batch 3 last loss: 0.01704298108816147
Training batch 4 last loss: 0.015425939857959748
Training batch 5 last loss: 0.01484663486480713

Training epoch 2 loss:  0.01484663486480713
Testing batch 1 loss: 0.017059850692749023
Testing accuracy:  0.6
Testing batch 2 loss: 0.01653236895799637
Testing accuracy:  0.6

Testing epoch 2 last loss:  0.01653236895799637
Epoch:  3
Training batch 1 last loss: 0.

more custom one below

In [6]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.text_comments
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [7]:
train_size = 0.8
train_dataset=data.sample(frac=train_size,random_state=200)
test_dataset=data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (250, 2)
TRAIN Dataset: (200, 2)
TEST Dataset: (50, 2)


In [8]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [70]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-multilingual-cased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 2)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output



In [10]:
model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [16]:
loss_fn = torch.nn.CrossEntropyLoss()

In [17]:
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

In [20]:
def train(epoch):
    model.train()
    total_loss = 0  # Track total loss for the epoch
    for batch in training_loader:
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)  # Use long for CrossEntropyLoss

        # Forward pass
        outputs = model(ids, mask, token_type_ids)

        # Compute loss
        loss = loss_fn(outputs, targets)

        # Zero gradients, backward pass, optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # Add current batch's loss to total loss

        # Print the loss for every batch if you want more detailed tracking
        print(f"Epoch {epoch} Batch Loss: {loss.item()}")

    # Print the average loss for the epoch
    avg_loss = total_loss / len(training_loader)
    print(f"Epoch {epoch} Average Loss: {avg_loss}")

In [21]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch 0 Batch Loss: 0.6146759390830994
Epoch 0 Batch Loss: 0.5926244258880615
Epoch 0 Batch Loss: 0.652580738067627
Epoch 0 Batch Loss: 0.7104547619819641
Epoch 0 Batch Loss: 0.7690680623054504
Epoch 0 Batch Loss: 0.7386425137519836
Epoch 0 Batch Loss: 0.5696561932563782
Epoch 0 Batch Loss: 0.6657519340515137
Epoch 0 Batch Loss: 0.669732928276062
Epoch 0 Batch Loss: 0.8662726879119873
Epoch 0 Batch Loss: 0.5507792830467224
Epoch 0 Batch Loss: 0.5551527142524719
Epoch 0 Batch Loss: 0.4965013861656189
Epoch 0 Batch Loss: 0.5561211705207825
Epoch 0 Batch Loss: 0.75409996509552
Epoch 0 Batch Loss: 0.7059251070022583
Epoch 0 Batch Loss: 0.5550674200057983
Epoch 0 Batch Loss: 0.6335068941116333
Epoch 0 Batch Loss: 0.5066803693771362
Epoch 0 Batch Loss: 0.585049033164978
Epoch 0 Batch Loss: 0.7676651477813721
Epoch 0 Batch Loss: 0.5296412706375122
Epoch 0 Batch Loss: 0.6476107239723206
Epoch 0 Batch Loss: 0.7686125636100769
Epoch 0 Batch Loss: 0.6716576814651489
Epoch 0 Average Loss: 0.645341

In [22]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)

            # Apply sigmoid to the outputs (assuming outputs are logits)
            sigmoid_outputs = torch.sigmoid(outputs)

            # Get the predicted class by thresholding at 0.5 (binary classification)
            predicted_classes = (sigmoid_outputs[:, 1] > 0.3).cpu().detach().numpy()
   
            # Store the targets and predictions
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(predicted_classes.tolist())

    return fin_outputs, fin_targets

In [23]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    report = classification_report(targets, outputs, target_names=["R", "A"], zero_division=0)

    print(report)

              precision    recall  f1-score   support

           R       0.00      0.00      0.00        30
           A       0.40      1.00      0.57        20

    accuracy                           0.40        50
   macro avg       0.20      0.50      0.29        50
weighted avg       0.16      0.40      0.23        50



In [107]:
outputs_probabilities = torch.sigmoid(outputs).cpu().detach().numpy()
print(outputs_probabilities)  # Check the raw probabilities


TypeError: sigmoid(): argument 'input' (position 1) must be Tensor, not list

In [67]:
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

# model's performance
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.36      1.00      0.53        18

    accuracy                           0.36        50
   macro avg       0.18      0.50      0.26        50
weighted avg       0.13      0.36      0.19        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


sentiment analysis down below

In [18]:
#comments = data['text_comments'].tolist()
#labels = data['label'].tolist()

In [19]:
def clean_text(text):
    # Remove commas
    text = text.replace(',', '')
    # Remove quotation marks
    text = text.replace('“', '').replace('”', '')
    # Replace newlines with spaces
    text = text.replace('\n', ' ')
    # Remove URLs
    text = re.sub(r' ?(f|ht)tp(s?)://(.*)[.][a-z]+', '', text)
    # Replace non-alphanumeric characters with spaces
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    # Convert to lowercase
    #wtext = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

In [20]:
#sentiment_pipeline = pipeline('sentiment-analysis')

In [21]:
#results = sentiment_pipeline(comments)

In [22]:
#data['sentiment'] = [result['label'] for result in results]
#data['confidence'] = [result['score'] for result in results]