<a href="https://colab.research.google.com/github/kc2409/cl1/blob/main/paper_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install logger

Collecting logger
  Downloading logger-1.4.tar.gz (1.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: logger
  Building wheel for logger (setup.py) ... [?25l[?25hdone
  Created wheel for logger: filename=logger-1.4-py3-none-any.whl size=1759 sha256=c21a0431e4b893575ad598d6cfe2f25600d964f570fb371fa7c486d5e4c5a5af
  Stored in directory: /root/.cache/pip/wheels/fb/19/7b/09fc73f7503166eaf7f31b4aa0095b7f78af2ec0898e1f8312
Successfully built logger
Installing collected packages: logger
Successfully installed logger-1.4


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Col

In [9]:
class unitedCLLoss(nn.Module):
    """Supervised contrastive loss with an unsupervised (SimCLR-style) fallback.

    References:
        Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf
        SimCLR: https://arxiv.org/pdf/2002.05709.pdf

    Args:
        opt: configuration object; ``opt.temperature`` is read and the object
            itself is kept on ``self.opt``.
        contrast_mode: ``'all'`` uses every view of every sample as an anchor,
            ``'one'`` uses only the first view of each sample.
    """

    def __init__(self, opt, contrast_mode='all'):
        super(unitedCLLoss, self).__init__()
        self.opt = opt
        self.temperature = opt.temperature
        self.contrast_mode = contrast_mode

    def forward(self, features, labels, mask=None):
        """Compute the contrastive loss for one batch.

        If both `labels` and `mask` are None the positive mask is the identity
        matrix, i.e. only other views of the same sample count as positives.

        Args:
            features: hidden vectors of shape [bsz, n_views, ...]; trailing
                dimensions beyond the third are flattened.
            labels: ground truth of shape [bsz], or None.
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.

        Returns:
            A loss scalar.

        Raises:
            ValueError: if `features` has fewer than 3 dimensions, if both
                `labels` and `mask` are given, or if `contrast_mode` is unknown.
        """
        # Follow the features' own device (works for any GPU index, not just cuda:0).
        device = features.device

        if len(features.shape) < 3:
            raise ValueError('`features` needs to be [bsz, n_views, ...],'
                             'at least 3 dimensions are required')
        if len(features.shape) > 3:
            features = features.view(features.shape[0], features.shape[1], -1)

        batch_size = features.shape[0]
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32, device=device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            # NOTE(review): labels of a different length are simply duplicated;
            # presumably meant for callers that pass one label per sample pair.
            if labels.shape[0] != batch_size:
                labels = torch.cat([labels, labels], dim=0)

            # Same-class indicator plus a tiny constant, so every off-diagonal
            # entry carries a non-zero positive weight.
            mask = torch.eq(labels, labels.T).float().add(0.0000001).to(device)
        else:
            mask = mask.float().to(device)

        contrast_count = features.shape[1]
        # Stack the views along the batch axis: [n_views * bsz, dim].
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)

        # mask-out self-contrast cases (zero on each anchor's own column)
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count, device=device).view(-1, 1),
            0
        )
        mask_pos = mask * logits_mask
        mask_neg = (torch.ones_like(mask) - mask) * logits_mask

        logits = torch.mm(anchor_feature, contrast_feature.t()) / self.temperature
        # Numerical stability: drop self-pairs before exponentiating and shift
        # each row by its (detached) maximum. pos / (pos + neg) is invariant to a
        # per-row rescaling, so the loss value and its gradient are unchanged,
        # but exp() can no longer overflow to inf (which produced NaN losses).
        logits = logits.masked_fill(logits_mask == 0, float('-inf'))
        row_max = logits.max(dim=1, keepdim=True).values.detach()
        similarity = torch.exp(logits - row_max)

        pos = torch.sum(similarity * mask_pos, 1)
        neg = torch.sum(similarity * mask_neg, 1)
        loss = -(torch.mean(torch.log(pos / (pos + neg))))

        return loss

In [5]:
import json
import os
import math
import argparse
import random
import time
from tqdm import tqdm
import numpy
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
#from criterion import unitedCLLoss
#from logger.CSVlogger import CSVlogger
#from model import SSCL
from sklearn import metrics
#from utils.data_utils import DatesetReader
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [6]:
from transformers import BertTokenizer, BertConfig, AutoTokenizer

In [7]:
class CustomDataset(Dataset):
    """Dataset over parallel `data` / `labels` sequences, yielding tensor pairs.

    Each item is a dict with a float32 ``'data'`` tensor and an int64
    ``'label'`` tensor built from the entries at the requested index.
    """
    # NOTE: a later cell defines another class with this same name.

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # Length is driven by `data`; `labels` is not consulted here.
        return len(self.data)

    def __getitem__(self, idx):
        features = torch.tensor(self.data[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return {'data': features, 'label': target}

In [None]:
!pip install torchtext



In [8]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
#from torchtext.legacy import data


# Load the training spreadsheet from the working directory; the dataset class
# defined below reads its 'text', 'sentiment_label' and 'stance_label' columns.
df = pd.read_excel('la_train.xlsx')


class CustomDataset(Dataset):
    """Row-wise view over a DataFrame holding text with two label columns.

    Indexing returns a dict with the row's ``'text'``, ``'sentiment_label'``
    and ``'stance_label'`` values; any other columns are ignored.
    """

    # Columns copied into every returned sample, in output order.
    _FIELDS = ('text', 'sentiment_label', 'stance_label')

    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Positional lookup, independent of the DataFrame's index labels.
        row = self.data.iloc[idx]
        return {field: row[field] for field in self._FIELDS}


# Batching / optimisation settings. Only `batch_size` is used in this cell.
# NOTE(review): a later cell reassigns `batch_size`, `epochs` and
# `learning_rate`, so these values depend on cell execution order.
batch_size = 16
epochs = 10
learning_rate = 0.001


# Wrap the DataFrame and serve it in shuffled mini-batches of `batch_size` rows.
dataset = CustomDataset(df)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)




In [10]:
# Display the DataLoader's repr as a quick check that it was constructed.
dataloader

<torch.utils.data.dataloader.DataLoader at 0x79901493b130>

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

# Maximum number of tokens kept per text: longer sequences are truncated to this
# length and shorter ones are right-padded with zeros when the training tensors
# are built further down in this cell.
max_sequence_length = 24

# Text Encoder architecture
class TextEncoder(nn.Module):
    """Embedding layer followed by a batch-first LSTM.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (LSTM input size).
        hidden_dim: LSTM hidden size, which is also the output feature size.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Return the LSTM output at the last time step for each sequence in `x`."""
        sequence_output, _ = self.lstm(self.embedding(x))
        # Keep only the final position along the time axis (dim 1).
        return sequence_output[:, -1]

# Projection Head architecture
class ProjectionHead(nn.Module):
    """Single linear layer mapping `input_dim` features to `output_dim`."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Apply the linear projection to `x`."""
        projected = self.fc(x)
        return projected


# Custom Contrastive Loss
class CustomContrastiveLoss(nn.Module):
    """Temperature-scaled cross-entropy over a cosine-similarity matrix.

    Row ``i`` of ``z1`` is treated as matching row ``i`` of ``z2``; every other
    row of ``z2`` acts as a negative for it.

    Args:
        temperature: divisor applied to the cosine similarities.
    """

    def __init__(self, temperature=0.5):
        super(CustomContrastiveLoss, self).__init__()
        self.temperature = temperature

    def forward(self, z1, z2):
        """Return the mean cross-entropy of matching row ``i`` of `z1` to row ``i`` of `z2`.

        Args:
            z1: 2-D tensor of embeddings, one row per sample.
            z2: 2-D tensor with the same feature size and at least as many
                rows as `z1` (target ``i`` must be a valid column index).

        Returns:
            A scalar loss tensor.
        """
        # L2-normalise rows; F.normalize clamps the norm by a small eps, so an
        # all-zero row yields zeros instead of NaN from a division by zero.
        z1 = torch.nn.functional.normalize(z1, dim=1)
        z2 = torch.nn.functional.normalize(z2, dim=1)
        sim = torch.mm(z1, z2.t()) / self.temperature
        # Build the targets on the logits' device; a CPU-side arange raises a
        # device-mismatch error as soon as the embeddings live on a GPU.
        targets = torch.arange(sim.size(0), device=sim.device)
        return torch.nn.functional.cross_entropy(sim, targets)

# Stance Model combining Text Encoder and Projection Head
class StanceModel(nn.Module):
    """Chains a `TextEncoder` with a `ProjectionHead`.

    Args:
        vocab_size, embedding_dim, hidden_dim, num_layers: forwarded to
            `TextEncoder`.
        projection_dim: output size of the projection head, whose input size
            is `hidden_dim`.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, projection_dim):
        super().__init__()
        self.text_encoder = TextEncoder(vocab_size, embedding_dim, hidden_dim, num_layers)
        self.projection_head = ProjectionHead(hidden_dim, projection_dim)

    def forward(self, text):
        """Encode `text` and return its projected representation."""
        return self.projection_head(self.text_encoder(text))

# Load your dataset (replace 'la_train.xlsx' with your dataset file)
data = pd.read_excel('la_train.xlsx')

# Extract relevant data columns
text_data = data['text']
stance_labels = data['stance_label']

# Tokenize the text by whitespace
# (replace this with your actual tokenizer or tokenization process)
tokenized_texts = [text.split() for text in text_data]

# Create a vocabulary and mapping for word to index.
# Index 0 is reserved for padding so that no real word shares the id of the
# zeros appended below, and the words are sorted so that the mapping is the
# same on every run (iteration order of a set of strings is not stable across
# interpreter sessions).
vocab = set(word for text in tokenized_texts for word in text)
vocab_to_index = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}
vocab_size = len(vocab) + 1  # +1 for the padding index 0

# Encode stance labels as consecutive integers
label_encoder = LabelEncoder()
stance_labels = label_encoder.fit_transform(stance_labels)

# Convert text sequences to lists of indices
text_sequences = [[vocab_to_index[word] for word in text] for text in tokenized_texts]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(text_sequences, stance_labels, test_size=0.2, random_state=42)

# Truncate / right-pad every training sequence to exactly max_sequence_length
# tokens; the padding value is the reserved index 0.
padded_sequences = [
    torch.cat((
        torch.tensor(seq[:max_sequence_length], dtype=torch.long),
        torch.zeros(max(max_sequence_length - len(seq), 0), dtype=torch.long),
    ))
    for seq in X_train
]

# Create DataLoader
train_dataset = TensorDataset(torch.stack(padded_sequences), torch.tensor(y_train, dtype=torch.long))
batch_size = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Define model hyperparameters
embedding_dim = 128
hidden_dim = 64
num_layers = 2
projection_dim = 32
temperature = 0.5
learning_rate = 0.001
epochs = 10

# Seed PyTorch's global RNG before any weights are created so that parameter
# initialisation and DataLoader shuffling are repeatable between runs.
seed = 42
torch.manual_seed(seed)

# Instantiate the StanceModel, optimizer, and contrastive loss
model = StanceModel(vocab_size, embedding_dim, hidden_dim, num_layers, projection_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
contrastive_loss = CustomContrastiveLoss(temperature=temperature)

# Move the model to a GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop: one pass over `train_loader` per epoch, printing the mean
# batch loss at the end of each epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0  # running sum of per-batch losses for this epoch

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
        optimizer.zero_grad()
        # Each batch is (token ids, stance labels); move both to the model's device.
        # NOTE(review): `stance_batch` is never used below, so the labels do not
        # influence training.
        input_ids_batch, stance_batch = [item.to(device) for item in batch]

        output = model(input_ids_batch)

        # NOTE(review): both arguments are the same tensor, so each embedding is
        # contrasted against itself as its positive; confirm whether a second
        # view / augmentation was intended here.
        loss = contrastive_loss(output, output)

        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Average over the number of batches (not the number of samples).
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch + 1}/{epochs}] Loss: {avg_loss:.4f}')

print('Training complete!')


Epoch 1/10: 100%|██████████| 50/50 [00:02<00:00, 20.55it/s]


Epoch [1/10] Loss: 2.9607


Epoch 2/10: 100%|██████████| 50/50 [00:02<00:00, 21.23it/s]


Epoch [2/10] Loss: 2.5078


Epoch 3/10: 100%|██████████| 50/50 [00:03<00:00, 16.26it/s]


Epoch [3/10] Loss: 2.4213


Epoch 4/10: 100%|██████████| 50/50 [00:02<00:00, 17.03it/s]


Epoch [4/10] Loss: 2.3949


Epoch 5/10: 100%|██████████| 50/50 [00:02<00:00, 22.74it/s]


Epoch [5/10] Loss: 2.3733


Epoch 6/10: 100%|██████████| 50/50 [00:02<00:00, 22.40it/s]


Epoch [6/10] Loss: 2.3639


Epoch 7/10: 100%|██████████| 50/50 [00:02<00:00, 22.35it/s]


Epoch [7/10] Loss: 2.3482


Epoch 8/10: 100%|██████████| 50/50 [00:02<00:00, 22.30it/s]


Epoch [8/10] Loss: 2.3411


Epoch 9/10: 100%|██████████| 50/50 [00:02<00:00, 17.68it/s]


Epoch [9/10] Loss: 2.3265


Epoch 10/10: 100%|██████████| 50/50 [00:03<00:00, 14.13it/s]

Epoch [10/10] Loss: 2.3218
Training complete!



