# Import requirements

In [1]:
from google.colab import files
uploaded = files.upload()

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    ElectraForSequenceClassification, ElectraTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW
)

# 1. Preprocess

In [4]:
def make_id_file(task, tokenizer):
    def make_data_strings(file_name):
        data_strings = []
        with open(os.path.join(file_name), 'r', encoding='utf-8') as f:
            id_file_data = [tokenizer.encode(line.lower()) for line in f.readlines()]
        for item in id_file_data:
            data_strings.append(' '.join([str(k) for k in item]))
        return data_strings
    
    print('it will take some times...')
    train_pos = make_data_strings('sentiment.train.1')
    # train_neg = make_data_strings('train.0_aug_cleaning.txt')
    train_neg = make_data_strings('sentiment.train.0')
    dev_pos = make_data_strings('sentiment.dev.1')
    dev_neg = make_data_strings('sentiment.dev.0')

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

In [5]:
tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")

In [6]:
!ls

pytorch_model.bin  sentiment.dev.1    submission_electra_disc_aug.csv
sample_data	   sentiment.train.0  test_no_label.csv
sentiment.dev.0    sentiment.train.1  train.0_aug_cleaning.txt


In [7]:
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer)

it will take some times...
make id file finished!


In [8]:
train_neg[:10]

['101 1045 2001 13718 13534 1012 102',
 '101 2061 2006 2000 1996 7570 22974 2229 1010 1996 3059 2003 2236 2448 1997 1996 4971 1012 102',
 '101 10124 6240 1998 1037 10228 1997 29022 2292 8525 3401 1012 102',
 '101 2498 2428 2569 1004 2025 11007 1997 1996 1002 1035 16371 2213 1035 3976 6415 1012 102',
 '101 2117 1010 1996 21475 7570 22974 2063 1010 2009 2003 2012 3217 18436 1012 102',
 '101 1045 2018 2000 3477 1002 1035 16371 2213 1035 2000 5587 8808 2000 1996 7570 22974 2063 1012 102',
 '101 2016 2409 2033 2045 2001 1037 3715 2005 1996 11225 2006 1996 2217 1012 102',
 '101 2024 2017 12489 2033 1029 102',
 '101 1045 2001 2025 2183 2000 3477 2005 1996 11225 2006 1996 2217 1012 102',
 '101 1045 3641 2009 2302 2292 8525 3401 1010 20856 1010 24444 1010 2030 11225 1012 102']

In [9]:
class SentimentDataset(object):
    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []

        for pos_sent in pos:
            self.data += [self._cast_to_int(pos_sent.strip().split())]
            self.label += [[1]]
        for neg_sent in neg:
            self.data += [self._cast_to_int(neg_sent.strip().split())]
            self.label += [[0]]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample), np.array(self.label[index])

In [10]:
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
dev_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)

In [11]:
for i, item in enumerate(train_dataset):
    print(item)
    if i == 10:
        break

(array([ 101, 6581, 2833, 1012,  102]), array([1]))
(array([  101, 21688,  8013,  2326,  1012,   102]), array([1]))
(array([  101,  2027,  2036,  2031,  3679, 19247,  1998,  3256,  6949,
        2029,  2003,  2428,  2204,  1012,   102]), array([1]))
(array([  101,  2009,  1005,  1055,  1037,  2204, 15174,  2098,  7570,
       22974,  2063,  1012,   102]), array([1]))
(array([ 101, 1996, 3095, 2003, 5379, 1012,  102]), array([1]))
(array([ 101, 2204, 3347, 2833, 1012,  102]), array([1]))
(array([ 101, 2204, 2326, 1012,  102]), array([1]))
(array([  101, 11350,  1997,  2154,  2003, 25628,  1998,  7167,  1997,
       19247,  1012,   102]), array([1]))
(array([  101,  2307,  2173,  2005,  6265,  2030,  3347, 27962,  1998,
        5404,  1012,   102]), array([1]))
(array([ 101, 1996, 2047, 2846, 3504, 6429, 1012,  102]), array([1]))
(array([ 101, 2023, 2173, 2001, 2200, 2204, 1012,  102]), array([1]))


In [12]:
def collate_fn_style(samples):
    input_ids, labels = zip(*samples)
    max_len = max(len(input_id) for input_id in input_ids)
    sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]

    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
                             batch_first=True)
    
    attention_mask = torch.tensor([[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in sorted_indices])

    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])
    
    labels = torch.tensor(np.stack(labels, axis=0)[sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids, labels

In [13]:
train_batch_size=512
eval_batch_size=512

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                         shuffle=False, collate_fn=collate_fn_style,
                                         num_workers=2)

In [18]:
# random seed
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator")
model.to(device)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.d

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [19]:
model.train()
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [20]:
def compute_acc(predictions, target_labels):
    return (np.array(predictions) == np.array(target_labels)).mean()

In [21]:
train_epoch = 3

lowest_valid_loss = 9999.

epoch = 1

 
for epoch in range(train_epoch):
  with tqdm(train_loader, unit="batch") as tepoch:
      for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
          tepoch.set_description(f"Epoch {epoch}")
          input_ids = input_ids.to(device)
          attention_mask = attention_mask.to(device)
          token_type_ids = token_type_ids.to(device)
          position_ids = position_ids.to(device)
          labels = labels.to(device, dtype=torch.long)

          optimizer.zero_grad()

          output = model(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          position_ids=position_ids,
                          labels=labels)

          loss = output.loss
          loss.backward()

          optimizer.step()

          tepoch.set_postfix(loss=loss.item())
          if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
              # Evaluate the model five times per epoch
              with torch.no_grad():
                  model.eval()
                  valid_losses = []
                  predictions = []
                  target_labels = []
                  for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                              desc='Eval',
                                                                                              position=1,
                                                                                              leave=None):
                      input_ids = input_ids.to(device)
                      attention_mask = attention_mask.to(device)
                      token_type_ids = token_type_ids.to(device)
                      position_ids = position_ids.to(device)
                      labels = labels.to(device, dtype=torch.long)

                      output = model(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids,
                                      position_ids=position_ids,
                                      labels=labels)

                      logits = output.logits
                      loss = output.loss
                      valid_losses.append(loss.item())

                      batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                      batch_labels = [int(example) for example in labels]

                      predictions += batch_predictions
                      target_labels += batch_labels



              acc = compute_acc(predictions, target_labels)
              valid_loss = sum(valid_losses) / len(valid_losses)
              if lowest_valid_loss > valid_loss:
                  print('Acc for model which have lower valid loss: ', acc)
                  torch.save(model.state_dict(), "./pytorch_model.bin")
                  lowest_valid_loss = valid_loss
                

Epoch 0:  20%|█▉        | 173/866 [01:17<04:58,  2.32batch/s, loss=0.675]
Eval:   0%|          | 0/8 [00:00<?, ?it/s][A
Eval:  12%|█▎        | 1/8 [00:00<00:01,  3.59it/s][A
Eval:  25%|██▌       | 2/8 [00:00<00:01,  4.83it/s][A
Eval:  38%|███▊      | 3/8 [00:00<00:00,  5.40it/s][A
Eval:  50%|█████     | 4/8 [00:00<00:00,  5.93it/s][A
Eval:  62%|██████▎   | 5/8 [00:00<00:00,  5.81it/s][A
Eval:  75%|███████▌  | 6/8 [00:01<00:00,  5.87it/s][A
Eval:  88%|████████▊ | 7/8 [00:01<00:00,  5.78it/s][A
Eval: 100%|██████████| 8/8 [00:01<00:00,  6.18it/s][A
                                                   [A

Acc for model which have lower valid loss:  0.5


Epoch 0:  22%|██▏       | 187/866 [01:25<05:10,  2.18batch/s, loss=0.661]


KeyboardInterrupt: ignored

In [None]:
print(model)

In [None]:
import pandas as pd
test_df = pd.read_csv('test_no_label.csv')


In [None]:
test_dataset = test_df['Id']

In [None]:
def make_id_file_test(tokenizer, test_dataset):
    data_strings = []
    id_file_data = [tokenizer.encode(sent.lower()) for sent in test_dataset]
    for item in id_file_data:
        data_strings.append(' '.join([str(k) for k in item]))
    return data_strings

In [None]:
test = make_id_file_test(tokenizer, test_dataset)

In [None]:
test[:10]

In [None]:
class SentimentTestDataset(object):
    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        self.data = []

        for sent in test:
            self.data += [self._cast_to_int(sent.strip().split())]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample)

In [None]:
test_dataset = SentimentTestDataset(tokenizer, test)

In [None]:
def collate_fn_style_test(samples):
    input_ids = samples
    max_len = max(len(input_id) for input_id in input_ids)

    #sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1] #bug
    sorted_indices = [i for i in range(len(input_ids))]
    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],batch_first=True)
    attention_mask = torch.tensor(
        [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
         sorted_indices])
    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids

In [None]:
# def collate_fn_style_test(samples):
#     input_ids = samples
#     max_len = max(len(input_id) for input_id in input_ids)
#     sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]

#     input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
#                              batch_first=True)
#     attention_mask = torch.tensor(
#         [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
#          sorted_indices])
#     token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
#     position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])

#     return input_ids, attention_mask, token_type_ids, position_ids

In [None]:
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [None]:
with torch.no_grad():
    model.eval()
    predictions = []
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions += batch_predictions



In [None]:
test_df['Category'] = predictions

In [None]:
test_df.to_csv('submission_electra_disc_aug.csv', index=False)

# **voting code**

In [None]:
import pandas as pd

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
pred1 = pd.read_csv('submissiond.csv')
pred2 = pd.read_csv('submission_electra_base_disc_cleaned.csv')
pred3 = pd.read_csv('submissionh.csv')
pred4 = pd.read_csv('submissionj.csv')
pred5 = pd.read_csv('submission_distilbert.csv')

In [None]:
pred1 = pred1['Category']
pred2 = pred2['Category']
pred3 = pred3['Category']
pred4 = pred4['Category']
pred5 = pred5['Category']

In [None]:
pred_df = pd.DataFrame({'pred1':pred1, 'pred2':pred2, 'pred3':pred3, 'pred4': pred4, 'pred5': pred5})

In [None]:
pred_df

In [None]:
def find_most_frequent(row):
    counts = row.value_counts()
    return counts.idxmax()

# Apply the function to the DataFrame and store the result in a new column
pred_df['most_frequent'] = pred_df.apply(find_most_frequent, axis=1)

print(pred_df)

In [None]:
test_df['Category'] = pred_df['most_frequent']

In [None]:
test_df.to_csv('submission_vote_2.csv', index=False)