# Kaggle data download

In [1]:
!pip install kaggle
from google.colab import files
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jeonhyotaek","key":"d2ddaf4c586e0bf63051e1a4c6a74dd4"}'}

In [2]:
ls -1ha kaggle.json

kaggle.json


In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [4]:
!kaggle competitions download -c copy-of-6th-goorm-project-1-text-classification

Downloading copy-of-6th-goorm-project-1-text-classification.zip to /content
 82% 5.00M/6.12M [00:00<00:00, 41.6MB/s]
100% 6.12M/6.12M [00:00<00:00, 48.9MB/s]


In [5]:
!unzip copy-of-6th-goorm-project-1-text-classification.zip

Archive:  copy-of-6th-goorm-project-1-text-classification.zip
  inflating: sentiment.dev.0         
  inflating: sentiment.dev.1         
  inflating: sentiment.train.0       
  inflating: sentiment.train.1       
  inflating: test_no_label.csv       


# Import requirements

In [6]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 90.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 77.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [108]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    ElectraForSequenceClassification, ElectraTokenizer

)

# Preprocess

In [109]:
def make_id_file(task, tokenizer):
    def make_data_strings(file_name):
        data_strings = []
        with open(os.path.join(file_name), 'r', encoding='utf-8') as f:
            id_file_data = [tokenizer.encode(line.lower()) for line in f.readlines()]
        for item in id_file_data:
            data_strings.append(' '.join([str(k) for k in item]))
        return data_strings
    
    print('it will take some times...')
    train_pos = make_data_strings('sentiment.train.1')
    train_neg = make_data_strings('sentiment.train.0')
    dev_pos = make_data_strings('sentiment.dev.1')
    dev_neg = make_data_strings('sentiment.dev.0')

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

In [110]:
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

In [111]:
!ls

copy-of-6th-goorm-project-1-text-classification.zip  sentiment.dev.1
kaggle.json					     sentiment.train.0
pytorch_model.bin				     sentiment.train.1
sample_data					     test_no_label.csv
sentiment.dev.0					     wandb


In [112]:
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer)

it will take some times...
make id file finished!


In [113]:
train_pos[:10]

['101 6581 2833 1012 102',
 '101 21688 8013 2326 1012 102',
 '101 2027 2036 2031 3679 19247 1998 3256 6949 2029 2003 2428 2204 1012 102',
 '101 2009 1005 1055 1037 2204 15174 2098 7570 22974 2063 1012 102',
 '101 1996 3095 2003 5379 1012 102',
 '101 2204 3347 2833 1012 102',
 '101 2204 2326 1012 102',
 '101 11350 1997 2154 2003 25628 1998 7167 1997 19247 1012 102',
 '101 2307 2173 2005 6265 2030 3347 27962 1998 5404 1012 102',
 '101 1996 2047 2846 3504 6429 1012 102']

In [114]:
class SentimentDataset(object):
    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []

        for pos_sent in pos:
            self.data += [self._cast_to_int(pos_sent.strip().split())]
            self.label += [[1]]
        for neg_sent in neg:
            self.data += [self._cast_to_int(neg_sent.strip().split())]
            self.label += [[0]]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample), np.array(self.label[index])

In [116]:
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
dev_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)

In [117]:
for i, item in enumerate(train_dataset):
    print(item)
    if i == 10:
        break

(array([ 101, 6581, 2833, 1012,  102]), array([1]))
(array([  101, 21688,  8013,  2326,  1012,   102]), array([1]))
(array([  101,  2027,  2036,  2031,  3679, 19247,  1998,  3256,  6949,
        2029,  2003,  2428,  2204,  1012,   102]), array([1]))
(array([  101,  2009,  1005,  1055,  1037,  2204, 15174,  2098,  7570,
       22974,  2063,  1012,   102]), array([1]))
(array([ 101, 1996, 3095, 2003, 5379, 1012,  102]), array([1]))
(array([ 101, 2204, 3347, 2833, 1012,  102]), array([1]))
(array([ 101, 2204, 2326, 1012,  102]), array([1]))
(array([  101, 11350,  1997,  2154,  2003, 25628,  1998,  7167,  1997,
       19247,  1012,   102]), array([1]))
(array([  101,  2307,  2173,  2005,  6265,  2030,  3347, 27962,  1998,
        5404,  1012,   102]), array([1]))
(array([ 101, 1996, 2047, 2846, 3504, 6429, 1012,  102]), array([1]))
(array([ 101, 2023, 2173, 2001, 2200, 2204, 1012,  102]), array([1]))


In [118]:
def collate_fn_style(samples):
    input_ids, labels = zip(*samples)
    max_len = max(len(input_id) for input_id in input_ids)

    sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]

    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
                             batch_first=True)

    attention_mask = torch.tensor(
        [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
         sorted_indices])
    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])
    labels = torch.tensor(np.stack(labels, axis=0)[sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids, labels

# Wandb sweep

In [93]:
!pip install wandb -qqq
import wandb
wandb.login()

True

In [119]:
sweep_config = {
      'name' : 'test_albert_base_v2.ipynb',
      'method' : 'grid',
      'metric':{
          'name': 'validation_acc',
          'goal': 'maximize'  
      },
      'parameters' : {
          'learning_rate' : {
              'values' : [1e-5 ,2.5e-5, 5e-5, 7.5e-5, 1e-4]  ##1e-6, 1e-7]
          },   
          'batch_size' :{
              'values' : [512]
          },
          'epochs' : {
              'values' : [2] 
          }
      }
}

sweep_id = wandb.sweep(sweep_config)

Create sweep with ID: io2mg0ta
Sweep URL: https://wandb.ai/goorm_project/uncategorized/sweeps/io2mg0ta


# Train model def

In [None]:
def sweep_optimizer(input_model, optimizer, learning_rate) :
    if optimizer == "AdamW" :
        optimizer = torch.optim.AdamW(input_model.parameters(), lr = learning_rate)
    elif optimizer == 'RMSprop' :
        optimizer = torch.optim.RMSprop(input_model.parameters(), lr = learning_rate)
    return optimizer

In [120]:
def train():
    wandb.init(project = 'test_sweep', reinit = True)

    ## batch_size - sweep
    train_batch_size = int(wandb.config.batch_size/2)
    eval_batch_size = wandb.config.batch_size

    # dataload
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=train_batch_size,
                                              shuffle=True, collate_fn=collate_fn_style,
                                              pin_memory=True, num_workers=2)
    dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                            shuffle=False, collate_fn=collate_fn_style,
                                            num_workers=2)
    ## random seed
    random_seed=42
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # model-pretrain
    model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator')
    model.to(device)

    # model train - sweep
    model.train()
    learning_rate = wandb.config.learning_rate
    optimizer =  AdamW(model.parameters(), lr=learning_rate, weight_decay = 1e-2, eps = 1e-6)

    #scheduler
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer = optimizer, lr_lambda = lambda epoch : 0.9** epoch, last_epoch=-1,verbose=False)

    def compute_acc(predictions, target_labels):
        return (np.array(predictions) == np.array(target_labels)).mean()

    # train_epoch
    train_epoch = wandb.config.epochs
    # validation_acc
    wan_valid_acc = []

    lowest_valid_loss = 9999.

    #train
    for epoch in range(train_epoch):
        #total train loss
        wan_train_loss = []

        with tqdm(train_loader, unit="batch") as tepoch:
            
            for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
                tepoch.set_description(f"Epoch {epoch}")
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                token_type_ids = token_type_ids.to(device)
                position_ids = position_ids.to(device)
                labels = labels.to(device, dtype=torch.long)

                optimizer.zero_grad()

                output = model(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              position_ids=position_ids,
                              labels=labels)

                loss = output.loss
                loss.backward()

                # log the train loss / total loss
                wandb.log({'train_batch_loss':loss.item()})
                wan_train_loss.append(loss.item())

                optimizer.step()

                tepoch.set_postfix(loss=loss.item())

                #sum_acc
                sum_acc = [] 

                if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
                    # Evaluate the model five times per epoch
                    with torch.no_grad():
                        model.eval()
                        valid_losses = []
                        predictions = []
                        target_labels = []
                        for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                    desc='Eval',
                                                                                                    position=1,
                                                                                                    leave=None):
                            input_ids = input_ids.to(device)
                            attention_mask = attention_mask.to(device)
                            token_type_ids = token_type_ids.to(device)
                            position_ids = position_ids.to(device)
                            labels = labels.to(device, dtype=torch.long)

                            output = model(input_ids=input_ids,
                                          attention_mask=attention_mask,
                                          token_type_ids=token_type_ids,
                                          position_ids=position_ids,
                                          labels=labels)

                            logits = output.logits
                            loss = output.loss
                            valid_losses.append(loss.item())

                            batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                            batch_labels = [int(example) for example in labels]

                            predictions += batch_predictions
                            target_labels += batch_labels


                    acc = compute_acc(predictions, target_labels)
                    valid_loss = sum(valid_losses) / len(valid_losses)

                    wan_valid_acc.append(float(acc))
                    wandb.log({'validation_acc': sum(wan_valid_acc)/len(wan_valid_acc)})

                    sum_acc.append(float(acc)) ############
                    wandb.log({"sum_acc": sum(sum_acc) / len(sum_acc)}) #####


                    if lowest_valid_loss > valid_loss:
                        print('Acc for model which have lower valid loss: ', acc)
                        torch.save(model.state_dict(), "./pytorch_model.bin")
                        lowest_valid_loss = valid_loss

                else:
                    #avg train loss
                    wandb.log({'avg_train_loss': sum(wan_train_loss) / len(wan_train_loss)})

            scheduler.step()


In [None]:
wandb.agent( sweep_id , function=train, count=1)

[34m[1mwandb[0m: Agent Starting Run: khzpr8t3 with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 1e-05


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.d

Acc for model which have lower valid loss:  0.97225


Epoch 0:  40%|███▉      | 692/1732 [02:45<04:04,  4.25batch/s, loss=0.125] 
Eval:   0%|          | 0/8 [00:00<?, ?it/s][A
Eval:  12%|█▎        | 1/8 [00:00<00:01,  3.60it/s][A
Eval:  25%|██▌       | 2/8 [00:00<00:01,  4.80it/s][A
Eval:  38%|███▊      | 3/8 [00:00<00:00,  5.39it/s][A
Eval:  50%|█████     | 4/8 [00:00<00:00,  5.85it/s][A
Eval:  62%|██████▎   | 5/8 [00:00<00:00,  5.71it/s][A
Eval:  75%|███████▌  | 6/8 [00:01<00:00,  5.77it/s][A
Eval:  88%|████████▊ | 7/8 [00:01<00:00,  5.70it/s][A
Eval: 100%|██████████| 8/8 [00:01<00:00,  6.12it/s][A
                                                   [A

Acc for model which have lower valid loss:  0.977


Epoch 0:  58%|█████▊    | 1006/1732 [04:01<02:44,  4.41batch/s, loss=0.0674]

In [None]:
if __name__ == '__main__':
    main()