# Import requirements

In [1]:
!pip install transformers
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mleadawon[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import wandb

wandb.init()
wandb.run.name = 'test_run_1'
wandb.run.save()



ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mleadawon[0m. Use [1m`wandb login --relogin`[0m to force relogin




True

In [4]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW
)

#new model
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification,

    AutoModelForSequenceClassification,
    AutoTokenizer
)

# 1. Preprocess

In [5]:
def make_id_file(task, tokenizer):
    def make_data_strings(file_name):
        data_strings = []
        with open(os.path.join(file_name), 'r', encoding='utf-8') as f:
            id_file_data = [tokenizer.encode(line.lower()) for line in f.readlines()]
        for item in id_file_data:
            data_strings.append(' '.join([str(k) for k in item]))
        return data_strings
    
    print('it will take some times...')
    train_pos = make_data_strings('sentiment.train.1')
    train_neg = make_data_strings('sentiment.train.0')
    dev_pos = make_data_strings('sentiment.dev.1')
    dev_neg = make_data_strings('sentiment.dev.0')

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

In [6]:
tokenizer = AutoTokenizer.from_pretrained('VictorSanh/roberta-base-finetuned-yelp-polarity')

In [7]:
from google.colab import files
uploaded = files.upload()

In [8]:
!ls

pytorch_model.bin  sentiment.dev.0  sentiment.train.0  test_no_label.csv
sample_data	   sentiment.dev.1  sentiment.train.1  wandb


In [9]:
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer)

it will take some times...
make id file finished!


In [10]:
train_pos[:10]

['0 3463 39462 689 479 50118 2',
 '0 16101 428 2111 544 479 50118 2',
 '0 10010 67 33 1230 24827 8 2480 6353 61 16 269 205 479 50118 2',
 '0 405 128 29 10 205 7 14317 9379 1073 324 479 50118 2',
 '0 627 813 16 5192 479 50118 2',
 '0 8396 2003 689 479 50118 2',
 '0 8396 544 479 50118 2',
 '0 29 18615 9 183 16 17798 8 3739 9 24827 479 50118 2',
 '0 12338 317 13 4592 50 2003 14967 8 4437 479 50118 2',
 '0 627 92 1186 1326 2770 479 50118 2']

In [11]:
train_neg[:10]

['0 118 21 16748 22037 479 50118 2',
 '0 2527 15 7 5 9379 1073 918 2156 5 24 21999 16 937 422 9 5 7259 479 50118 2',
 '0 4691 16980 4884 8 10 4866 9 30274 24515 479 50118 2',
 '0 23702 269 780 359 45 11314 9 5 68 18134 42666 1215 425 6694 479 50118 2',
 '0 10815 2156 5 19464 9379 1073 324 2156 24 16 35790 22191 479 50118 2',
 '0 118 56 7 582 68 18134 42666 1215 7 1606 7134 7 5 9379 1073 324 479 50118 2',
 '0 8877 174 162 89 21 10 1427 13 5 10386 15 5 526 479 50118 2',
 '0 1322 47 27537 162 17487 50118 2',
 '0 118 21 45 164 7 582 13 5 10386 15 5 526 479 50118 2',
 '0 118 2740 24 396 24515 2156 20406 2156 21568 2156 50 10386 479 50118 2']

In [12]:
class SentimentDataset(object):
    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []

        for pos_sent in pos:
            self.data += [self._cast_to_int(pos_sent.strip().split())]
            self.label += [[1]]
        for neg_sent in neg:
            self.data += [self._cast_to_int(neg_sent.strip().split())]
            self.label += [[0]]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample), np.array(self.label[index])

In [13]:
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
dev_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)

In [14]:
for i, item in enumerate(train_dataset):
    print(item)
    if i == 10:
        break

(array([    0,  3463, 39462,   689,   479, 50118,     2]), array([1]))
(array([    0, 16101,   428,  2111,   544,   479, 50118,     2]), array([1]))
(array([    0, 10010,    67,    33,  1230, 24827,     8,  2480,  6353,
          61,    16,   269,   205,   479, 50118,     2]), array([1]))
(array([    0,   405,   128,    29,    10,   205,     7, 14317,  9379,
        1073,   324,   479, 50118,     2]), array([1]))
(array([    0,   627,   813,    16,  5192,   479, 50118,     2]), array([1]))
(array([    0,  8396,  2003,   689,   479, 50118,     2]), array([1]))
(array([    0,  8396,   544,   479, 50118,     2]), array([1]))
(array([    0,    29, 18615,     9,   183,    16, 17798,     8,  3739,
           9, 24827,   479, 50118,     2]), array([1]))
(array([    0, 12338,   317,    13,  4592,    50,  2003, 14967,     8,
        4437,   479, 50118,     2]), array([1]))
(array([    0,   627,    92,  1186,  1326,  2770,   479, 50118,     2]), array([1]))
(array([    0,  9226,   317,    21,   

In [15]:
def collate_fn_style(samples):
    input_ids, labels = zip(*samples)
    max_len = max(len(input_id) for input_id in input_ids)

    sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]

    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
                             batch_first=True)

    attention_mask = torch.tensor(
        [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
         sorted_indices])
    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])
    labels = torch.tensor(np.stack(labels, axis=0)[sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids, labels

In [16]:
train_batch_size=128 #batch size -> 32 to 128
eval_batch_size=128 #batch size -> 64 to 128

### wandb start
# you can also initialize your run with a config
wandb.init(config={"batch_size": 128,
                   "learning_rate":5e-5,})

train_batch_size=wandb.config.batch_size
### wandb end


train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=0)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                         shuffle=True, collate_fn=collate_fn_style, #shuffle false -> true
                                         num_workers=0)

In [17]:
# random seed
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForSequenceClassification.from_pretrained('VictorSanh/roberta-base-finetuned-yelp-polarity')
model.to(device)


## wandb start

wandb.watch(model)
## wandb end

Some weights of the model checkpoint at VictorSanh/roberta-base-finetuned-yelp-polarity were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[]

In [18]:
model.train()
learning_rate = 5e-5

## wandb start

learning_rate = wandb.config.learning_rate
## wandb end

optimizer = AdamW(model.parameters(), lr=learning_rate)



In [19]:
def compute_acc(predictions, target_labels):
    return (np.array(predictions) == np.array(target_labels)).mean()

In [20]:
def train():    
    train_epoch = 3
    lowest_valid_loss = 9999.
    for epoch in range(train_epoch):
        with tqdm(train_loader, unit="batch") as tepoch:
            for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
                tepoch.set_description(f"Epoch {epoch}")
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                token_type_ids = token_type_ids.to(device)
                position_ids = position_ids.to(device)
                labels = labels.to(device, dtype=torch.long)

                optimizer.zero_grad()

                # output = model(input_ids=input_ids,
                #                attention_mask=attention_mask,
                #                token_type_ids=token_type_ids,
                #                position_ids=position_ids,
                #                labels=labels)
                output = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            labels=labels)

                loss = output.loss
                loss.backward()

                optimizer.step()

                tepoch.set_postfix(loss=loss.item())


                
                if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
                    # Evaluate the model five times per epoch
                    with torch.no_grad():
                        model.eval()
                        valid_losses = []
                        predictions = []
                        target_labels = []
                        for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                    desc='Eval',
                                                                                                    position=1,
                                                                                                    leave=None):
                            input_ids = input_ids.to(device)
                            attention_mask = attention_mask.to(device)
                            token_type_ids = token_type_ids.to(device)
                            position_ids = position_ids.to(device)
                            labels = labels.to(device, dtype=torch.long)

                            # output = model(input_ids=input_ids,
                            #                attention_mask=attention_mask,
                            #                token_type_ids=token_type_ids,
                            #                position_ids=position_ids,
                            #                labels=labels)
                            output = model(input_ids=input_ids,
                                            attention_mask=attention_mask,
                                            token_type_ids=token_type_ids,
                                            position_ids=position_ids,
                                            labels=labels)

                            logits = output.logits
                            loss = output.loss
                            valid_losses.append(loss.item())

                            batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                            batch_labels = [int(example) for example in labels]

                            predictions += batch_predictions
                            target_labels += batch_labels
                    
                    acc = compute_acc(predictions, target_labels)
                    valid_loss = sum(valid_losses) / len(valid_losses)

                    ## wandb ##
                    wandb.log({
                                "Test Accuracy": acc,
                                "Test Loss": valid_loss})
                    ##

                    if lowest_valid_loss > valid_loss:
                        print('Acc for model which have lower valid loss: ', acc)
                        torch.save(model.state_dict(), "./pytorch_model.bin")
                        lowest_valid_loss = valid_loss
                    

In [21]:
import math
sweep_config = {
    'name' : 'bayes-test',
    'method': 'random',
    'metric' : {
        'name': 'Test_loss',
        'goal': 'minimize'   
        },
    'parameters' : {
        # 'optimizer': {
        #     'values': ['adam', 'sgd']
        #     },
        # 'dropout': {
        #     'values': [0.3, 0.4]
        #     },
        'learning_rate': {
            'distribution': 'uniform',
            'min': 0,
            'max': 0.1
            },
        # 'epochs': {
        #     'values': [5, 6]
        #     },
        'batch_size': {
            'distribution': 'q_log_uniform',
            'q': 1,
            'min': math.log(32),
            'max': math.log(256),
            }
        }
    }
sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id, train, count=2)



Create sweep with ID: c2ptyge8
Sweep URL: https://wandb.ai/leadawon/uncategorized/sweeps/c2ptyge8


[34m[1mwandb[0m: Agent Starting Run: 9e292gv5 with config:
[34m[1mwandb[0m: 	batch_size: 70
[34m[1mwandb[0m: 	learning_rate: 0.008799509607042278
Epoch 0:  20%|█▉        | 692/3463 [05:26<21:18,  2.17batch/s, loss=0.094] 
Eval:   0%|          | 0/32 [00:00<?, ?it/s][A
Eval:   3%|▎         | 1/32 [00:00<00:04,  6.24it/s][A
Eval:   6%|▋         | 2/32 [00:00<00:04,  6.22it/s][A
Eval:   9%|▉         | 3/32 [00:00<00:04,  6.17it/s][A
Eval:  12%|█▎        | 4/32 [00:00<00:04,  6.24it/s][A
Eval:  16%|█▌        | 5/32 [00:00<00:04,  6.16it/s][A
Eval:  19%|█▉        | 6/32 [00:00<00:04,  6.40it/s][A
Eval:  22%|██▏       | 7/32 [00:01<00:03,  6.67it/s][A
Eval:  25%|██▌       | 8/32 [00:01<00:03,  6.56it/s][A
Eval:  28%|██▊       | 9/32 [00:01<00:03,  6.80it/s][A
Eval:  31%|███▏      | 10/32 [00:01<00:03,  6.67it/s][A
Eval:  34%|███▍      | 11/32 [00:01<00:03,  6.28it/s][A
Eval:  38%|███▊      | 12/32 [00:01<00:03,  6.53it/s][A
Eval:  41%|████      | 13/32 [00:02<00:02,  6.

In [22]:
wandb.run.save()

BrokenPipeError: ignored

In [None]:
import pandas as pd
test_df = pd.read_csv('test_no_label.csv')

In [None]:
test_dataset = test_df['Id']

In [None]:
def make_id_file_test(tokenizer, test_dataset):
    data_strings = []
    id_file_data = [tokenizer.encode(sent.lower()) for sent in test_dataset]
    for item in id_file_data:
        data_strings.append(' '.join([str(k) for k in item]))
    return data_strings

In [None]:
test = make_id_file_test(tokenizer, test_dataset)

In [None]:
test[:10]

In [None]:
class SentimentTestDataset(object):
    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        self.data = []

        for sent in test:
            self.data += [self._cast_to_int(sent.strip().split())]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample)

In [None]:
test_dataset = SentimentTestDataset(tokenizer, test)

In [None]:
def collate_fn_style_test(samples):
    input_ids = samples
    max_len = max(len(input_id) for input_id in input_ids)

    #sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1] #bug
    sorted_indices = [i for i in range(len(input_ids))]
    
    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],batch_first=True)
    attention_mask = torch.tensor(
        [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
         sorted_indices])
    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids

In [None]:
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [None]:
with torch.no_grad():
    model.eval()
    predictions = []
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        # output = model(input_ids=input_ids,
        #                attention_mask=attention_mask,
        #                token_type_ids=token_type_ids,
        #                position_ids=position_ids)
        output = model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      token_type_ids=token_type_ids,
                      position_ids=position_ids)

        logits = output.logits
        print(logits)
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions += batch_predictions

In [None]:
test_df['Category'] = predictions

In [None]:
test_df.to_csv('submission2.csv', index=False)

In [None]:
print(lowest_valid_loss)