<a href="https://colab.research.google.com/github/jihoon99/transformers/blob/master/2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# GPU

import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU not found')
print(f'Found GPU at : {device_name}')


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# torch.cuda.get_device_name(0)

Found GPU at : /device:GPU:0


'Tesla T4'

In [11]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b5/d5/c6c23ad75491467a9a84e526ef2364e523d45e2b0fae28a7cbe8689e7e84/transformers-4.8.1-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 8.4MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 50.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |█████

In [12]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot

# load data set : GLUE

In [14]:
df = pd.read_csv("/content/drive/MyDrive/study/transformer_data/2/in_domain_train.tsv", delimiter='\t', header = None, 
                 names = ['sentence_source','label','label_notes','sentence'])

'''
sentence_source : 문장의 출처
label : 0 - unacceptable, 1 - acceptable
label_notes : 저자가 주석 달아놓은거
sentence : 문장
'''

In [15]:
df.shape

(8551, 4)

In [20]:
df.sample(10)

Unnamed: 0,sentence_source,label,label_notes,sentence
5367,b_73,1,,susan does n't eat her vegetables enough .
4615,ks08,1,,john has been taken to the library .
1832,r-67,1,,"this guitar , i 've sung folk songs and accomp..."
8309,ad03,1,,bill 's reading shakespeare and maureen 's sin...
6848,m_02,1,,what place did john send the book ?
4675,ks08,1,,the scandal was talked about for days .
8490,ad03,1,,i did not understand .
800,bc01,0,*,mary desired john to go abroad .
1859,r-67,1,,"them , they ca n't stand each other ."
5201,kl93,1,,every student who handed in some homework will...


## BERT tokenizer

### add special token CLS, SEP

In [22]:
sentences = df.sentence.values

sentences = ['[CLS] ' + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

In [23]:
# BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case = True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print("Tokenize the first sentence : ")
print(tokenized_texts[0])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


Tokenize the first sentence : 
['[CLS]', 'our', 'friends', 'wo', 'n', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']


## PreProcess

In [25]:
max([len(i) for i in tokenized_texts])

47

In [28]:
# 논문에서는 512길이의 토큰을 썻지만 여기서는 128만 쓰겟다, 왜냐면 가장긴 sequence가 47이기 때문이다
MAX_LEN = 128

# Token To Index(BERT index)
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad to my tokens
input_ids = pad_sequences(input_ids, maxlen = MAX_LEN, dtype = 'long', truncating = 'post', padding = 'post')

## Creating Attention Mask

In [34]:
# padding된 부분에 attention되길 원하지 않음. 그래서 attention할때 단어가 있는 부분을 1로 단어가 없는 부분을 0로 맞춰줘야함.
attention_masks = []

# mask 생성
for se in input_ids:
    tmp_mask = [float(i>0) for i in se] # i는 BERT2index 변환 된것이라, 단어가 있으면 1이상의 숫자를 갖음.
    attention_masks.append(tmp_mask)

## Splitting data : train and valid

In [36]:
x_train, x_valid, y_train, y_valid = train_test_split(input_ids, labels, random_state = 42, test_size = 0.1)

mask_train, mask_valid, _, _ = train_test_split(attention_masks, input_ids, random_state = 42, test_size = 0.1)

# Transform to Torch

In [47]:
train_inputs = torch.tensor(x_train)
validation_inputs = torch.tensor(x_valid)
train_labels = torch.tensor(y_train)
validation_labels = torch.tensor(y_valid)
train_masks = torch.tensor(mask_train)
validation_masks = torch.tensor(mask_valid)

## Batch, iterator

In [43]:
BATCH_SIZE = 32

# TensorDataset : Gen, Yield
train_data = TensorDataset(train_inputs, train_masks, train_labels)
print(next(iter(train_data)))

# shuffling
train_sampler = RandomSampler(train_data)

# final dataloader
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = BATCH_SIZE)

(tensor([  101,  1996,  2341,  7480,  1010, 14572,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0]

In [48]:
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data) # 순서대로
validation_dataloader = DataLoader(validation_data, sampler = validation_sampler, batch_size = BATCH_SIZE)

# BERT model

In [52]:
try:
    import transformers
except:
    print("install transformer")
    !pip install transformers

from transformers import BertModel, BertConfig


configuration = BertConfig()

# model from 소문자 style configuration
model = BertModel(configuration)

# model
configuration = model.config
print(configuration)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



## Loading Pretrained Model(소문자)

In [55]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Optimizer grouped params
- weight decay

In [57]:
!git add .

fatal: not a git repository (or any of the parent directories): .git


In [58]:
!git init
!git add README.md
!git commit -m "first commit"
!git branch -M main
!git remote add origin https://github.com/jihoon99/transformers.git
!git push -u origin main

Initialized empty Git repository in /content/.git/
fatal: pathspec 'README.md' did not match any files

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@29b9a0059f9e.(none)')
error: refname refs/heads/master not found
fatal: Branch rename failed
error: src refspec main does not match any.
error: failed to push some refs to 'https://github.com/jihoon99/transformers.git'
