In [61]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
import torch.nn.functional as F

In [62]:
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [63]:
train_df = pd.read_csv('./nsmc/showingTelling_csv.txt', sep='\t', encoding='utf-16')
test_df = pd.read_csv('./nsmc/showingTelling_csv.txt', sep='\t', encoding='utf-16')

In [64]:
!pwd

/Users/kimkwangil/Project/01EssayFitAI/showing_telling


In [65]:
train_df

Unnamed: 0,id,label,text
0,9274899,0,John was sad to see his girlfriend leave.
1,8544678,0,The house was creepy.
2,6825595,0,I heard footsteps creeping behind me and it ma...
3,6723715,0,She was my best friend. I could tell her almos...
4,7898805,0,She hated it there because it smelled bad.
5,6315043,0,When they embraced she could tell he had been ...
6,6097171,1,The temperature fell and the ice reflected the...
7,8932678,0,Suzie was blind.
8,6242223,0,It was late fall.
9,7462111,1,She was a plumber and asked where the bathroom...


In [66]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

train_df = train_df.sample(frac=1, random_state=999)
test_df = test_df.sample(frac=1, random_state=999)

In [67]:
train_df

Unnamed: 0,id,label,text
2,6825595,0,I heard footsteps creeping behind me and it ma...
4,7898805,0,She hated it there because it smelled bad.
11,6900881,1,I was nervous.
6,6097171,1,The temperature fell and the ice reflected the...
15,2968565,0,Joey missed his father.
7,8932678,0,Suzie was blind.
9,7462111,1,She was a plumber and asked where the bathroom...
3,6723715,0,She was my best friend. I could tell her almos...
10,8425305,0,I had a great conversation with Tim over dinne...
8,6242223,0,It was late fall.


In [68]:
train_df.iloc[0, 2]

'I heard footsteps creeping behind me and it made the whole situation scarier.'

In [69]:
train_df.shape

(14, 3)

In [86]:
class NsmcDataset(Dataset):
    ''' Naver Sentiment Movie Corpus Dataset '''
    def __init__(self, df): #데이터 전처리
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 2]
        label = int(self.df.iloc[idx, 1])
        return text, label

In [87]:
nsmc_train_dataset = NsmcDataset(train_df)
train_loader = DataLoader(nsmc_train_dataset, batch_size=2, shuffle=True, num_workers=2)

In [88]:
nsmc_train_dataset.__getitem__(7)

('She was my best friend. I could tell her almost anything.', 0)

In [89]:
nsmc_train_dataset

<__main__.NsmcDataset at 0x13e7d5a50>

In [90]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x13e7d5710>

In [91]:
import torch

#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#device = torch.device("cuda")
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [92]:
for text, label in train_loader :
    print("text:",text)
    print("label:",label)

text: ('She was my best friend. I could tell her almost anything.', 'I had a great conversation with Tim over dinner and loved hearing his stories.')
label: tensor([0, 0])
text: ('I heard footsteps creeping behind me and it made the whole situation scarier.', 'The house was creepy.')
label: tensor([0, 0])
text: ('It was late fall.', 'She hated it there because it smelled bad.')
label: tensor([0, 0])
text: ('John was sad to see his girlfriend leave.', 'When they embraced she could tell he had been smoking and was scared.')
label: tensor([0, 0])
text: ('I was nervous.', 'She was so angry that she threw a rock into the house and broke the glass.')
label: tensor([1, 0])
text: ('The temperature fell and the ice reflected the sun.', 'Joey missed his father.')
label: tensor([1, 0])
text: ('She was a plumber and asked where the bathroom was.', 'Suzie was blind.')
label: tensor([1, 0])


In [93]:
#working properly but recomment using google colab!

optimizer = Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 500
epochs = 1
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(epochs):
    
    for text, label in train_loader:
        optimizer.zero_grad()
        
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr+=1




In [None]:
# evaluation
model.eval()

nsmc_eval_dataset = NsmcDataset(test_df)
eval_loader = DataLoader(nsmc_eval_dataset, batch_size=2, shuffle=False, num_workers=2)

total_loss = 0
total_len = 0
total_correct = 0

for text, label in eval_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)
 