# Classification showing and telling with pytorch

!pip install pytorch-transformers

In [11]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
import torch.nn.functional as F

In [12]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/showingTelling_csv.csv', delimiter = ',', encoding="ISO-8859-1")

data.index.name = "index"
data.columns = ["label", "text"]
data

Unnamed: 0_level_0,label,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,John was sad to see his girlfriend leave.
1,1,The house was creepy.
2,1,I heard footsteps creeping behind me and it ma...
3,1,She was my best friend. I could tell her almos...
4,1,She hated it there because it smelled bad.
...,...,...
254,2,"Her hand reached for the massive, iron door ha..."
255,2,The way the door decisively slammed behind her...
256,2,Dust coated every last surface. He ran his fin...
257,2,The lime green patio umbrella flapped happily ...


In [10]:
def shuffle(df, n=1, axis=0): #데이터가 1111 0000이라 잘 섞어주자. 
    df = data.copy()
    for _ in range(n):
        df.apply(np.random.shuffle, axis=axis)
    return df

shuffle(data)

train, test = train_test_split(data, test_size=0.2)
print(len(train))
print(len(test))

207
52


In [30]:
train.head() # 잘 섞였군! 하지만 데이터 클린징이 필요하다.

Unnamed: 0_level_0,label,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
59,1,Molly is a wonderful person.
162,2,"""I don't see why he needs an ax,"" continued Fe..."
219,2,The crowd boiled like a simmering cauldron. Wh...
211,2,From behind came the pounding of hoofbeats. Tr...
64,1,I was really mad.


In [31]:
test.head()

Unnamed: 0_level_0,label,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
54,1,It was an unusual cat.
30,1,She felt embarrassed when she fell.
119,2,She wore coveralls carried a plunger and metal...
137,2,The little girl pressed so close to the window...
204,2,Jay waited the whole day to go to reading clas...


In [32]:
import re
import pandas
import numpy
import json
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [33]:
def preprocessing(review, remove_stopwords=False):
        
    # 영어가 아닌 특수문자들을 공백(" ")으로 바꾸기
    review_text = re.sub("[^a-zA-Z]", " ", review)

    # 대문자들을 소문자로 바꾸고 공백단위로 텍스트들 나눠서 리스트로 만든다.
    words = review_text.lower().split()

    if remove_stopwords: 
        # 불용어들을 제거
    
        #영어에 관련된 불용어 불러오기
        stops = set(stopwords.words("english"))
        # 불용어가 아닌 단어들로 이루어진 새로운 리스트 생성
        words = [w for w in words if not w in stops]
        # 단어 리스트를 공백을 넣어서 하나의 글로 합친다.
        clean_review = ' '.join(words)

    else: # 불용어 제거하지 않을 때
        clean_review = ' '.join(words)

    return clean_review

In [34]:
clean_train_ = []
for review in train['text']:
    clean_train_.append(preprocessing(review, remove_stopwords=True))

# 전처리된 데이터 확인. 잘됨 !! ㅎ
clean_train_[0]

'molly wonderful person'

In [35]:
clean_train_[:5]

['molly wonderful person',
 'see needs ax continued fern eight',
 'crowd boiled like simmering cauldron guards ft looking people threw apple cores aimed well polished helmets yells curses cut air captain stepped forward hands spread',
 'behind came pounding hoofbeats tree branches whipped across orias fs face showered saddle leaves gritted teeth face set snarl become protection unjust world would catch must catch',
 'really mad']

In [36]:
clean_test_ = []
for review in test['text']:
    clean_test_.append(preprocessing(review, remove_stopwords=True))
    
# 전처리된 데이터 확인. 잘됨 !! ㅎ
clean_test_[0]

'unusual cat'

In [37]:
clean_test_[:5]

['unusual cat',
 'felt embarrassed fell',
 'wore coveralls carried plunger metal toolbox wrenches various sizes hung leather belt around waist gpoint head h said',
 'little girl pressed close window breath fogged glass',
 'jay waited whole day go reading class always smiled got']

In [38]:
print(train["label"]) #showing telling 컬럼값을 확인해보고

index
59     1
162    2
219    2
211    2
64     1
      ..
139    2
182    2
167    2
164    2
143    2
Name: label, Length: 206, dtype: int64


In [39]:
train['cleaned_text'] = clean_train_ # 이제 전처리된 내용을 한눈에 비교해 볼 수 있다.
train[:5] #데이터 앞부분 5개반 확인해보자

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,label,text,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
59,1,Molly is a wonderful person.,molly wonderful person
162,2,"""I don't see why he needs an ax,"" continued Fe...",see needs ax continued fern eight
219,2,The crowd boiled like a simmering cauldron. Wh...,crowd boiled like simmering cauldron guards ft...
211,2,From behind came the pounding of hoofbeats. Tr...,behind came pounding hoofbeats tree branches w...
64,1,I was really mad.,really mad


In [40]:
test['cleaned_text'] = clean_test_
test[:5] #test 데이터셋도 전처리된 결과를 확인할 수 있다.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,label,text,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
54,1,It was an unusual cat.,unusual cat
30,1,She felt embarrassed when she fell.,felt embarrassed fell
119,2,She wore coveralls carried a plunger and metal...,wore coveralls carried plunger metal toolbox w...
137,2,The little girl pressed so close to the window...,little girl pressed close window breath fogged...
204,2,Jay waited the whole day to go to reading clas...,jay waited whole day go reading class always s...


In [41]:
train_dataset = train[['label', 'cleaned_text']] #전처리 1차 끝!
train_dataset[:5]

Unnamed: 0_level_0,label,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
59,1,molly wonderful person
162,2,see needs ax continued fern eight
219,2,crowd boiled like simmering cauldron guards ft...
211,2,behind came pounding hoofbeats tree branches w...
64,1,really mad


In [42]:
test_dataset = test[['label', 'cleaned_text']]  #전처리 1차 끝!
test_dataset.head()

Unnamed: 0_level_0,label,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
54,1,unusual cat
30,1,felt embarrassed fell
119,2,wore coveralls carried plunger metal toolbox w...
137,2,little girl pressed close window breath fogged...
204,2,jay waited whole day go reading class always s...


df_test = test_dataset.astype(str)
df_test.dtypes

df_train = train_dataset.astype(str)
df_train.dtypes

train_dataset.values #train_dataset의 df를 numpy array로 변환

test_dataset.values #역시 이것도 변환

In [43]:
train_dataset.dropna(inplace=True)
test_dataset.dropna(inplace=True)

train_dataset = train_dataset.sample(frac=1, random_state=999)
test_dataset = test_dataset.sample(frac=1, random_state=999)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [44]:
train_dataset

Unnamed: 0_level_0,label,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
29,1,lights sent signals village village
174,2,looks glumly mess behind cupboard knowing fll ...
47,1,night cold moonlit sleigh moved fast forest
112,2,crunching hit ears behind accelerating already...
117,2,suzie felt bench white cane
...,...,...
150,2,awaken annoying buzz alarm clock anything read...
248,2,town red brick brick would red smoke ashes all...
198,2,cynthia shrieked pulled wildly hair slammed do...
228,2,sun set evening sky malcolm slowly turned walk...


In [45]:
train_dataset.iloc[0, 1]


'lights sent signals village village'

In [46]:
class STdataset(Dataset):
    '''showing telling dataset'''
    def __init__(self, df): #데이터 전처리
        self.df = df
    
    def __len__(self): #데이터의 길이
        return len(self.df)
    
    def __getitem__(self, idx): #데이터 1개를 가져오기
        label = self.df.iloc[idx, 0]
        text = self.df.iloc[idx, 1]
        return label, text

In [47]:
ST_train_dataset = STdataset(train_dataset)
train_loader = DataLoader(ST_train_dataset, batch_size=2, shuffle=True, num_workers=2)

In [48]:
ST_train_dataset

<__main__.STdataset at 0x1a82e09d90>

In [49]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x1a4ea49c90>

In [50]:
#import torch

#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#device = torch.device("cuda")
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [51]:
optimizer = Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 500
epochs = 2
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(epochs):
    
    for text, label in train_loader:
        optimizer.zero_grad()
        
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr+=1

TypeError: split_with_sizes(): argument 'split_sizes' (position 1) must be tuple of ints, not str

In [None]:
# evaluation
model.eval()

nsmc_eval_dataset = NsmcDataset(test_df)
eval_loader = DataLoader(nsmc_eval_dataset, batch_size=2, shuffle=False, num_workers=2)

total_loss = 0
total_len = 0
total_correct = 0

for text, label in eval_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)
 

In [None]:
#이하 코드는 pytorch로 이진분류하려고 했으나 위 코드로 대체할거임, 위 코드가 BERT로 성능이 더 좋다는 뇌피셜! ㅋ

In [16]:
train_dataset.to_csv('datasets/train_datasets.csv', index=False, header=False, sep=',')
test_dataset.to_csv('datasets/test_datasets.csv', index=False, header=False, sep=',')

In [17]:
from torchtext import data # torchtext.data 임포트

# 필드 정의

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   batch_first=False,
                   is_target=True)

TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=str.split,
                  lower=True,
                  batch_first=True,
                  fix_length=20)


from torchtext.data import TabularDataset


train_data, test_data = TabularDataset.splits(
        path='datasets/', train='train_datasets.csv', test='test_datasets.csv', format='csv',
        fields=[('label', LABEL), ('text', TEXT)], skip_header=True)


print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('테스트 샘플의 개수 : {}'.format(len(test_data)))

#ref: https://wikidocs.net/60314

훈련 샘플의 개수 : 205
테스트 샘플의 개수 : 51


In [18]:
print(vars(train_data[2])) # text, label이 구분됨

{'label': '2', 'text': ['ekaterina', 'shocked', 'cold', 'fd', 'known', 'winters', 'never', 'far', 'north', 'never', 'deep', 'burrowed', 'furs', 'still', 'felt', 'eyelashes', 'freeze', 'crystals', 'ice', 'face', 'breath', 'frozen', 'solid', 'clear', 'night', 'raced', 'whispering', 'pines', 'like', 'feather', 'drawn', 'sheet', 'silver', 'seemed', 'magical', 'impossible', 'temporary', 'forbidden']}


In [19]:
 #단어장 생성
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)



#단어장 생성 확인
#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   


#ref :::::: https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/


Size of TEXT vocabulary: 1576
Size of LABEL vocabulary: 3
[('ft', 24), ('h', 17), ('fs', 16), ('like', 15), ('face', 13), ('never', 12), ('one', 12), ('felt', 11), ('could', 11), ('cold', 10)]
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x1a3ece3790>>, {'<unk>': 0, '<pad>': 1, 'ft': 2, 'h': 3, 'fs': 4, 'like': 5, 'face': 6, 'never': 7, 'one': 8, 'could': 9, 'felt': 10, 'cold': 11, 'red': 12, 'would': 13, 'another': 14, 'behind': 15, 'day': 16, 'house': 17, 'long': 18, 'looked': 19, 'said': 20, 'stairs': 21, 'c': 22, 'came': 23, 'door': 24, 'every': 25, 'night': 26, 'old': 27, 'see': 28, 'sun': 29, 'time': 30, 'black': 31, 'blood': 32, 'glass': 33, 'got': 34, 'late': 35, 'made': 36, 'nervous': 37, 'people': 38, 'reached': 39, 'tree': 40, 'two': 41, 'walked': 42, 'went': 43, 'away': 44, 'bright': 45, 'dark': 46, 'darkness': 47, 'eyes': 48, 'far': 49, 'father': 50, 'first': 51, 'hand': 52, 'knew': 53, 'last': 54, 'little': 55, 'man': 56, 'mother'

In [20]:
print(train_data.fields.items()) # tex, label 로 구분되어 있는 것을 확인할 수 있다.

dict_items([('label', <torchtext.data.field.Field object at 0x1a3ecc2250>), ('text', <torchtext.data.field.Field object at 0x1a3ecb5510>)])


In [22]:
import torch

#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 16

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [23]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [25]:
VOCAB_SIZE = len(train_data.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_data.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

TypeError: 'generator' object is not callable

In [26]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [27]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [29]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')


NameError: name 'model' is not defined

In [23]:
import torch.nn as nn

class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [24]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

In [25]:
#architecture
print(model)



classifier(
  (embedding): Embedding(1581, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)


In [26]:
#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

The model has 217,557 trainable parameters


TypeError: copy_(): argument 'other' (position 1) must be Tensor, not NoneType

In [72]:
print(test_data.fields.items()) # label, text가 각각 잘 구분되어 플드에 저작외었다. 이것을 이제 모델이 처 넣으면 된다.

dict_items([('label', <torchtext.data.field.Field object at 0x1a4ae03490>), ('text', <torchtext.data.field.Field object at 0x1a4ae034d0>)])


In [73]:
TEXT.build_vocab(train_data, min_freq=1, max_size=10000) #보캐브러리를 빌드한다. 그런데 왜하는겨?

In [74]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab))) #여기에 사용한 다어는 이많큼이다.

단어 집합의 크기 : 1542


In [75]:
print(TEXT.vocab.stoi) # 생성된 집합 내 단어들을 확인해보자

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x1a4ae032d0>>, {'<unk>': 0, '<pad>': 1, 'ft': 2, 'h': 3, 'fs': 4, 'could': 5, 'like': 6, 'behind': 7, 'one': 8, 'cold': 9, 'door': 10, 'face': 11, 'long': 12, 'looked': 13, 'see': 14, 'felt': 15, 'garden': 16, 'never': 17, 'said': 18, 'black': 19, 'c': 20, 'first': 21, 'hands': 22, 'night': 23, 'sun': 24, 'time': 25, 'day': 26, 'eyes': 27, 'front': 28, 'got': 29, 'hand': 30, 'late': 31, 'little': 32, 'stairs': 33, 'walked': 34, 'would': 35, 'back': 36, 'blood': 37, 'breath': 38, 'came': 39, 'every': 40, 'f': 41, 'far': 42, 'father': 43, 'house': 44, 'knew': 45, 'light': 46, 'made': 47, 'man': 48, 'nervous': 49, 'old': 50, 'reached': 51, 'red': 52, 'saw': 53, 'sweat': 54, 'tree': 55, 'two': 56, 'us': 57, 'village': 58, 'wind': 59, 'window': 60, 'already': 61, 'another': 62, 'away': 63, 'beautiful': 64, 'clouds': 65, 'dark': 66, 'darkness': 67, 'engine': 68, 'feet': 69, 'give': 70, 'glass': 71, 'go': 

In [44]:
#여기까지 잘됨, 이것을 이제 레이블, torch tensor로 변환 해야함. train_dataset, test_dataset 각각 처리할것

In [76]:
#토치텍스트의 테이터로더 생성

from torchtext.data import Iterator

batch_size = 16

In [77]:
train_loader = Iterator(dataset=train_data, batch_size = batch_size)
test_loader = Iterator(dataset=test_data, batch_size = batch_size)

print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_loader)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_loader)))

훈련 데이터의 미니 배치 수 : 13
테스트 데이터의 미니 배치 수 : 4


In [78]:
train_loader

<torchtext.data.iterator.Iterator at 0x1a4b00bad0>

In [79]:
type(train_loader)

torchtext.data.iterator.Iterator

In [80]:
batch = next(iter(train_loader)) # 16개씩 묶어줬음. 첫번째 미니배치에 저장

print(batch.text) #첫번째 미니 배치의 text 필드를 호출해서 확인해봄

tensor([[ 771, 1401,   72,   41,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [  86,  232,  110,  232,  188,  876,  900,  980, 1431,  634,   37,  132,
          295,   15,   54,  433,    9,  824,    1,    1],
        [ 426,   33,  361,  310,  965,  282, 1257,  153,  353, 1194,   28,   10,
            1,    1,    1,    1,    1,    1,    1,    1],
        [  23,    9, 1065, 1325,  382,  319,  331,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [ 579,  131,  312,  751, 1297, 1361,  102,   22,  933,    7,   36, 1017,
          975, 1380,  236,  721,  136,  191, 1317,  301],
        [  34,   33,   51,  183,  113,  979, 1077,   47,  140, 1008, 1284,  778,
          814,  633, 1237,   95,  725,  215,  826,   38],
        [1457,  808,  529,   90,  563,  627,  233,   22,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [ 203,  318,  218, 

In [81]:
batch = next(iter(train_loader)) # 첫번째 미니배치
print(batch.text[0]) # 첫번째 미니배치 중 첫번째 샘플

tensor([ 133,  819, 1381,  474,   55,  690,   63,  119,  588,  746,  962,    8,
        1131,  472,   55,  474, 1258,    6,  104,  449])


In [82]:
print(type(batch)) #미니배치 자료형 확인. 토치텍스늬 데이터로더는  'torchtext.data.batch.Batch'라는 객체를 가져온다. 

<class 'torchtext.data.batch.Batch'>


In [83]:
batch_tr = batch.text.numpy() #tensor를 array로 전환해봄(테스트) , 여기에 라벨값이 포함되어 있는데, 이것을 구분해서 추가해줘보자...
batch_tr[0]

array([ 133,  819, 1381,  474,   55,  690,   63,  119,  588,  746,  962,
          8, 1131,  472,   55,  474, 1258,    6,  104,  449])

In [84]:
# 여기가지 진행완료!!!

data_verif = []
data_verif = test_dataset
data_verif_len = len(data_verif)
data_verif_len

data_verif = []
data_verif = train_dataset
data_verif_len = len(data_verif)
data_verif_len

type(data_verif)

data_verif[:2]

In [89]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [90]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

AttributeError: 'DataFrame' object has no attribute 'get_vocab'

In [54]:
#원본코드인데.. 이미 가공된 데이터셋을 ngrams 처리해서 불러오기 때문에 입력데이터를 dataset에 맞게 수정해야 한다.

import torch
import torchtext
from torchtext.datasets import text_classification

NGRAMS = 2

import os

if not os.path.isdir('./data2'):
    os.mkdir('./data2')
    
    
#text_classification.DATASETS의 구조를 보고 결과데이터를 어떻게 생성하는지 분석하거나 이하 학습코드를 분석    
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./data2', ngrams=NGRAMS, vocab=None) 
#ref : https://pytorch.org/text/datasets.html#ag-news



BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

120000lines [00:08, 14582.67lines/s]
120000lines [00:15, 7822.49lines/s]
7600lines [00:00, 8340.65lines/s]


In [55]:
test_dataset[10:16]

[(3,
  tensor([    131,       5,   23258,      27,    2922,     357,    2688,     769,
              814,      14,      32,      15,      16,       6,     131,       7,
              230,     293,     452,     836,    6438,      85,       2,      51,
            43647,       2,   24372,       4,   60885,      51,  281059,       2,
                0,       9,   21969,     115,       2,      51,  108539,       2,
            36279,       4,      11,      81,      31,      90,      39,   23258,
                6,      27,     357,    4090,    1698,      53,       5,     273,
              821,       3,    1507,       7,       3,    1473,    3049,       2,
             9821,   53919,  115850,   75043,   30252,  478273,  244818,     822,
             4291,      43,      44,      46,     296,    2486,    2022,    8893,
            23909,   49141,  381768,    9169,   19041,      89,     122,       0,
            43648,   24031,   69185,   94356, 1174101,       0,  450795,       0,
           

In [56]:
train_dataset[0:10]

[(2,
  tensor([    572,     564,       2,    2326,   49106,     150,      88,       3,
             1143,      14,      32,      15,      32,      16,  443749,       4,
              572,     499,      17,      10,  741769,       7,  468770,       4,
               52,    7019,    1050,     442,       2,   14341,     673,  141447,
           326092,   55044,    7887,     411,    9870,  628642,      43,      44,
              144,     145,  299709,  443750,   51274,     703,   14312,      23,
          1111134,  741770,  411508,  468771,    3779,   86384,  135944,  371666,
             4052])),
 (2,
  tensor([  55003,    1474,    1150,    1832,    7559,      14,      32,      15,
               32,      16,    1262,    1072,     436,   55003,     131,       4,
           142576,      33,       6,    8062,      12,     756,  475640,       9,
           991346,    3186,       8,       3,     698,     329,       4,      33,
             6764, 1040465,   13979,      11,     278,     483,   

In [57]:
data_verif = []
data_verif = test_dataset
data_verif_len = len(data_verif)
data_verif_len

7600

In [58]:
data_verif_train = []
data_verif_train = train_dataset
data_verif_tr_len = len(data_verif_train)
data_verif_len

7600

In [59]:
os.path.exists('./data')
os.listdir('./data')

['.DS_Store', 'showingTelling.xlsx', 'showingTelling_csv.csv']

In [60]:
os.getcwd()

'/Users/kimkwangil/Project/01EssayFitAI/showing_telling'

In [61]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [62]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [64]:
train_dataset[0]

(2,
 tensor([    572,     564,       2,    2326,   49106,     150,      88,       3,
            1143,      14,      32,      15,      32,      16,  443749,       4,
             572,     499,      17,      10,  741769,       7,  468770,       4,
              52,    7019,    1050,     442,       2,   14341,     673,  141447,
          326092,   55044,    7887,     411,    9870,  628642,      43,      44,
             144,     145,  299709,  443750,   51274,     703,   14312,      23,
         1111134,  741770,  411508,  468771,    3779,   86384,  135944,  371666,
            4052]))

In [65]:
type(train_dataset)

torchtext.datasets.text_classification.TextClassificationDataset

In [63]:
VOCAB_SIZE

1308844

In [48]:
NUN_CLASS

4

In [4]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [7]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [8]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 25 seconds
	Loss: 0.0260(train)	|	Acc: 84.8%(train)
	Loss: 0.0001(valid)	|	Acc: 90.5%(valid)
Epoch: 2  | time in 0 minutes, 24 seconds
	Loss: 0.0118(train)	|	Acc: 93.7%(train)
	Loss: 0.0000(valid)	|	Acc: 89.3%(valid)
Epoch: 3  | time in 0 minutes, 24 seconds
	Loss: 0.0068(train)	|	Acc: 96.4%(train)
	Loss: 0.0001(valid)	|	Acc: 90.6%(valid)
Epoch: 4  | time in 0 minutes, 25 seconds
	Loss: 0.0038(train)	|	Acc: 98.2%(train)
	Loss: 0.0000(valid)	|	Acc: 90.8%(valid)
Epoch: 5  | time in 0 minutes, 24 seconds
	Loss: 0.0022(train)	|	Acc: 99.1%(train)
	Loss: 0.0000(valid)	|	Acc: 91.3%(valid)


In [9]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0002(test)	|	Acc: 89.3%(test)


In [12]:

import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

vocab = train_dataset.get_vocab()
model = model.to("cpu")

In [13]:
print("This is a %s news" %ag_news_label[predict(ex_text_str, model, vocab, 2)])

This is a Sports news
