# Classification showing and telling with pytorch

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/showingTelling_csv.csv', delimiter = ',')

data.index.name = "index"
data.columns = ["type", "text"]
data

Unnamed: 0_level_0,type,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,The house was creepy.
1,1,I heard footsteps creeping behind me and it ma...
2,1,She was my best friend. I could tell her almos...
3,1,She hated it there because it smelled bad.
4,1,When they embraced she could tell he had been ...
...,...,...
253,2,"Her hand reached for the massive, iron door ha..."
254,2,The way the door decisively slammed behind her...
255,2,Dust coated every last surface. He ran his fin...
256,2,The lime green patio umbrella flapped happily ...


In [2]:
def shuffle(df, n=1, axis=0): #데이터가 1111 0000이라 잘 섞어주자. 
    df = data.copy()
    for _ in range(n):
        df.apply(np.random.shuffle, axis=axis)
    return df

shuffle(data)

train, test = train_test_split(data, test_size=0.2)
print(len(train))
print(len(test))

206
52


In [3]:
train.head() # 잘 섞였군만! 하지만 데이터 전처리가 필요하다.

Unnamed: 0_level_0,type,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
132,2,and rain grew louder and seemed closer₩cJoey s...
26,1,I climbed the fence.
180,2,She trembled and looked up at him with fear in...
43,1,"Imagine someone burning the book page by page,..."
130,2,"the slow, steep walk up the stairs. ₩eEvery da..."


In [4]:
test.head()

Unnamed: 0_level_0,type,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
87,1,Elroy drove the boat for fifteen minutes until...
208,2,A saw and hammer dangled from his belt and an ...
138,2,My mother and I would take walks to watch the ...
21,1,Bill was frightened. He thought someone
152,2,"run down the stairs, let my dog out, and inhal..."


In [5]:
import re
import pandas
import numpy
import json
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [6]:
def preprocessing(review, remove_stopwords=False):
        
    # 영어가 아닌 특수문자들을 공백(" ")으로 바꾸기
    review_text = re.sub("[^a-zA-Z]", " ", review)

    # 대문자들을 소문자로 바꾸고 공백단위로 텍스트들 나눠서 리스트로 만든다.
    words = review_text.lower().split()

    if remove_stopwords: 
        # 불용어들을 제거
    
        #영어에 관련된 불용어 불러오기
        stops = set(stopwords.words("english"))
        # 불용어가 아닌 단어들로 이루어진 새로운 리스트 생성
        words = [w for w in words if not w in stops]
        # 단어 리스트를 공백을 넣어서 하나의 글로 합친다.
        clean_review = ' '.join(words)

    else: # 불용어 제거하지 않을 때
        clean_review = ' '.join(words)

    return clean_review

In [7]:
clean_train_ = []
for review in train['text']:
    clean_train_.append(preprocessing(review, remove_stopwords=True))

# 전처리된 데이터 확인. 잘됨 !! ㅎ
clean_train_[0]

'rain grew louder seemed closer cjoey sat still'

In [8]:
clean_train_[:5]

['rain grew louder seemed closer cjoey sat still',
 'climbed fence',
 'trembled looked fear eyes',
 'imagine someone burning book page page give us incorrect knowledge',
 'slow steep walk stairs eevery day fs f thought']

In [9]:
clean_test_ = []
for review in test['text']:
    clean_test_.append(preprocessing(review, remove_stopwords=True))
    
# 전처리된 데이터 확인. 잘됨 !! ㅎ
clean_test_[0]

'elroy drove boat fifteen minutes reached canadian waters cut engine began fishing'

In [10]:
clean_test_[:5]

['elroy drove boat fifteen minutes reached canadian waters cut engine began fishing',
 'saw hammer dangled belt adze hooked one thumbnail black bowed saw several long wood shavings caught curly hair',
 'mother would take walks watch glow chapel fs stained glass',
 'bill frightened thought someone',
 'run stairs let dog inhale breakfast scramble books jacket race front door']

In [11]:
print(train["type"]) #showing telling 컬럽값을 확인해보고

index
132    2
26     1
180    2
43     1
130    2
      ..
121    2
79     1
219    2
198    2
13     1
Name: type, Length: 206, dtype: int64


In [12]:
train['cleaned_text'] = clean_train_ # 이제 전처리된 내용을 한눈에 비교해 볼 수 있다.
train[:5] #데이터 앞부분 5개반 확인해보자

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,type,text,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
132,2,and rain grew louder and seemed closer₩cJoey s...,rain grew louder seemed closer cjoey sat still
26,1,I climbed the fence.,climbed fence
180,2,She trembled and looked up at him with fear in...,trembled looked fear eyes
43,1,"Imagine someone burning the book page by page,...",imagine someone burning book page page give us...
130,2,"the slow, steep walk up the stairs. ₩eEvery da...",slow steep walk stairs eevery day fs f thought


In [13]:
test['cleaned_text'] = clean_test_
test[:5] #test 데이터셋도 전처리된 결과를 확인할 수 있다.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,type,text,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
87,1,Elroy drove the boat for fifteen minutes until...,elroy drove boat fifteen minutes reached canad...
208,2,A saw and hammer dangled from his belt and an ...,saw hammer dangled belt adze hooked one thumbn...
138,2,My mother and I would take walks to watch the ...,mother would take walks watch glow chapel fs s...
21,1,Bill was frightened. He thought someone,bill frightened thought someone
152,2,"run down the stairs, let my dog out, and inhal...",run stairs let dog inhale breakfast scramble b...


In [14]:
train_dataset = train[['type', 'cleaned_text']] #전처리 1차 끝!
train_dataset[:5]

Unnamed: 0_level_0,type,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
132,2,rain grew louder seemed closer cjoey sat still
26,1,climbed fence
180,2,trembled looked fear eyes
43,1,imagine someone burning book page page give us...
130,2,slow steep walk stairs eevery day fs f thought


In [15]:
test_dataset = test[['type', 'cleaned_text']]  #전처리 1차 끝!
test_dataset.head()

Unnamed: 0_level_0,type,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
87,1,elroy drove boat fifteen minutes reached canad...
208,2,saw hammer dangled belt adze hooked one thumbn...
138,2,mother would take walks watch glow chapel fs s...
21,1,bill frightened thought someone
152,2,run stairs let dog inhale breakfast scramble b...


In [16]:
df_test = test_dataset.astype(str)
df_test.dtypes

type            object
cleaned_text    object
dtype: object

In [17]:
df_train = train_dataset.astype(str)
df_train.dtypes

type            object
cleaned_text    object
dtype: object

In [18]:
df_train = df_train[['cleaned_text', 'type']]
df_test = df_test[['cleaned_text', 'type']]

In [19]:
df_train[:5]

Unnamed: 0_level_0,cleaned_text,type
index,Unnamed: 1_level_1,Unnamed: 2_level_1
132,rain grew louder seemed closer cjoey sat still,2
26,climbed fence,1
180,trembled looked fear eyes,2
43,imagine someone burning book page page give us...,1
130,slow steep walk stairs eevery day fs f thought,2


In [20]:
df_train.to_csv('datasets/train_datasets.csv', index=False, header=False, sep=',')
df_test.to_csv('datasets/test_datasets.csv', index=False, header=False, sep=',')

In [21]:
from torchtext import data # torchtext.data 임포트

# 필드 정의

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   batch_first=False,
                   is_target=True)

TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=str.split,
                  lower=True,
                  batch_first=True,
                  fix_length=20)


from torchtext.data import TabularDataset


train_data, test_data = TabularDataset.splits(
        path='datasets/', train='train_datasets.csv', test='test_datasets.csv', format='csv',
        fields=[('text', TEXT), ('label', LABEL)], skip_header=True)


print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('테스트 샘플의 개수 : {}'.format(len(test_data)))

#ref: https://wikidocs.net/60314

훈련 샘플의 개수 : 205
테스트 샘플의 개수 : 51


In [22]:
print(vars(train_data[0])) # 성공

{'text': ['climbed', 'fence'], 'label': '1'}


In [23]:
 #단어장 생성
TEXT.build_vocab(train_data)
TEXT.build_vocab(test_data)

#단어장 생성 확인
print('Total vocabulary: {}'.format(len(TEXT.vocab)))
print('Token for "<unk>": {}'.format(TEXT.vocab.stoi['<unk>']))
print('Token for "<pad>": {}'.format(TEXT.vocab.stoi['<pad>']))



Total vocabulary: 577
Token for "<unk>": 0
Token for "<pad>": 1


In [24]:
print(train_data.fields.items())

dict_items([('text', <torchtext.data.field.Field object at 0x1a4ac65850>), ('label', <torchtext.data.field.Field object at 0x1a4ac9bdd0>)])


In [25]:
TEXT.build_vocab(train_data, min_freq=10, max_size=10000)

In [26]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))

단어 집합의 크기 : 14


In [27]:
print(TEXT.vocab.stoi) # 생성된 집합 내 단어들을 확인해보자

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x1a4ac4dd50>>, {'<unk>': 0, '<pad>': 1, 'ft': 2, 'h': 3, 'fs': 4, 'could': 5, 'like': 6, 'one': 7, 'felt': 8, 'house': 9, 'door': 10, 'face': 11, 'red': 12, 'time': 13})


In [28]:
from torchtext.data import Iterator
batch_size = 16

In [29]:
train_loader = Iterator(dataset=train_data, batch_size = batch_size)
test_loader = Iterator(dataset=test_data, batch_size = batch_size)

print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_loader)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_loader)))

훈련 데이터의 미니 배치 수 : 13
테스트 데이터의 미니 배치 수 : 4


In [34]:
batch = next(iter(train_loader)) # 첫번째 미니배치

print(batch.text) #첫번째 미니 배치의 text 필드를 호출

tensor([[ 0,  8,  0,  0, 10,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  0,  1,  1,  1,
          1,  1],
        [ 0,  0,  0,  0,  0,  3,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0,  5,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0,  0,  0,  2,  0,  0,  0,  5,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0,  0,  0,  0,  0,  0,  4,  0,  0,  8,  0,  0,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0,  0,  0,  0,  0, 12,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 

In [35]:
batch = next(iter(train_loader)) # 첫번째 미니배치
print(batch.text[0]) # 첫번째 미니배치 중 첫번째 샘플

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [18]:
# ['1,unusual cat'] 이렇게 저장되는 것을 아래와 같은 포멧으로 저장해야하는데...
# ['3',"The Importance reading from Bartlett 's Familiar Quotations"]

import numpy as np

train_list = df_train.values.tolist() #데이터프레임을 리스트로 변환
test_list = df_test.values.tolist()

#train_list[:3]
test_list[:3] #리스트로 변환, 데이터는 문자료 변환 확인, 이것을 직접 torchtext로 입력해야 하는디...


#How to create a torchtext.data.TabularDataset directly from a list or dict........이 방법이 최선

class TabularDataset_From_List(data.Dataset):

    def __init__(self, input_list, format, fields, skip_header=False, **kwargs):
        make_example = {
            'json': Example.fromJSON, 'dict': Example.fromdict,
            'tsv': Example.fromTSV, 'csv': Example.fromCSV}[format.lower()]

        examples = [make_example(item, fields) for item in input_list]

        if make_example in (Example.fromdict, Example.fromJSON):
            fields, field_dict = [], fields
            for field in field_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)

        super(TabularDataset_From_List, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, path=None, root='.data', train=None,
               test=None, **kwargs):
        if path is None:
            path = cls.download(root)
        train_data = None if train is None else cls(
            train, **kwargs)
        test_data = None if test is None else cls(
            test, **kwargs)
        return tuple(d for d in (train_data, test_data)
                     if d is not None)

form torchtext import data
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu" # gpu 사용불가시 cpu로 학습 설정

def tokenizer(text):
    return list(text.lower())

#저장한 데이터를 확인해보니 데이터 구분이 ''로 합져 있음.  그래서 직접 리스트를 torchtext로 넣을 수는 없을까
#일단 이 코드는 실행하지 않고 데이터를 처리해보자
Train_files = open("datasets/train_datasets.tsv", "r")
Train_files.read().split(',')

Test_files = open("datasets/test_datasets.tsv", "r")
Test_files.read().split(',')[:3]

import torchtext
from torchtext.data import TabularDataset
from torchtext.data import Field
from torchtext.data import Iterator


def tokenizer(text):
    return list(text.lower())


#필드 생성
TEXT = Field(sequential=True,  # for sequence data like sentence, set this argument True
             use_vocab=True,
             tokenize=tokenizer,  # you can define your own tokenizer
             lower=True, 
             batch_first=True)  

LABEL = Field(sequential=False,  # for non-sequence data like label, set this argument False
              use_vocab=False,   # since label is a number, we don't have to use vocabulary
              preprocessing = lambda x: int(x),  # this preprocessing is used after Tokenize and before Numericalize
              batch_first=True)

#데이터셋 생성 >>>> 여기서 그냥 train_list, test_list를 직접 입력하면
train_data = TabularDataset(path='datasets/train_datasets.tsv',
                                  format = 'tsv',
                                  fields = [('type', LABEL), ('text', TEXT)])

test_data = TabularDataset(path='datasets/test_datasets.tsv',
                                  format = 'tsv',
                                  fields = [('type', LABEL), ('text', TEXT)])


 #단어장 생성
TEXT.build_vocab(train_data)
#TEXT.build_vocab(test_data)

#단어장 생성 확인
print('Total vocabulary: {}'.format(len(TEXT.vocab)))
print('Token for "<unk>": {}'.format(TEXT.vocab.stoi['<unk>']))
print('Token for "<pad>": {}'.format(TEXT.vocab.stoi['<pad>']))

#데이터로더 생성
train_loader, test_loader = TabularDataset.splits((train_data, test_data),
                                                 batch_size = 3,
                                                 device=None, #gpu -> "cuda"
                                                 repeat=False)

In [33]:
#이런 방법도 시도. custom dataset tabulardataset
import torchtext
from torchtext import data


def tokenizer(s): 
    return [w.text.lower() for w in nlp(clean(s))]

def clean(text):
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric character
    text = re.sub(r'https?:/\/\S+', ' ', text) # remove links
    return text.strip()



LABEL = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                         unk_token=None)

TEXT = data.Field(sequential=True, 
                        tokenize=tokenizer, 
                        include_lengths=True, 
                        use_vocab=True)




train, test = data.TabularDataset.splits(
        path='datasets/', train='train_datasets.tsv',
        test='test_datasets.tsv', format='tsv',
        fields=[('Text', TEXT), ('Label', LABEL)])


TEXT.build_vocab(train, vectors="glove.6B.100d")

NameError: name 'nlp' is not defined

In [None]:
import torch
import torchtext
from torchtext.data import TabularDataset
from torchtext.data import Field
from torchtext.data import Iterator


LABEL = Field(sequential=False,  
              use_vocab=False,   
              preprocessing = lambda x: int(x),  
              batch_first=True)

TEXT = Field(sequential=True,
             use_vocab=True,
             tokenize=str.split,
             lower=True, 
             batch_first=True)

NGRAMS = 2
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader, test_loader = TabularDataset.splits((Input_Train_dataset_List,Input_Test_dataset_List),
                                                    batch_size = BATCH_SIZE,
                                                    device = device,
                                                    repeat = False) 

TEXT.build_vocab(train_data)



In [25]:
dataloader = torch.utils.data.DataLoader(train_dataset,
                                         batch_size=16)
for data in dataloader:
    print(data['input'].shape, data['label'])

KeyError: 0

In [23]:
import pandas as pd
import torch

# determine the supported device
def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu') # don't have GPU 
    return device

# convert a df to tensor to be used in pytorch
def df_to_tensor(df):
    device = get_device()
    return torch.from_numpy(df.values).float().to(device)

df_tensor = df_to_tensor(train_dataset)
#series_tensor = df_to_tensor(series)
df_tensor[0:3]

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

In [46]:
#원본코드인데.. 이미 가공된 데이터셋을 ngrams 처리해서 불러오기 때문에 입력데이터를 dataset에 맞게 수정해야 한다.

import torch
import torchtext
from torchtext.datasets import text_classification
NGRAMS = 2
import os
if not os.path.isdir('./data2'):
    os.mkdir('./data2')
    
    
#text_classification.DATASETS의 구조를 보고 결과데이터를 어떻게 생성하는지 분석하거나 이하 학습코드를 분석하자    
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./data2', ngrams=NGRAMS, vocab=None) 
#ref : https://pytorch.org/text/datasets.html#ag-news



BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

120000lines [00:08, 14727.53lines/s]
120000lines [00:15, 7826.84lines/s]
7600lines [00:00, 7879.59lines/s]


In [48]:
test_dataset[0]

(2,
 tensor([  1169,     12,    111,    187,   2115,     40,    233,   2575,   7136,
            535,     22,  13915, 262719,    315,     90,     52,     17,   8110,
             17,     40,    233,     20,  31666,   3797,    436,    260,  25589,
              2,  34651,      0,      0,      0,      0,  38519,      0,  61363,
         432460,  10954,  62400,      0,      0,   3701,   1976, 319981, 485989,
         728324,  94121,  38519,   1416,      0,      0, 159317,      0, 774722,
              0]))

In [26]:
train_dataset[0:2]

[(2,
  tensor([    572,     564,       2,    2326,   49106,     150,      88,       3,
             1143,      14,      32,      15,      32,      16,  443749,       4,
              572,     499,      17,      10,  741769,       7,  468770,       4,
               52,    7019,    1050,     442,       2,   14341,     673,  141447,
           326092,   55044,    7887,     411,    9870,  628642,      43,      44,
              144,     145,  299709,  443750,   51274,     703,   14312,      23,
          1111134,  741770,  411508,  468771,    3779,   86384,  135944,  371666,
             4052])),
 (2,
  tensor([  55003,    1474,    1150,    1832,    7559,      14,      32,      15,
               32,      16,    1262,    1072,     436,   55003,     131,       4,
           142576,      33,       6,    8062,      12,     756,  475640,       9,
           991346,    3186,       8,       3,     698,     329,       4,      33,
             6764, 1040465,   13979,      11,     278,     483,   

In [8]:
data_verif = []
data_verif = test_dataset
data_verif_len = len(data_verif)
data_verif_len

7600

In [66]:
os.path.exists('./data')
os.listdir('./data')

['.DS_Store', 'showingTelling.xlsx', 'showingTelling_csv.csv']

In [67]:
os.getcwd()

'/Users/kimkwangil/Project/01EssayFitAI/showing_telling'

In [31]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [32]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

AttributeError: 'DataFrame' object has no attribute 'get_vocab'

In [33]:
VOCAB_SIZE

NameError: name 'VOCAB_SIZE' is not defined

In [124]:
NUN_CLASS

4

In [4]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [7]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [8]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 25 seconds
	Loss: 0.0260(train)	|	Acc: 84.8%(train)
	Loss: 0.0001(valid)	|	Acc: 90.5%(valid)
Epoch: 2  | time in 0 minutes, 24 seconds
	Loss: 0.0118(train)	|	Acc: 93.7%(train)
	Loss: 0.0000(valid)	|	Acc: 89.3%(valid)
Epoch: 3  | time in 0 minutes, 24 seconds
	Loss: 0.0068(train)	|	Acc: 96.4%(train)
	Loss: 0.0001(valid)	|	Acc: 90.6%(valid)
Epoch: 4  | time in 0 minutes, 25 seconds
	Loss: 0.0038(train)	|	Acc: 98.2%(train)
	Loss: 0.0000(valid)	|	Acc: 90.8%(valid)
Epoch: 5  | time in 0 minutes, 24 seconds
	Loss: 0.0022(train)	|	Acc: 99.1%(train)
	Loss: 0.0000(valid)	|	Acc: 91.3%(valid)


In [9]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0002(test)	|	Acc: 89.3%(test)


In [12]:

import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

vocab = train_dataset.get_vocab()
model = model.to("cpu")

In [13]:
print("This is a %s news" %ag_news_label[predict(ex_text_str, model, vocab, 2)])

This is a Sports news
