# Classification showing and telling with pytorch

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/showingTelling_csv.csv', delimiter = ',')

data.index.name = "index"
data.columns = ["type", "text"]
data

Unnamed: 0_level_0,type,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,The house was creepy.
1,1,I heard footsteps creeping behind me and it ma...
2,1,She was my best friend. I could tell her almos...
3,1,She hated it there because it smelled bad.
4,1,When they embraced she could tell he had been ...
...,...,...
253,2,"Her hand reached for the massive, iron door ha..."
254,2,The way the door decisively slammed behind her...
255,2,Dust coated every last surface. He ran his fin...
256,2,The lime green patio umbrella flapped happily ...


In [2]:
def shuffle(df, n=1, axis=0): #데이터가 1111 0000이라 잘 섞어주자. 
    df = data.copy()
    for _ in range(n):
        df.apply(np.random.shuffle, axis=axis)
    return df

shuffle(data)

train, test = train_test_split(data, test_size=0.2)
print(len(train))
print(len(test))

206
52


In [3]:
train.head() # 잘 섞였군만! 하지만 데이터 전처리가 필요하다.

Unnamed: 0_level_0,type,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
66,1,The weather was bad.
256,2,The lime green patio umbrella flapped happily ...
167,2,"Archie scrabbling up the stairs, as usual curs..."
65,1,Jim was so angry that Blair was afraid.
147,2,"Whenever I am scheduled to give a speech, I su..."


In [4]:
test.head()

Unnamed: 0_level_0,type,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
129,2,Old Mr Chan used a tissue to wipe the sweat fr...
125,2,"championships, sings lead vocals in a rock ban..."
41,1,The father went to the hoghouse to kill the sm...
98,1,I saw clouds and lightning from the sea. I was...
227,2,The flowers in the front garden were long dead...


In [5]:
import re
import pandas
import numpy
import json
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [6]:
def preprocessing(review, remove_stopwords=False):
        
    # 영어가 아닌 특수문자들을 공백(" ")으로 바꾸기
    review_text = re.sub("[^a-zA-Z]", " ", review)

    # 대문자들을 소문자로 바꾸고 공백단위로 텍스트들 나눠서 리스트로 만든다.
    words = review_text.lower().split()

    if remove_stopwords: 
        # 불용어들을 제거
    
        #영어에 관련된 불용어 불러오기
        stops = set(stopwords.words("english"))
        # 불용어가 아닌 단어들로 이루어진 새로운 리스트 생성
        words = [w for w in words if not w in stops]
        # 단어 리스트를 공백을 넣어서 하나의 글로 합친다.
        clean_review = ' '.join(words)

    else: # 불용어 제거하지 않을 때
        clean_review = ' '.join(words)

    return clean_review

In [7]:
clean_train_ = []
for review in train['text']:
    clean_train_.append(preprocessing(review, remove_stopwords=True))

# 전처리된 데이터 확인. 잘됨 !! ㅎ
clean_train_[0]

'weather bad'

In [8]:
clean_train_[:5]

['weather bad',
 'lime green patio umbrella flapped happily breeze covered strawberry slushies watermelon pies bright cheerful stacks donuts emily stepped outside feature crystal pitcher pink lemonade spray warm cookies center table favorite summer dress knew soon stepped patio gate life would never',
 'archie scrabbling stairs usual cursing blinding wilting weight boxes clara could carry two three time without effort clara taking break squinting warm may sunshine trying get bearings peeled little purple vest leaned front gate kind place thing see ft sure',
 'jim angry blair afraid',
 'whenever scheduled give speech suffer wet clammy hands']

In [9]:
clean_test_ = []
for review in test['text']:
    clean_test_.append(preprocessing(review, remove_stopwords=True))
    
# 전처리된 데이터 확인. 잘됨 !! ㅎ
clean_test_[0]

'old mr chan used tissue wipe sweat face got ready'

In [10]:
clean_test_[:5]

['old mr chan used tissue wipe sweat face got ready',
 'championships sings lead vocals rock band speaks five languages',
 'father went hoghouse kill smallest pig born last night fern ft want father kill',
 'saw clouds lightning sea nervous upcoming storm avoided rain canopy ft see anyone streets filled bad smell sewers darkness',
 'flowers front garden long dead grass knee high paint flaking window frames pushed open front door rotten smell hit patches damp mold crept walls took one step forward stepping onto uneven creaky floorboard']

In [11]:
print(train["type"]) #showing telling 컬럽값을 확인해보고

index
66     1
256    2
167    2
65     1
147    2
      ..
113    2
101    1
5      1
186    2
78     1
Name: type, Length: 206, dtype: int64


In [12]:
train['cleaned_text'] = clean_train_ # 이제 전처리된 내용을 한눈에 비교해 볼 수 있다.
train[:5] #데이터 앞부분 5개반 확인해보자

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,type,text,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
66,1,The weather was bad.,weather bad
256,2,The lime green patio umbrella flapped happily ...,lime green patio umbrella flapped happily bree...
167,2,"Archie scrabbling up the stairs, as usual curs...",archie scrabbling stairs usual cursing blindin...
65,1,Jim was so angry that Blair was afraid.,jim angry blair afraid
147,2,"Whenever I am scheduled to give a speech, I su...",whenever scheduled give speech suffer wet clam...


In [13]:
test['cleaned_text'] = clean_test_
test[:5] #test 데이터셋도 전처리된 결과를 확인할 수 있다.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,type,text,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
129,2,Old Mr Chan used a tissue to wipe the sweat fr...,old mr chan used tissue wipe sweat face got ready
125,2,"championships, sings lead vocals in a rock ban...",championships sings lead vocals rock band spea...
41,1,The father went to the hoghouse to kill the sm...,father went hoghouse kill smallest pig born la...
98,1,I saw clouds and lightning from the sea. I was...,saw clouds lightning sea nervous upcoming stor...
227,2,The flowers in the front garden were long dead...,flowers front garden long dead grass knee high...


In [14]:
train_dataset = train[['type', 'cleaned_text']] #전처리 1차 끝!
train_dataset[:5]

Unnamed: 0_level_0,type,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
66,1,weather bad
256,2,lime green patio umbrella flapped happily bree...
167,2,archie scrabbling stairs usual cursing blindin...
65,1,jim angry blair afraid
147,2,whenever scheduled give speech suffer wet clam...


In [15]:
test_dataset = test[['type', 'cleaned_text']]  #전처리 1차 끝!
test_dataset.head()

Unnamed: 0_level_0,type,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
129,2,old mr chan used tissue wipe sweat face got ready
125,2,championships sings lead vocals rock band spea...
41,1,father went hoghouse kill smallest pig born la...
98,1,saw clouds lightning sea nervous upcoming stor...
227,2,flowers front garden long dead grass knee high...


In [16]:
df_test = test_dataset.astype(str)
df_test.dtypes

type            object
cleaned_text    object
dtype: object

In [17]:
df_train = train_dataset.astype(str)
df_train.dtypes

type            object
cleaned_text    object
dtype: object

In [35]:
train_dataset.to_csv('datasets/train_datasets.csv', index=False, header=False, sep=',')
test_dataset.to_csv('datasets/test_datasets.csv', index=False, header=False, sep=',')

In [39]:
from torchtext import data # torchtext.data 임포트

# 필드 정의

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   batch_first=False,
                   is_target=True)

TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=str.split,
                  lower=True,
                  batch_first=True,
                  fix_length=20)


from torchtext.data import TabularDataset


train_data, test_data = TabularDataset.splits(
        path='datasets/', train='train_datasets.csv', test='test_datasets.csv', format='csv',
        fields=[('label', LABEL), ('text', TEXT)], skip_header=True)


print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('테스트 샘플의 개수 : {}'.format(len(test_data)))

#ref: https://wikidocs.net/60314

훈련 샘플의 개수 : 205
테스트 샘플의 개수 : 51


In [40]:
print(vars(train_data[2])) # text, label이 구분됨

{'label': '1', 'text': ['jim', 'angry', 'blair', 'afraid']}


In [41]:
 #단어장 생성
TEXT.build_vocab(train_data)
#TEXT.build_vocab(test_data)

#단어장 생성 확인
print('Total vocabulary: {}'.format(len(TEXT.vocab)))
print('Token for "<unk>": {}'.format(TEXT.vocab.stoi['<unk>']))
print('Token for "<pad>": {}'.format(TEXT.vocab.stoi['<pad>']))



Total vocabulary: 536
Token for "<unk>": 0
Token for "<pad>": 1


In [42]:
print(train_data.fields.items()) # tex, label 로 구분되어 있는 것을 확인할 수 있다.

dict_items([('label', <torchtext.data.field.Field object at 0x10ab523d0>), ('text', <torchtext.data.field.Field object at 0x10ab52410>)])


In [43]:
print(test_data.fields.items())

dict_items([('label', <torchtext.data.field.Field object at 0x10ab523d0>), ('text', <torchtext.data.field.Field object at 0x10ab52410>)])


In [44]:
TEXT.build_vocab(train_data, min_freq=1, max_size=10000)

In [45]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))

단어 집합의 크기 : 1590


In [46]:
print(TEXT.vocab.stoi) # 생성된 집합 내 단어들을 확인해보자

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x1a4e950d10>>, {'<unk>': 0, '<pad>': 1, 'ft': 2, 'h': 3, 'fs': 4, 'could': 5, 'like': 6, 'one': 7, 'behind': 8, 'house': 9, 'cold': 10, 'time': 11, 'door': 12, 'felt': 13, 'first': 14, 'long': 15, 'sun': 16, 'back': 17, 'black': 18, 'c': 19, 'came': 20, 'face': 21, 'garden': 22, 'hands': 23, 'little': 24, 'looked': 25, 'never': 26, 'night': 27, 'said': 28, 'walked': 29, 'window': 30, 'away': 31, 'blood': 32, 'every': 33, 'glass': 34, 'hand': 35, 'james': 36, 'knew': 37, 'made': 38, 'old': 39, 'red': 40, 'room': 41, 'saw': 42, 'see': 43, 'stairs': 44, 'another': 45, 'boat': 46, 'bright': 47, 'castle': 48, 'dark': 49, 'day': 50, 'feet': 51, 'hair': 52, 'home': 53, 'light': 54, 'man': 55, 'two': 56, 'village': 57, 'angry': 58, 'arms': 59, 'beautiful': 60, 'books': 61, 'breath': 62, 'darkness': 63, 'dress': 64, 'eyes': 65, 'find': 66, 'gi': 67, 'go': 68, 'got': 69, 'hot': 70, 'last': 71, 'late': 72, 'le

In [47]:
#토치텍스트의 테이터로더 생성

from torchtext.data import Iterator

batch_size = 16

In [48]:
train_loader = Iterator(dataset=train_data, batch_size = batch_size)
test_loader = Iterator(dataset=test_data, batch_size = batch_size)

print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_loader)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_loader)))

훈련 데이터의 미니 배치 수 : 13
테스트 데이터의 미니 배치 수 : 4


In [49]:
print(type(batch)) #미니배치 자료형 확인. 토치텍스늬 데이터로더는  'torchtext.data.batch.Batch'라는 객체를 가져온다. 

<class 'torchtext.data.batch.Batch'>


In [50]:
batch = next(iter(train_loader)) # 16개씩 묶어줬음. 첫번째 미니배치에 저장

print(batch.text) #첫번째 미니 배치의 text 필드를 호출해서 확인해봄

tensor([[  10,  646,  598,  301,  900, 1045,   88, 1008,  799,   79,  962,  587,
          131,  365, 1551,  366,  120,    5,   93,  999],
        [  72,  321,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [ 263,  337,  139,   82,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [ 274, 1124,  693,  740,  965,  973,  632, 1226, 1073,  279,  947, 1383,
         1510,  382,  238,  540,    4, 1498,  644,  825],
        [ 358, 1378,  837, 1451, 1249,  679,  444, 1272,  530,  682,  597, 1068,
           78,  416,    4,   32,  706,  237,    1,    1],
        [ 298,  460,  207, 1559,   52,  217,   12, 1054,   24,   95,  158,  705,
          969,    1,    1,    1,    1,    1,    1,    1],
        [ 380,   10,  299,  883,   97,  738,  692,  227, 1182,  274,  786,  797,
         1423,  207, 1083,  435,  714,    7, 1181,  382],
        [ 264,  192,  860, 

In [51]:
batch = next(iter(train_loader)) # 첫번째 미니배치
print(batch.text[0]) # 첫번째 미니배치 중 첫번째 샘플

tensor([ 72, 321,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1])


In [None]:
# 여기가지 진행완료!!!

In [37]:
#원본코드인데.. 이미 가공된 데이터셋을 ngrams 처리해서 불러오기 때문에 입력데이터를 dataset에 맞게 수정해야 한다.

import torch
import torchtext
from torchtext.datasets import text_classification

NGRAMS = 2

import os

if not os.path.isdir('./data2'):
    os.mkdir('./data2')
    
    
#text_classification.DATASETS의 구조를 보고 결과데이터를 어떻게 생성하는지 분석하거나 이하 학습코드를 분석    
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./data2', ngrams=NGRAMS, vocab=None) 
#ref : https://pytorch.org/text/datasets.html#ag-news



BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

120000lines [00:09, 12860.97lines/s]
120000lines [00:17, 6766.86lines/s]
7600lines [00:01, 5807.08lines/s]


In [41]:
test_dataset[10:16]

[(3,
  tensor([    131,       5,   23258,      27,    2922,     357,    2688,     769,
              814,      14,      32,      15,      16,       6,     131,       7,
              230,     293,     452,     836,    6438,      85,       2,      51,
            43647,       2,   24372,       4,   60885,      51,  281059,       2,
                0,       9,   21969,     115,       2,      51,  108539,       2,
            36279,       4,      11,      81,      31,      90,      39,   23258,
                6,      27,     357,    4090,    1698,      53,       5,     273,
              821,       3,    1507,       7,       3,    1473,    3049,       2,
             9821,   53919,  115850,   75043,   30252,  478273,  244818,     822,
             4291,      43,      44,      46,     296,    2486,    2022,    8893,
            23909,   49141,  381768,    9169,   19041,      89,     122,       0,
            43648,   24031,   69185,   94356, 1174101,       0,  450795,       0,
           

In [40]:
train_dataset[0:10]

[(2,
  tensor([    572,     564,       2,    2326,   49106,     150,      88,       3,
             1143,      14,      32,      15,      32,      16,  443749,       4,
              572,     499,      17,      10,  741769,       7,  468770,       4,
               52,    7019,    1050,     442,       2,   14341,     673,  141447,
           326092,   55044,    7887,     411,    9870,  628642,      43,      44,
              144,     145,  299709,  443750,   51274,     703,   14312,      23,
          1111134,  741770,  411508,  468771,    3779,   86384,  135944,  371666,
             4052])),
 (2,
  tensor([  55003,    1474,    1150,    1832,    7559,      14,      32,      15,
               32,      16,    1262,    1072,     436,   55003,     131,       4,
           142576,      33,       6,    8062,      12,     756,  475640,       9,
           991346,    3186,       8,       3,     698,     329,       4,      33,
             6764, 1040465,   13979,      11,     278,     483,   

In [42]:
data_verif = []
data_verif = test_dataset
data_verif_len = len(data_verif)
data_verif_len

7600

In [65]:
data_verif_train = []
data_verif_train = train_dataset
data_verif_tr_len = len(data_verif_train)
data_verif_len

7600

In [43]:
os.path.exists('./data')
os.listdir('./data')

['.DS_Store', 'showingTelling.xlsx', 'showingTelling_csv.csv']

In [44]:
os.getcwd()

'/Users/kimkwangil/Project/01EssayFitAI/showing_telling'

In [45]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [46]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [47]:
VOCAB_SIZE

1308844

In [48]:
NUN_CLASS

4

In [4]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [7]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [8]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 25 seconds
	Loss: 0.0260(train)	|	Acc: 84.8%(train)
	Loss: 0.0001(valid)	|	Acc: 90.5%(valid)
Epoch: 2  | time in 0 minutes, 24 seconds
	Loss: 0.0118(train)	|	Acc: 93.7%(train)
	Loss: 0.0000(valid)	|	Acc: 89.3%(valid)
Epoch: 3  | time in 0 minutes, 24 seconds
	Loss: 0.0068(train)	|	Acc: 96.4%(train)
	Loss: 0.0001(valid)	|	Acc: 90.6%(valid)
Epoch: 4  | time in 0 minutes, 25 seconds
	Loss: 0.0038(train)	|	Acc: 98.2%(train)
	Loss: 0.0000(valid)	|	Acc: 90.8%(valid)
Epoch: 5  | time in 0 minutes, 24 seconds
	Loss: 0.0022(train)	|	Acc: 99.1%(train)
	Loss: 0.0000(valid)	|	Acc: 91.3%(valid)


In [9]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0002(test)	|	Acc: 89.3%(test)


In [12]:

import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

vocab = train_dataset.get_vocab()
model = model.to("cpu")

In [13]:
print("This is a %s news" %ag_news_label[predict(ex_text_str, model, vocab, 2)])

This is a Sports news
