In [6]:
import pandas as pd

df_train = pd.read_csv('../dataset/ag_news/train.csv', index_col=None)
df_test = pd.read_csv('../dataset/ag_news/test.csv', index_col=None)

In [13]:
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


## Data Preprocess

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import logging
import pandas as pd
from torchtext.data import Iterator, BucketIterator, TabularDataset
from torchtext import data
from torchtext.vocab import Vectors, GloVe

In [2]:
import spacy
spacy_en = spacy.load('en_core_web_sm')
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [3]:
#获取数据并构建数据迭代器
def get_data_iter(train_csv, test_csv, fix_length):

    #定义text的对象，定长，小写，tokenizer
    TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer, fix_length=fix_length, batch_first=True)
    #定义LABEL对象
    LABEL = data.Field(sequential=False, use_vocab=False)

    train_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    train = TabularDataset(path=train_csv, format="csv", fields=train_fields, skip_header=True)
    train_iter = BucketIterator(train, batch_size=32, device=-1, sort_key=lambda x: len(x.text),
                                sort_within_batch=False, repeat=False)

    test_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    test = TabularDataset(path=test_csv, format="csv", fields=test_fields, skip_header=True)
    test_iter = Iterator(test, batch_size=32, device=-1, sort=False, sort_within_batch=False, repeat=False)

    # vectors = Vectors(name=word2vec_dir)
    #构建词表
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))

    vocab = TEXT.vocab

    return train_iter, test_iter, vocab

In [4]:
train_iter, test_iter, vocab = get_data_iter('../dataset/ag_news/train.csv', '../dataset/ag_news/test.csv', 128)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [29]:
# for batch in train_iter:
    # print(batch.text.shape)

)
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Si

In [6]:
#fasttext 模型
class FastText(nn.Module):
    def __init__(self, vocab, vec_dim, label_size, hidden_size):
        super(FastText, self).__init__()
        #创建embedding
        self.embed = nn.Embedding(len(vocab), vec_dim)
        # 若使用预训练的词向量，需在此处指定预训练的权重
        self.embed.weight.data.copy_(vocab.vectors)
        self.embed.weight.requires_grad = True

        self.fc = nn.Sequential(
            nn.Linear(vec_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, label_size)
        )

    def forward(self, x):
        # print('1', x.shape)
        x = self.embed(x)
        # print('2', x.shape)
        # print(x.shape)
        # print('3', torch.mean(x, dim=1).shape)
        out = self.fc(torch.mean(x, dim=1))
        # x = torch.flatten(x, start_dim=1)
        # out = self.fc(x)
        # print('4', out.shape)
        # print(out.shape)
        return out

In [7]:
emb_dim = 300
hidden_size = 200
label_size = 4

epoch = 10
lr = 0.001
#batchsize=32

net = FastText(vocab=vocab, vec_dim=emb_dim, label_size=label_size, hidden_size=hidden_size)
print(net)

FastText(
  (embed): Embedding(81641, 300)
  (fc): Sequential(
    (0): Linear(in_features=300, out_features=200, bias=True)
    (1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=200, out_features=4, bias=True)
  )
)


In [8]:
def train_model(net, train_iter, epoch, lr):
    print("begin training")
    net.train()  # 必备，将模型设置为训练模式
    optimizer = optim.Adam(net.parameters(), lr=lr)

    criterion = nn.CrossEntropyLoss()

    for i in range(epoch):  # 多批次循环
        for batch_idx, batch in enumerate(train_iter):
            # 注意target=batch.label - 1，因为数据集中的label是1，2，3，4，但是pytorch的label默认是从0开始，所以这里需要减1
            data, target = batch.text, batch.label - 1
            # print('*',data.shape)

            optimizer.zero_grad()  # 清除所有优化的梯度
            
            output = net(data)  # 传入数据并前向传播获取输出
            # print('**', output.shape)
            
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            batch_size = 32
            # 打印状态信息
            logging.info(
                "train epoch=" + str(i) + ",batch_id=" + str(batch_idx) + ",loss=" + str(loss.item() / batch_size))

    print('Finished Training')

In [9]:
def model_test(net, test_iter):
    net.eval()  # 必备，将模型设置为训练模式
    correct = 0
    total = 0

    with torch.no_grad():

        for i, batch in enumerate(test_iter):
            # 注意target=batch.label - 1，因为数据集中的label是1，2，3，4，但是pytorch的label默认是从0开始，所以这里需要减1
            data, label = batch.text, batch.label - 1
            logging.info("test batch_id=" + str(i))
            outputs = net(data)
            # torch.max()[0]表示最大值的值，troch.max()[1]表示回最大值的每个索引
            # print(outputs)
            _, predicted = torch.max(outputs.data, 1)  # 每个output是一行n列的数据，取一行中最大的值
            # print(predicted)
            total += label.size(0)
            correct += (predicted == label).sum().item()
    print('Accuracy of the network on test set: %d %%' % (100 * correct / total))
            # test_acc += accuracy_score(torch.argmax(outputs.data, dim=1), label)
            # logging.info("test_acc=" + str(test_acc))

In [10]:
#train the model
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)

#save model
# net_dir = "ag_fasttext_model.pkl"
emb_dim = 300
hidden_size = 200
label_size = 4

epoch = 10
lr = 0.001
#batchsize=32

net = FastText(vocab=vocab, vec_dim=emb_dim, label_size=label_size, hidden_size=hidden_size)

logging.info("开始训练模型")
train_model(net, train_iter, epoch, lr)
# torch.save(net, net_dir)

2021-06-06 00:29:43,544:INFO: 开始训练模型
begin training
2021-06-06 00:29:43,807:INFO: train epoch=0,batch_id=0,loss=0.045218661427497864
2021-06-06 00:29:43,880:INFO: train epoch=0,batch_id=1,loss=0.04231252148747444
2021-06-06 00:29:43,952:INFO: train epoch=0,batch_id=2,loss=0.03718787804245949
2021-06-06 00:29:44,027:INFO: train epoch=0,batch_id=3,loss=0.03754382207989693
2021-06-06 00:29:44,103:INFO: train epoch=0,batch_id=4,loss=0.02481074631214142
2021-06-06 00:29:44,179:INFO: train epoch=0,batch_id=5,loss=0.02969186194241047
2021-06-06 00:29:44,252:INFO: train epoch=0,batch_id=6,loss=0.023758629336953163
2021-06-06 00:29:44,328:INFO: train epoch=0,batch_id=7,loss=0.026689499616622925
2021-06-06 00:29:44,405:INFO: train epoch=0,batch_id=8,loss=0.022018836811184883
2021-06-06 00:29:44,479:INFO: train epoch=0,batch_id=9,loss=0.01987730897963047
2021-06-06 00:29:44,562:INFO: train epoch=0,batch_id=10,loss=0.02105369046330452
2021-06-06 00:29:44,642:INFO: train epoch=0,batch_id=11,loss=0.

KeyboardInterrupt: 

In [58]:
net_dir = "ag_fasttext_model.pkl"
torch.save(net, net_dir)

In [65]:
model_test(net, test_iter)

2021-05-27 02:23:54,839:INFO: test batch_id=0
2021-05-27 02:23:54,843:INFO: test batch_id=1
2021-05-27 02:23:54,847:INFO: test batch_id=2
2021-05-27 02:23:54,851:INFO: test batch_id=3
2021-05-27 02:23:54,854:INFO: test batch_id=4
2021-05-27 02:23:54,857:INFO: test batch_id=5
2021-05-27 02:23:54,860:INFO: test batch_id=6
2021-05-27 02:23:54,863:INFO: test batch_id=7
2021-05-27 02:23:54,866:INFO: test batch_id=8
2021-05-27 02:23:54,869:INFO: test batch_id=9
2021-05-27 02:23:54,871:INFO: test batch_id=10
2021-05-27 02:23:54,874:INFO: test batch_id=11
2021-05-27 02:23:54,878:INFO: test batch_id=12
2021-05-27 02:23:54,881:INFO: test batch_id=13
2021-05-27 02:23:54,884:INFO: test batch_id=14
2021-05-27 02:23:54,887:INFO: test batch_id=15
2021-05-27 02:23:54,889:INFO: test batch_id=16
2021-05-27 02:23:54,892:INFO: test batch_id=17
2021-05-27 02:23:54,895:INFO: test batch_id=18
2021-05-27 02:23:54,899:INFO: test batch_id=19
2021-05-27 02:23:54,902:INFO: test batch_id=20
2021-05-27 02:23:54,905

In [17]:
## 使用模型预测
import torch
#在入模型
emb_dim = 300
hidden_size = 200
label_size = 4

# model = torch.load("ag_fasttext_model.pkl")
# model.eval()


import spacy
spacy_en = spacy.load('en_core_web_sm')
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]


In [18]:
#赋予标签
ag_news_label = {1: "World",
                 2: "Sports",
                 3: "Business",
                 4: "Sci/Tec"}
#要预测的句子
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

In [30]:
def predict(text):
    model = torch.load("ag_fasttext_model.pkl")
    model.eval()
    text = tokenizer(text)
    text = torch.tensor([vocab[t] for t in text[:128]])
    text = torch.unsqueeze(text, 0)
    # print(text.size())
    with torch.no_grad():
        outputs = model(text)
        # print(outputs.shape)
        print(outputs)
        predicted = torch.max(outputs.data, 1)
    return predicted[1]

In [35]:
s = predict(ex_text_str)
ag_news_label[int(s)+1]

tensor([[-70.8558,  -1.5746, -26.5518,  34.4366]])


'Sci/Tec'