In [6]:
import pandas as pd

df_train = pd.read_csv('../dataset/ag_news/train.csv', index_col=None)
df_test = pd.read_csv('../dataset/ag_news/test.csv', index_col=None)

In [13]:
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


## Data Preprocess

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import logging
import pandas as pd
from torchtext.data import Iterator, BucketIterator, TabularDataset
from torchtext import data
from torchtext.vocab import Vectors, GloVe

In [4]:
import spacy
spacy_en = spacy.load('en_core_web_sm')
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
#获取数据并构建数据迭代器
def get_data_iter(train_csv, test_csv, fix_length):

    #定义text的对象，定长，小写，tokenizer
    TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer, fix_length=fix_length, batch_first=True)
    #定义LABEL对象
    LABEL = data.Field(sequential=False, use_vocab=False)

    train_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    train = TabularDataset(path=train_csv, format="csv", fields=train_fields, skip_header=True)
    train_iter = BucketIterator(train, batch_size=32, device=-1, sort_key=lambda x: len(x.text),
                                sort_within_batch=False, repeat=False)

    test_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    test = TabularDataset(path=test_csv, format="csv", fields=test_fields, skip_header=True)
    test_iter = Iterator(test, batch_size=32, device=-1, sort=False, sort_within_batch=False, repeat=False)

    # vectors = Vectors(name=word2vec_dir)
    #构建词表
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))

    vocab = TEXT.vocab

    return train_iter, test_iter, vocab

In [6]:
train_iter, test_iter, vocab = get_data_iter('../dataset/ag_news/train.csv', '../dataset/ag_news/test.csv', 128)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [29]:
for batch in train_iter:
    print(batch.text.shape)

)
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Si

In [8]:
#fasttext 模型
class FastText(nn.Module):
    def __init__(self, vocab, vec_dim, label_size, hidden_size):
        super(FastText, self).__init__()
        #创建embedding
        self.embed = nn.Embedding(len(vocab), vec_dim)
        # 若使用预训练的词向量，需在此处指定预训练的权重
        self.embed.weight.data.copy_(vocab.vectors)
        self.embed.weight.requires_grad = True

        self.fc = nn.Sequential(
            nn.Linear(vec_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, label_size)
        )

    def forward(self, x):
        x = self.embed(x)
        # print(x.shape)
        out = self.fc(torch.mean(x, dim=1))
        # print(out.shape)
        return out

In [56]:
def train_model(net, train_iter, epoch, lr):
    print("begin training")
    net.train()  # 必备，将模型设置为训练模式
    optimizer = optim.Adam(net.parameters(), lr=lr)

    criterion = nn.CrossEntropyLoss()

    for i in range(epoch):  # 多批次循环
        for batch_idx, batch in enumerate(train_iter):
            # 注意target=batch.label - 1，因为数据集中的label是1，2，3，4，但是pytorch的label默认是从0开始，所以这里需要减1
            data, target = batch.text, batch.label - 1
            
            optimizer.zero_grad()  # 清除所有优化的梯度
            
            output = net(data)  # 传入数据并前向传播获取输出
            
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            batch_size = 32
            # 打印状态信息
            logging.info(
                "train epoch=" + str(i) + ",batch_id=" + str(batch_idx) + ",loss=" + str(loss.item() / batch_size))

    print('Finished Training')

In [64]:
def model_test(net, test_iter):
    net.eval()  # 必备，将模型设置为训练模式
    correct = 0
    total = 0

    with torch.no_grad():

        for i, batch in enumerate(test_iter):
            # 注意target=batch.label - 1，因为数据集中的label是1，2，3，4，但是pytorch的label默认是从0开始，所以这里需要减1
            data, label = batch.text, batch.label - 1
            logging.info("test batch_id=" + str(i))
            outputs = net(data)
            # torch.max()[0]表示最大值的值，troch.max()[1]表示回最大值的每个索引
            # print(outputs)
            _, predicted = torch.max(outputs.data, 1)  # 每个output是一行n列的数据，取一行中最大的值
            # print(predicted)
            total += label.size(0)
            correct += (predicted == label).sum().item()
    print('Accuracy of the network on test set: %d %%' % (100 * correct / total))
            # test_acc += accuracy_score(torch.argmax(outputs.data, dim=1), label)
            # logging.info("test_acc=" + str(test_acc))

In [57]:
#train the model
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)

#save model
# net_dir = "ag_fasttext_model.pkl"
emb_dim = 300
hidden_size = 200
label_size = 4

epoch = 10
lr = 0.001
#batchsize=32

net = FastText(vocab=vocab, vec_dim=emb_dim, label_size=label_size, hidden_size=hidden_size)

logging.info("开始训练模型")
train_model(net, train_iter, epoch, lr)
# torch.save(net, net_dir)

,833:INFO: train epoch=9,batch_id=3515,loss=2.4113987819873728e-05
2021-05-27 01:54:04,906:INFO: train epoch=9,batch_id=3516,loss=0.0008529243059456348
2021-05-27 01:54:04,979:INFO: train epoch=9,batch_id=3517,loss=0.00017464750271756202
2021-05-27 01:54:05,049:INFO: train epoch=9,batch_id=3518,loss=0.00026773958234116435
2021-05-27 01:54:05,123:INFO: train epoch=9,batch_id=3519,loss=0.00011159404675709084
2021-05-27 01:54:05,197:INFO: train epoch=9,batch_id=3520,loss=0.0001217150638694875
2021-05-27 01:54:05,267:INFO: train epoch=9,batch_id=3521,loss=0.00012172410060884431
2021-05-27 01:54:05,339:INFO: train epoch=9,batch_id=3522,loss=0.001380242989398539
2021-05-27 01:54:05,412:INFO: train epoch=9,batch_id=3523,loss=4.545645788311958e-05
2021-05-27 01:54:05,483:INFO: train epoch=9,batch_id=3524,loss=0.003364759497344494
2021-05-27 01:54:05,556:INFO: train epoch=9,batch_id=3525,loss=0.00028856907738372684
2021-05-27 01:54:05,628:INFO: train epoch=9,batch_id=3526,loss=0.000329203699948

FileNotFoundError: [Errno 2] No such file or directory: 'model/ag_fasttext_model.pkl'

In [58]:
net_dir = "ag_fasttext_model.pkl"
torch.save(net, net_dir)

In [65]:
model_test(net, test_iter)

2021-05-27 02:23:54,839:INFO: test batch_id=0
2021-05-27 02:23:54,843:INFO: test batch_id=1
2021-05-27 02:23:54,847:INFO: test batch_id=2
2021-05-27 02:23:54,851:INFO: test batch_id=3
2021-05-27 02:23:54,854:INFO: test batch_id=4
2021-05-27 02:23:54,857:INFO: test batch_id=5
2021-05-27 02:23:54,860:INFO: test batch_id=6
2021-05-27 02:23:54,863:INFO: test batch_id=7
2021-05-27 02:23:54,866:INFO: test batch_id=8
2021-05-27 02:23:54,869:INFO: test batch_id=9
2021-05-27 02:23:54,871:INFO: test batch_id=10
2021-05-27 02:23:54,874:INFO: test batch_id=11
2021-05-27 02:23:54,878:INFO: test batch_id=12
2021-05-27 02:23:54,881:INFO: test batch_id=13
2021-05-27 02:23:54,884:INFO: test batch_id=14
2021-05-27 02:23:54,887:INFO: test batch_id=15
2021-05-27 02:23:54,889:INFO: test batch_id=16
2021-05-27 02:23:54,892:INFO: test batch_id=17
2021-05-27 02:23:54,895:INFO: test batch_id=18
2021-05-27 02:23:54,899:INFO: test batch_id=19
2021-05-27 02:23:54,902:INFO: test batch_id=20
2021-05-27 02:23:54,905

In [14]:
## 使用模型预测
import torch
#在入模型
emb_dim = 300
hidden_size = 200
label_size = 4

model = torch.load("ag_fasttext_model.pkl")
# model.eval()


import spacy
spacy_en = spacy.load('en_core_web_sm')
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]


In [15]:
#赋予标签
ag_news_label = {1: "World",
                 2: "Sports",
                 3: "Business",
                 4: "Sci/Tec"}
#要预测的句子
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

In [30]:
def predict(text):
    text = tokenizer(text)
    text = torch.tensor([vocab[t] for t in text[:128]])
    outputs = model(text)
    predicted = torch.max(outputs.data, 1)
    return predicted

In [31]:
predict(ex_text_str)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x128 and 300x200)