# 达观数据竞赛（无预训练的词向量）

In [1]:
from fastai.text import Path
import os
import warnings
from collections import defaultdict
import numpy as np
import re
import math
import pickle
warnings.filterwarnings("always")

# Directory holding the raw competition CSV files (train_set.csv and test_set.csv are read from here).
PATH = Path("/home/liucc/data/new_data")

# Directory for the artifacts this notebook caches (vocabulary, encoded ids, batch iterator, models).
DATA_PATH = Path('/home/liucc/Workspace/NLP/da_guan_race/data_lm')
# exist_ok=True makes re-running the cell harmless; no parents=True, so the parent directory must already exist.
DATA_PATH.mkdir(exist_ok=True)



## 处理数据

In [169]:
import numpy as np
import re
import math

def get_tokens(texts):
    """Tokenize every text by splitting on runs of whitespace.

    Each token list starts with the marker '_NEW_'. Leading or trailing
    whitespace in a text produces an empty-string token, exactly as
    ``re.split`` returns it.

    texts: iterable of strings.
    Returns a list with one token list per text.
    """
    splitter = re.compile(r'\s+')
    return [['_NEW_', *splitter.split(text)] for text in texts]

In [3]:
import pandas as pd
# Load the training set and drop the columns this notebook does not use.
df_train = pd.read_csv(PATH/'train_set.csv')
df_train.drop(columns = ['word_seg','id'], inplace = True)

texts = df_train['article'].values
# Shift the class ids down by one (presumably the raw classes start at 1, giving 0-based labels — TODO confirm).
labels = (df_train['class']-1).values

# One token list per article, each starting with the '_NEW_' marker added by get_tokens.
toks = get_tokens(texts)
print(len(toks))
print(toks[0][:100])

## 建立词汇表

In [25]:
from collections import Counter
freq = Counter(w for t in toks for w in t)

In [26]:
print(len(freq))

13517


In [27]:
# Keep only tokens seen more than twice, ordered from most to least frequent.
itos = [o for o,c in freq.most_common() if c>2]
# Both special tokens are inserted at position 0, so the final layout is
# index 0 -> '_pad_', index 1 -> '_unk_', followed by the real tokens.
itos.insert(0,'_unk_') # unknown token
itos.insert(0,'_pad_') 

In [28]:
len(itos)

10158

In [30]:
from collections import defaultdict
# Token -> id lookup built from the vocabulary order.
# NOTE(review): tokens missing from the vocabulary fall back to id 0, which is
# '_pad_' (not '_unk_', id 1). Kept unchanged because the later cells rebuild
# stoi with the same default and the cached ids depend on it.
stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

# Documents have different lengths, so the result is a 1-D array of Python lists.
# dtype=object must be explicit: recent NumPy releases raise a ValueError when
# asked to build an array from ragged nested sequences without it.
ids = np.array([[stoi[c] for c in t] for t in toks], dtype=object)
print(ids.shape)
print(ids[0][:100])

(102277,)
[250, 3, 377, 34, 27, 2, 245, 171, 4, 211, 452, 387, 27, 387, 28, 2, 47, 60, 221, 396, 6, 121, 42, 9, 3726, 4059, 5, 21, 249, 171, 305, 159, 672, 203, 7, 53, 17, 20, 6, 121, 2, 333, 284, 3163, 455, 245, 171, 211, 452, 4, 562, 510, 5, 81, 83, 2, 820, 315, 2589, 1096, 146, 100, 224, 507, 69, 241, 79, 507, 183, 302, 12, 6, 136, 237, 187, 325, 660, 2311, 1547, 588, 2, 447, 484, 1058, 253, 122, 688, 420, 758, 273, 4, 4354, 55, 520, 1868, 47, 532, 43, 40, 2]


In [101]:
import pickle
# Persist the vocabulary (itos / stoi), the encoded documents and the labels.
with open(DATA_PATH/'itos.pkl', 'wb') as f:
    pickle.dump(itos, f)
    
with open(DATA_PATH/'stoi.pkl', 'wb') as f:
    # Copy into a plain dict first: the defaultdict carries a lambda default factory, which pickle cannot serialize.
    stoi_ = {k:v for k,v in stoi.items()}
    pickle.dump(stoi_, f)
    
np.save(DATA_PATH/'ids.npy',ids)
np.save(DATA_PATH/'labels.npy',labels)

# Store the set of distinct label values; its size is used later as the number of classes.
with open(DATA_PATH/'classes.pkl', 'wb') as f:
    pickle.dump(set(labels), f)

In [34]:
% ls -l --block-size=M data_lm

总用量 278M
-rw-r--r-- 1 liucc root 277M 9月   1 09:56 ids.npy
-rw-r--r-- 1 liucc root   1M 9月   1 09:55 itos.pkl
-rw-r--r-- 1 liucc root   1M 9月   1 09:56 labels.npy
-rw-r--r-- 1 liucc root   1M 9月   1 09:55 stoi.pkl


In [4]:
# Reload the vocabulary and the encoded corpus written by the save cell.
with open(DATA_PATH/'itos.pkl', 'rb') as f:
    itos = pickle.load(f)

with open(DATA_PATH/'stoi.pkl', 'rb') as f:
    stoi_ = pickle.load(f)
    # Re-wrap in a defaultdict so unseen tokens keep mapping to id 0.
    stoi = defaultdict(lambda:0, stoi_)

# ids.npy stores a ragged object array; NumPy >= 1.16.3 only loads such files
# when pickling is explicitly allowed. The file is produced by this notebook,
# never load an untrusted .npy this way.
ids = np.load(DATA_PATH/'ids.npy', allow_pickle=True)
labels = np.load(DATA_PATH/'labels.npy')

In [5]:
# Pair every token with the label of its document
def get_paired_c2l(ids,labels):
    """Flatten the documents into one token stream with a parallel label stream.

    ids: sequence of documents, each a sequence of token ids.
    labels: one label per document.
    Returns (tokens, token_labels) as two NumPy arrays of equal length, where
    every token carries the label of the document it came from.
    Raises RuntimeError when ids and labels differ in length.
    """
    if len(ids) != len(labels):
        raise RuntimeError('维度不匹配')

    flat_ids, flat_labels = [], []
    for doc, doc_label in zip(ids, labels):
        flat_ids.extend(doc)
        # Repeat the document label once per token of that document.
        flat_labels.extend([doc_label] * len(doc))

    return np.array(flat_ids), np.array(flat_labels)
    

In [6]:
cids,clabels = get_paired_c2l(ids,labels)
print(len(cids))
print(cids[:20])
print(len(clabels))
print(clabels[:20])

120492550
[250   3 377  34  27   2 245 171   4 211 452 387  27 387  28   2  47  60 221 396]
120492550
[13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]


In [7]:
np.save(DATA_PATH/'cids.npy',cids)
np.save(DATA_PATH/'clabels.npy',clabels)


## 建立数据模型

In [37]:
class CharModel(object):
    """Mini-batch iterator over one long token stream with a label per token.

    The stream is cut into ``bs`` consecutive pieces of equal length (the tail
    that does not fill a piece is dropped) and the pieces become the columns
    of ``self.x`` / ``self.y``. Iterating yields ``bptt`` consecutive rows at a
    time, so column k of successive batches continues piece k.
    """
    # x: token ids, one entry per token
    # y: labels, one entry per token
    def __init__(self,x,y,bs,bptt):
        super().__init__()
        self.bs = bs
        self.bptt = bptt
        self.useGPU = False
        self.batch_num = -1  # <= 0 means "no limit on the number of batches"

        # Length of each of the bs pieces.
        bl = np.size(x,0)//bs

        self.x = self._to_columns(x,bs,bl) # length * batch_size
        self.y = self._to_columns(y,bs,bl) # length * batch_size

    @staticmethod
    def _to_columns(seq,bs,bl):
        """Return the first bs*bl items of seq as a (bl, bs) LongTensor, piece k in column k."""
        # A single vectorized reshape replaces the element-by-element Python
        # loop; np.array copies, so the tensor never aliases the caller's data.
        pieces = np.array(seq[:bl*bs], dtype=np.int64).reshape(bs,bl)
        return torch.from_numpy(pieces).transpose(0,1)

    def set_bptt(self,bptt):
        """Change the number of rows returned per batch."""
        self.bptt = bptt

    def cuda(self):
        """Yield batches on the GPU from now on (data itself stays on the CPU)."""
        self.useGPU = True

    def cpu(self):
        """Yield batches on the CPU from now on."""
        self.useGPU = False

    def mini_batch(self,num):
        """Limit iteration to the first num batches; num <= 0 removes the limit."""
        self.batch_num = num

    def __len__(self):
        # Number of batches __iter__ actually yields: a limit larger than the
        # data must not inflate the reported length.
        available = self.x.size(0)//self.bptt
        if self.batch_num>0:
            return min(self.batch_num,available)
        return available

    # Each step yields bptt rows of x and of y
    def __iter__(self):
        for k in range(len(self)):
            start = k*self.bptt
            xb = self.x[start:start+self.bptt,:]
            yb = self.y[start:start+self.bptt,:]
            if self.useGPU:
                yield xb.cuda(),yb.cuda()
            else:
                yield xb,yb
                    

In [38]:
import numpy as np
import torch

# Reload the flattened token stream and its per-token labels.
cids = np.load(DATA_PATH/'cids.npy')
clabels = np.load(DATA_PATH/'clabels.npy')

# bs: number of parallel columns per batch; bptt: number of rows yielded per batch.
bs,bptt = 64,10

dm = CharModel(cids,clabels,bs,bptt)
print(dm.x.shape)
print(dm.y.shape)

torch.Size([1882696, 64])
torch.Size([1882696, 64])


In [39]:
import pickle
# Pickle the CharModel batch iterator (not the vocabulary) so a later cell can reload it as dm.pkl.
with open(DATA_PATH/'dm.pkl', 'wb') as f:
    pickle.dump(dm, f)


## 构建LSTM网络进行训练

In [2]:
from fastai.text import Path
import pickle
import collections
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import nn
import pickle

In [41]:
with open(DATA_PATH/'dm.pkl', 'rb') as f:
    dm = pickle.load(f)

with open(DATA_PATH/'classes.pkl', 'rb') as f:
    classes = pickle.load(f)
    
with open(DATA_PATH/'itos.pkl', 'rb') as f:
    itos = pickle.load(f)

In [3]:
# Pre-trained language model
class LModelLSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> linear layer producing class log-probabilities per position.

    The recurrent state is kept in ``self.h`` between calls and detached from
    the graph after every forward pass; it is reset whenever the batch size of
    the input differs from the one the state was built for.
    """
    def __init__(self, vocab_size, class_size, em_sz, bs, nl, n_hidden):
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.class_size = class_size
        self.bs = bs
        self.n_hidden = n_hidden

        self.e = nn.Embedding(vocab_size, em_sz)
        self.rnn = nn.LSTM(em_sz, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, class_size)
        self.init_hidden()

    def forward(self, cs):
        """cs: (seq_len, batch) token ids. Returns (seq_len, batch, class_size) log-probabilities."""
        bs = cs.size(1)
        if self.bs != bs:
            # The stored state has the wrong width for this input: start from zeros.
            self.bs = bs
            self.init_hidden()

        outp, new_state = self.rnn(self.e(cs), self.h)
        self.h = self.repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(outp), dim=-1)
        return log_probs.view(-1, bs, self.class_size)

    def init_hidden(self):
        """Reset the (h, c) state to zeros on the same device as the parameters."""
        on_gpu = next(self.parameters()).is_cuda
        state = []
        for _ in range(2):
            zeros = Variable(torch.zeros(self.nl, self.bs, self.n_hidden))
            state.append(zeros.cuda() if on_gpu else zeros)
        self.h = tuple(state)

    def repackage_var(self,h):
        """Wraps h in new Variables, to detach them from their history."""
        if type(h) == tuple:
            return tuple(self.repackage_var(v) for v in h)
        detached = Variable(h.data)
        if next(self.parameters()).is_cuda:
            return detached.cuda()
        return detached
        

In [43]:
em_sz,n_hidden,nl = 200,512,2
vocab_size = len(itos)
class_size = len(classes)

print(vocab_size)
print(class_size)

10158
19


In [22]:
m = LModelLSTM(vocab_size,class_size, em_sz, bs, nl, n_hidden)
opt_fn = torch.optim.Adam(m.parameters(),lr=1e-3)


In [44]:
import time
def train(model,data_train,data_val,opt,loss_f,epoch,useGPU=True):
    """Train model on data_train for `epoch` epochs, printing one summary row per epoch.

    model: module exposing init_hidden() whose forward returns (bptt, batch, classes) scores.
    data_train: iterable of (x, y) batches that supports len() and .cuda().
    data_val: accepted for signature compatibility; not used by this function.
    opt: optimizer over the model parameters.
    loss_f: loss applied to the flattened (N, classes) scores and (N,) targets.
    useGPU: move model and batches to the GPU when CUDA is available.
    Returns None; loss and accuracy are only printed.
    """
    if useGPU and torch.cuda.is_available():
        model = model.cuda()
        data_train.cuda()

    # The header belongs to the table printed below, so it is shown on CPU
    # runs as well (it used to be printed only inside the GPU branch).
    print(f'epoch  train_loss  train_acc')

    for i in range(epoch):

        loss_train = []
        model.train()

        true_ls = []
        pred_ls = []
        # Start every epoch from a zero recurrent state.
        model.init_hidden()

        stime = time.time()
        bind = 0
        tind = len(data_train)
        for x,y in data_train:

            model.zero_grad()

            pred_y = model(Variable(x)) #bptt*batch*classes

            # Flatten time and batch so every position is one classification sample.
            pred_y = pred_y.view(-1,pred_y.size(-1))
            true_y = y.view(-1)

            _,pred_l = pred_y.max(1)
            pred_ls.append(pred_l.data.cpu().numpy())
            true_ls.append(true_y.cpu().numpy())

            loss = loss_f(pred_y,Variable(true_y))
            loss.backward()
            opt.step()

            closs = loss.data.cpu().numpy()
            loss_train.append(closs)

            # Elapsed minutes and a linear estimate of the minutes remaining.
            ctime = time.time()
            ptime = (ctime-stime)/60
            bind+=1
            rtime = ptime*(tind-bind)/bind
            print(f'\rProgress: {ptime:.2f}<{rtime:.2f}  loss:{float(closs):.4f}',end="")

        loss_train = np.mean(np.array(loss_train))

        true_ls = np.concatenate(true_ls,axis=0)
        pred_ls = np.concatenate(pred_ls,axis=0)
        acc_train = (true_ls==pred_ls).sum()/np.size(true_ls,0)

        print(f'\r{i:5}  {loss_train:10.4f}  {acc_train:9.4f}')
    


In [26]:
import time

dm.mini_batch(-1)
opt_fn = torch.optim.Adam(m.parameters(),lr=0.0001)
train(m,dm,None,opt_fn,nn.NLLLoss(),1, useGPU=True)



epoch  train_loss  train_acc
    0      0.5893     0.8238.359199970960617076


In [27]:
torch.save(m,DATA_PATH/'char_model_pbtt10')

  "type " + obj.__name__ + ". It won't be checked "


In [8]:
m = torch.load(DATA_PATH/'char_model_pbtt10')

In [28]:
dm.mini_batch(-1)
dm.set_bptt(20)
opt_fn = torch.optim.Adam(m.parameters(),lr=0.0001)
train(m,dm,None,opt_fn,nn.NLLLoss(),1, useGPU=True)


epoch  train_loss  train_acc
    0      0.7475     0.7788.62251842021942146


In [29]:
torch.save(m,DATA_PATH/'char_model_pbtt20')

  "type " + obj.__name__ + ". It won't be checked "


In [8]:
m = torch.load(DATA_PATH/'char_model_pbtt20')

In [30]:
dm.mini_batch(-1)
dm.set_bptt(50)
opt_fn = torch.optim.Adam(m.parameters(),lr=0.0001)
train(m,dm,None,opt_fn,nn.NLLLoss(),15, useGPU=True)


epoch  train_loss  train_acc
    0      0.8368     0.7522.80672907829284677
    1      0.7547     0.7737.69244801998138436
    2      0.6979     0.7905.63341861963272126
    3      0.6467     0.8051.56036633253097536
    4      0.5993     0.8192.57578408718109135
    5      0.5465     0.8339.40625992417335516
    6      0.4974     0.8486.37763863801956177
    7      0.4484     0.8626.31454861164093023
    8      0.3999     0.8773.27238628268241884
    9      0.3597     0.8895.34817832708358765
   10      0.3192     0.9016.283030599355697636
   11      0.2845     0.9120.220218718051910475
   12      0.2551     0.9209.399712592363357545
   13      0.2230     0.9298.166752323508262635
   14      0.2020     0.9367.145413517951965335


In [31]:
torch.save(m,DATA_PATH/'char_model_pbtt50')

  "type " + obj.__name__ + ". It won't be checked "


In [48]:
m = torch.load(DATA_PATH/'char_model_pbtt50',map_location=lambda storage,loc:storage)

## 文本分类模型

### 准备数据

In [4]:
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
import pickle

# Reload the vocabulary and the encoded corpus written earlier in the notebook.
with open(DATA_PATH/'itos.pkl', 'rb') as f:
    itos = pickle.load(f)

with open(DATA_PATH/'stoi.pkl', 'rb') as f:
    stoi_ = pickle.load(f)
    # Re-wrap in a defaultdict so unseen tokens keep mapping to id 0.
    stoi = defaultdict(lambda:0, stoi_)

# ids.npy stores a ragged object array; NumPy >= 1.16.3 only loads such files
# when pickling is explicitly allowed. The file is produced by this notebook,
# never load an untrusted .npy this way.
ids = np.load(DATA_PATH/'ids.npy', allow_pickle=True)
labels = np.load(DATA_PATH/'labels.npy')

In [104]:
TRAIN_RATIO = 0.9
VAL_RATIO = 0.1
SPLIT_SEED = 42  # fixed seed so the train/validation split is the same on every run

# Turn every document into its own NumPy array inside a 1-D object array.
# Filling a pre-allocated object array element by element works for ragged
# documents on every NumPy version (np.array on ragged input raises on recent ones).
docs = np.empty(len(ids), dtype=object)
for k,doc in enumerate(ids):
    docs[k] = np.array(doc)
ids = docs

# Shuffle documents and labels with the same permutation.
rid = np.random.RandomState(SPLIT_SEED).permutation(len(ids))
ids = ids[rid]
labels = labels[rid]

trn_num = math.floor(len(ids)*TRAIN_RATIO)
val_num = math.floor(len(ids)*VAL_RATIO)

trn_ids = ids[:trn_num]
trn_label = labels[:trn_num]
# Slice from an explicit start index: ids[-val_num:] would return the whole
# array (not an empty one) when val_num is 0.
val_ids = ids[len(ids)-val_num:]
val_label = labels[len(labels)-val_num:]

In [143]:
class BatchItem(object):
    """One mini-batch of document windows, stored as LongTensors.

    xs / ys: one 1-D array per document; they are stacked so that every
    document becomes a column (shape: window length * number of documents).
    ids: document index of every column, kept as given.
    """
    # end_ids: ids of the columns whose document has ended
    # sqz_ids: ids of the columns that disappear from the batch
    def __init__(self,xs,ys,ids,end_ids=None,sqz_ids=None,useGPU=False):
        super().__init__()

        def as_columns(arrays):
            # Stack along axis 1 (one column per document) and cast to int64.
            return torch.from_numpy(np.stack(arrays,axis=1)).long()

        def as_index(values):
            # A missing id list becomes an empty LongTensor.
            if values is None:
                return torch.LongTensor([])
            return torch.LongTensor(values)

        self.xs = as_columns(xs)
        self.ys = as_columns(ys)
        self.ids = ids
        self.end_ids = as_index(end_ids)
        self.sqz_ids = as_index(sqz_ids)

        if useGPU:
            for attr in ('xs','ys','end_ids','sqz_ids'):
                setattr(self,attr,getattr(self,attr).cuda())

class DocItem(object):
    """Cursor that walks one document in consecutive windows of bptt tokens.

    The number of windows is floor(len(x) / bptt), so a trailing part shorter
    than bptt is never returned and the padding branch of next_batch is not
    reached through the has_next() guard.
    """
    # x: token ids of the document (1-D)
    # y: label of the document (a single number)
    def __init__(self,x,y,index,bptt,pad_id):
        super().__init__()
        self.x = np.array(x)
        self.y = y
        self.index = index # position of the document in the corpus
        self.bptt = bptt
        self.pad_id = pad_id

        self.cur_i = 0 # index of the next window to return

    def reset(self):
        """Rewind to the first window."""
        self.cur_i = 0

    def __len__(self):
        """Number of complete windows in the document."""
        return math.floor(len(self.x)/self.bptt)

    def has_next(self):
        """True while at least one window has not been returned yet."""
        return self.cur_i < len(self)

    def next_batch(self):
        """Return (tokens, labels, document index) for the next window, or None when exhausted."""
        if not self.has_next():
            return None

        start = self.cur_i*self.bptt
        stop = start+self.bptt
        self.cur_i += 1

        missing = stop-len(self.x)
        if missing>0:
            # Fill the tail with pad_id (this makes the window a float array).
            x_b = np.concatenate((self.x[start:],np.ones(missing)*self.pad_id))
        else:
            x_b = self.x[start:stop]

        # Every position of the window carries the document label.
        y_b = np.ones(len(x_b))*self.y

        return x_b,y_b,self.index
                
        
    
class DocModel(object):
    """Mini-batch iterator over whole documents.

    Up to ``bs`` documents are read in parallel, one per batch column, in
    windows of ``bptt`` tokens. When a document is exhausted its column is
    refilled with the next unread document; once no documents are left the
    column is removed, so the last batches are narrower than ``bs``.
    """
    # x: list of documents, each a sequence of token ids
    # y: list with one label per document (None for unlabeled data)
    def __init__(self,x,y,bs,bptt,pad_id):
        super().__init__()
        self.pad_id = pad_id
        self.bptt = bptt
        self.bs = bs
        self.useGPU = False
        self.doc_num = -1 # <= 0 means "use every document"

        self.x = x
        self.y = y if y is not None else [-1]*len(x) # -1 marks "no label"

        self.cur_ind = 0 # number of documents handed out so far

    def get_curid(self):
        """Number of documents consumed so far in the current iteration."""
        return self.cur_ind

    def set_bptt(self,bptt):
        """Change the window length used for documents created from now on."""
        self.bptt = bptt

    def cuda(self):
        """Build batches on the GPU from now on."""
        self.useGPU = True

    def cpu(self):
        """Build batches on the CPU from now on."""
        self.useGPU = False

    def mini_num(self,num):
        """Restrict iteration to the first num documents; num <= 0 removes the limit."""
        self.doc_num = num

    def __len__(self):
        # Number of documents iteration will visit. The limit is clamped so
        # that asking for more documents than exist cannot index past the data.
        if self.doc_num>0:
            return min(self.doc_num,len(self.x))
        return len(self.x)

    def get_doc(self,i):
        """Wrap document i in a fresh DocItem cursor."""
        return DocItem(self.x[i],self.y[i],i,self.bptt,self.pad_id)

    def get_batch(self,ctner,end_ids,sqz_ids):
        """Pull the next window from every active document and pack them into a BatchItem."""
        xs,ys,ids = [],[],[]
        for c in ctner:
            if c is not None:
                bx,by,bid = c.next_batch()
                xs.append(bx)
                ys.append(by)
                ids.append(bid)

        return BatchItem(xs,ys,ids,end_ids,sqz_ids,self.useGPU)

    def _next_doc(self,lens):
        """Return the next unread document that has at least one window, or None when none is left.

        Documents without a single window are skipped: putting one into a batch
        column would make next_batch() return None and break get_batch.
        """
        while self.cur_ind<lens:
            doc = self.get_doc(self.cur_ind)
            self.cur_ind += 1
            if doc.has_next():
                return doc
        return None

    # Each step yields one BatchItem
    def __iter__(self):
        self.cur_ind = 0
        lens = len(self)

        # Initial fill: one document per column.
        ctner = [self.get_doc(i) for i in range(min(self.bs,lens))]
        self.cur_ind = len(ctner)

        while True:
            # Columns whose document has no window left.
            end_ids = [i for i,c in enumerate(ctner) if not c.has_next()]

            # Refill them with unread documents; a column stays None once the corpus is exhausted.
            for e_id in end_ids:
                ctner[e_id] = self._next_doc(lens)

            # Columns that could not be refilled are squeezed out of the batch.
            # Both id lists refer to column positions BEFORE the squeeze.
            sqz_ids = [i for i,c in enumerate(ctner) if c is None]

            if len(sqz_ids)==len(ctner):
                return

            ctner = [c for c in ctner if c is not None]
            yield self.get_batch(ctner,end_ids,sqz_ids)
        
                    

In [144]:
# Window length (bptt) and number of parallel documents per batch (bs) for the document iterators.
bptt,bs = 50,64
# stoi['_pad_'] is handed over as the padding id for both iterators.
trn_dm = DocModel(trn_ids,trn_label,bs,bptt,stoi['_pad_'])
val_dm = DocModel(val_ids,val_label,bs,bptt,stoi['_pad_'])

In [151]:
# Sanity check on the first 200 documents: rebuild every document from the
# batches DocModel yields and compare tokens and label with the source data.
trn_dm.mini_num(200)
# doc_re: doc id -> token ids collected from the batches
doc_re = {}
# doci_re: doc id -> label read from the first batch the document appears in
doci_re = {}
for item in trn_dm:
    for i,did in enumerate(item.ids):
        if did not in doc_re.keys():
            doc_re[did]=[]
            doci_re[did] = int(item.ys[:,i][0])
            
        # Ids equal to 0 are dropped on both sides of the comparison.
        doc_re[did]+= [int(c) for c in item.xs[:,i] if int(c)>0]
    
for i in doc_re:
    o_text = [int(c) for c in trn_ids[i] if int(c)>0]
    # Compare only the prefix the iterator returned; it can be shorter than the source document.
    o_text = o_text[:len(doc_re[i])]
    print(o_text==doc_re[i],doci_re[i]==trn_label[i])
    
# Remove the 200-document limit again.
trn_dm.mini_num(-1)

True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True


In [9]:
class ClasLSTM(nn.Module):
    """Document classifier initialised from the weights of a trained model `m`.

    It has the same layers as `m` (embedding, LSTM, linear output) and copies
    all of their parameters. forward() additionally receives, for every batch,
    the columns whose document ended (state reset to zero) and the columns
    that were removed from the batch (state dropped).
    """
    def __init__(self, m):
        super().__init__()
        self.vocab_size,self.nl,self.class_size = m.vocab_size,m.nl,m.class_size
        self.n_hidden = m.n_hidden
        self.bs = m.bs
        self.bs_old = m.bs  # batch size to return to in restore()

        self.e = nn.Embedding(m.e.weight.data.size(0), m.e.weight.data.size(1))
        self.rnn = nn.LSTM(m.e.weight.data.size(1), self.n_hidden, self.nl, dropout=0.5)
        self.l_out = nn.Linear(self.n_hidden, self.class_size)

        self.e.weight.data.copy_(m.e.weight.data)
        for w1,w2 in zip(self.rnn.parameters(),m.rnn.parameters()):
            w1.data.copy_(w2.data)
        # Copy every parameter of the output layer: weight AND bias. Copying
        # only the weight would leave the bias at its random initial value.
        for w1,w2 in zip(self.l_out.parameters(),m.l_out.parameters()):
            w1.data.copy_(w2.data)

        self.init_hidden()

    def restore(self):
        """Set the batch size back to the value taken from the source model."""
        self.bs = self.bs_old

    def forward(self, cs, end_ids, sqz_ids):
        """cs: (seq_len, batch) token ids; end_ids / sqz_ids: LongTensors of column ids.

        Returns (seq_len, batch, class_size) log-probabilities.
        """
        self.h = self.change_hidden(end_ids,sqz_ids)
        if self.bs!=cs.size(1):
            print(f'batch size is different self.bs:{self.bs} x.bs:{cs.size(1)}!')
            self.bs = cs.size(1)
            self.init_hidden()

        # The state returned by the LSTM is not stored (the assignment was
        # commented out in the original code), so self.h only changes through
        # init_hidden() and change_hidden().
        outp,_ = self.rnn(self.e(cs), self.h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.bs, self.class_size)

    def init_hidden(self):
        """Reset the (h, c) state to zeros on the same device as the parameters."""
        if next(self.parameters()).is_cuda:
            self.h = (Variable(torch.zeros(self.nl, self.bs, self.n_hidden)).cuda(),
                  Variable(torch.zeros(self.nl, self.bs, self.n_hidden)).cuda())
        else:
            self.h = (Variable(torch.zeros(self.nl, self.bs, self.n_hidden)),
                  Variable(torch.zeros(self.nl, self.bs, self.n_hidden)))

    def change_hidden(self,end_ids,sqz_ids):
        """Return a copy of the state with end_ids columns zeroed and sqz_ids columns removed.

        Both id tensors refer to column positions of the current state.
        self.bs is reduced by the number of removed columns.
        """
        end_ids = end_ids.cpu()
        sqz_ids = sqz_ids.cpu()

        # Work on copies so the stored state is not modified in place.
        h0 = self.h[0].data.cpu().clone()
        h1 = self.h[1].data.cpu().clone()

        if len(end_ids)>0:
            h0[:,end_ids,:] = 0
            h1[:,end_ids,:] = 0

        if len(sqz_ids)>0:
            # Convert to plain ints once: iterating a tensor yields 0-d tensors,
            # which never compare equal to an int inside a set lookup.
            dropped = set(sqz_ids.tolist())
            left_ids = [i for i in range(h0.size(1)) if i not in dropped]
            h0 = h0[:,left_ids,:].contiguous()
            h1 = h1[:,left_ids,:].contiguous()

            self.bs -= len(sqz_ids)

        if next(self.parameters()).is_cuda:
            return (Variable(h0).cuda(),Variable(h1).cuda())
        else:
            return (Variable(h0),Variable(h1))

    def repackage_var(self,h):
        """Wraps h in new Variables, to detach them from their history."""
        if next(self.parameters()).is_cuda:
            return tuple(self.repackage_var(v) for v in h) if type(h) == tuple else Variable(h.data).cuda()
        else:
            return tuple(self.repackage_var(v) for v in h) if type(h) == tuple else Variable(h.data)

In [52]:
import time
def train_cm(model,data_train,data_val,opt,loss_f,epoch,useGPU=True):
    """Train the classifier for `epoch` epochs and print one summary row per epoch.

    model, opt, loss_f: passed through to step_cm.
    data_train: training batches; data_val: optional validation batches
    (None disables validation and the validation columns of the table).
    useGPU: move model and batches to the GPU when CUDA is available.
    Returns None; losses and accuracies are only printed.
    """
    if useGPU and torch.cuda.is_available():
        model = model.cuda()
        data_train.cuda()
        if data_val is not None:
            data_val.cuda()

    # The header matches the rows printed below, so it is shown on CPU runs
    # too (it used to be printed only inside the GPU branch).
    if data_val is not None:
        print(f'epoch  train_loss  val_loss  train_acc  val_acc')
    else:
        print(f'epoch  train_loss  train_acc')

    for i in range(epoch):

        ep_result = step_cm(model,data_train,opt,loss_f,True)

        acc_train = ep_result['ep_acc']
        loss_train = np.mean(np.array(ep_result['ep_loss']))

        if data_val is not None:
            # Same pass without parameter updates.
            ep_result = step_cm(model,data_val,opt,loss_f,False)

            acc_val = ep_result['ep_acc']
            loss_val = np.mean(np.array(ep_result['ep_loss']))

            print(f'\r{i:5}  {loss_train:10.4f}  {loss_val:8.4f}  {acc_train:9.4f}  {acc_val:7.4f}')

        else:
            print(f'\r{i:5}  {loss_train:10.4f}  {acc_train:9.4f}')
            
    

def step_cm(model,data,opt=None,loss_f=None,train=True):
    """Run one pass over `data`, optionally updating the model.

    model: module with restore(), init_hidden() and forward(xs, end_ids, sqz_ids).
    data: iterable of batch items (xs, ys, end_ids, sqz_ids) supporting len() and get_curid().
    opt: optimizer; required when train is True.
    loss_f: loss function; required when train is True, optional otherwise
        (without it no loss is recorded or shown).
    train: update the parameters (True) or only evaluate (False).
    Returns {'ep_loss': list of per-batch losses,
             'ep_acc': per-batch accuracy smoothed with a 0.99/0.01 moving average}.
    Raises ValueError when train is True and opt or loss_f is missing.
    """
    if train and (opt is None or loss_f is None):
        raise ValueError('step_cm: opt and loss_f are required when train=True')

    model.restore()
    if train:
        model.train()
    else:
        model.eval()

    stime = time.time()
    tind = len(data) # total number of documents
    model.init_hidden()

    ep_loss = []
    ep_acc = 0
    for item in data:

        assert item.xs.shape==item.ys.shape, f'x,y 维度不匹配. x:{item.xs.shape} y:{item.ys.shape}'

        if train:
            model.zero_grad()

        pred_y = model(Variable(item.xs),item.end_ids,item.sqz_ids) #bptt*batch_size*classes

        true_y = item.ys

        _,pred_l = pred_y.max(2)

        pred_l = pred_l.data.cpu().numpy() #bptt*batch_size
        true_l = true_y.cpu().numpy()

        ct_acc = (pred_l==true_l).sum()/pred_l.size

        # Moving average of the batch accuracy; the first batch (or any batch
        # following an average of exactly 0) seeds it directly.
        ep_acc = 0.99*ep_acc+0.01*ct_acc if ep_acc>0 else ct_acc

        loss_txt = ''
        if loss_f is not None:
            loss = loss_f(pred_y.view(-1,pred_y.size(-1)),Variable(true_y.view(-1)))
            closs = loss.data.cpu().numpy()
            ep_loss.append(closs)
            loss_txt = f'  loss:{float(closs):.4f}'

            if train:
                loss.backward()
                opt.step()

        # Elapsed minutes and a linear estimate of the minutes remaining,
        # based on how many documents the iterator has handed out.
        ptime = (time.time()-stime)/60
        bind = max(data.get_curid(),1)
        rtime = ptime*(tind-bind)/bind
        print(f'\rProgress: {ptime:.2f}<{rtime:.2f}{loss_txt}',end="")

    return {'ep_loss':ep_loss,'ep_acc':ep_acc}
    

In [136]:
import time
m = torch.load(DATA_PATH/'char_model_pbtt50',map_location=lambda storage,loc:storage)
cm = ClasLSTM(m)

In [58]:
trn_dm.mini_num(-1)
val_dm.mini_num(-1)
opt_fn = torch.optim.Adam(cm.parameters(),lr=0.0001)
train_cm(cm,trn_dm,val_dm,opt_fn,nn.NLLLoss(),1, useGPU=True)

epoch  train_loss  val_loss  train_acc  val_acc
    0      1.6918    9.5757     0.9401   0.6755
    1      1.6288    9.8589     0.9476   0.6746
    2      1.5848   10.8773     0.9524   0.6705


In [59]:
torch.save(cm,DATA_PATH/'classify_model_pbtt50')

  "type " + obj.__name__ + ". It won't be checked "


In [103]:
torch.cuda.empty_cache()

In [106]:
cm = torch.load(DATA_PATH/'classify_model_pbtt50',map_location=lambda storage,loc:storage)

## 预测测试集

In [None]:
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
import pickle
import pandas as pd

# Reload the vocabulary saved earlier in the notebook.
with open(DATA_PATH/'itos.pkl', 'rb') as f:
    itos = pickle.load(f)
    
with open(DATA_PATH/'stoi.pkl', 'rb') as f:
    stoi_ = pickle.load(f)
    # Wrap in a defaultdict so tokens missing from the vocabulary map to id 0.
    stoi = defaultdict(lambda:0,{k:v for k,v in stoi_.items()})

# Load the test set and drop the columns this notebook does not use.
df_test = pd.read_csv(PATH/'test_set.csv')
df_test.drop(columns = ['word_seg','id'], inplace = True)

texts_t = df_test['article'].values

In [170]:
# Tokenize and encode the test articles with the training vocabulary.
toks_t = get_tokens(texts_t)
# Documents differ in length, so the ids are kept as a 1-D object array of
# lists; dtype=object must be explicit because recent NumPy releases raise a
# ValueError for ragged input without it.
ids_t = np.array([[stoi[c] for c in t] for t in toks_t], dtype=object)

np.save(DATA_PATH/'ids_test.npy',ids_t)

In [None]:
# ids_test.npy stores a ragged object array; NumPy >= 1.16.3 only loads it when
# pickling is explicitly allowed (safe here: the file is written by this notebook).
ids_t = np.load(DATA_PATH/'ids_test.npy', allow_pickle=True)

In [171]:
bptt,bs = 50,64
tst_dm = DocModel(ids_t,None,bs,bptt,stoi['_pad_'])

In [172]:
from sklearn.metrics import classification_report
from collections import Counter
def predict_cm(model,data,useGPU=True):
    """Predict one class per document with `model` in evaluation mode.

    model: classifier accepted by avg_one.
    data: document batch iterator accepted by avg_one.
    useGPU: move model and batches to the GPU when CUDA is available.
    Returns whatever avg_one returns for (model, data).
    """
    run_on_gpu = useGPU and torch.cuda.is_available()
    if run_on_gpu:
        model = model.cuda()
        data.cuda()

    # Evaluation mode for the whole prediction pass.
    model.eval()

    predictions = avg_one(model,data)
    return predictions
    
def avg_one(model,data):
    """Predict one class per document by majority vote over its windows.

    For every batch the class predicted at the LAST position of each column is
    taken as that window's vote; a document's prediction is its most common
    vote. Document ids are expected to be exactly 0 .. n-1.

    Returns a list with one predicted class per document, ordered by document id.
    """
    started = time.time()
    total_docs = len(data) # total number of documents
    model.init_hidden()

    votes = dict()
    for batch in data:
        scores = model(Variable(batch.xs),batch.end_ids,batch.sqz_ids) #bptt*batch_size*classes

        # Index of the best class at every position, then keep the last row only.
        best = scores.max(2)[1]
        last_row = best.data.cpu().numpy()[-1,:] #batch_size

        for col,doc_id in enumerate(batch.ids):
            votes.setdefault(doc_id,[]).append(last_row[col])

        # Elapsed minutes and a linear estimate of the minutes remaining.
        elapsed = (time.time()-started)/60
        done = data.get_curid()
        remaining = elapsed*(total_docs-done)/done
        print(f'\rProgress: {elapsed:.2f}<{remaining:.2f}',end="")

    return [Counter(votes[doc_id]).most_common(1)[0][0] for doc_id in range(len(votes))]

def last(model,data):
    """Predict one class per document from the last window of each document.

    For every batch the class predicted at the LAST position of each column is
    recorded for that column's document, overwriting earlier windows, so the
    final value comes from the document's last window. Document ids are
    expected to be exactly 0 .. n-1.

    Returns a list with one predicted class per document, ordered by document id.
    """
    started = time.time()
    total_docs = len(data) # total number of documents
    model.init_hidden()

    latest = dict()
    for batch in data:
        scores = model(Variable(batch.xs),batch.end_ids,batch.sqz_ids) #bptt*batch_size*classes

        # Index of the best class at every position, then keep the last row only.
        best = scores.max(2)[1]
        last_row = best.data.cpu().numpy()[-1,:] #batch_size

        # Later windows of the same document replace earlier ones.
        latest.update((doc_id,last_row[col]) for col,doc_id in enumerate(batch.ids))

        # Elapsed minutes and a linear estimate of the minutes remaining.
        elapsed = (time.time()-started)/60
        done = data.get_curid()
        remaining = elapsed*(total_docs-done)/done
        print(f'\rProgress: {elapsed:.2f}<{remaining:.2f}',end="")

    return [latest[doc_id] for doc_id in range(len(latest))]


In [173]:
pred_ys = predict_cm(cm,tst_dm,True)

batch size is different self.bs:1 x.bs:64!
Progress: 5.13<0.00

In [164]:
from sklearn.metrics import classification_report

print(classification_report(trn_dm.y[:5000], pred_ys))

             precision    recall  f1-score   support

          0       0.70      0.49      0.58       250
          1       0.87      0.71      0.78       143
          2       0.94      0.82      0.87       390
          3       0.89      0.88      0.88       168
          4       0.75      0.91      0.82       109
          5       0.94      0.85      0.89       317
          6       0.80      0.75      0.77       162
          7       0.65      0.86      0.74       356
          8       0.97      0.87      0.92       387
          9       0.85      0.70      0.77       243
         10       0.66      0.80      0.72       187
         11       0.80      0.63      0.70       276
         12       0.75      0.80      0.77       383
         13       0.76      0.83      0.79       323
         14       0.89      0.93      0.91       381
         15       0.84      0.23      0.36       138
         16       0.82      0.79      0.80       155
         17       0.83      0.90      0.86   

In [177]:
# Build the submission table: one row per prediction, in the order of pred_ys.
df_result = pd.DataFrame({'class':pred_ys})
# Add 1 to undo the "-1" shift applied to the training labels when they were loaded.
df_result['class'] = df_result['class']+1
# The DataFrame index is written out as the 'id' column.
df_result.to_csv(DATA_PATH/'result_char.csv',index_label='id')

In [175]:
len(pred_ys)

102277