# 任务一：基于机器学习的文本分类

In [1]:
import collections
import os
import random
import numpy as np
import pandas as pd
import time
import sys
sys.path.append('..')

### 第一步：数据获取与处理

In [2]:
df_train=pd.read_csv('E:/NLP_jupyternotebook/Fudan_NLP_beginner/data/train.tsv',header=0,delimiter='\t')
df_test=pd.read_csv('E:/NLP_jupyternotebook/Fudan_NLP_beginner/data/test.tsv',header=0,delimiter='\t')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [3]:
df_train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
print(df_train.Sentiment.value_counts()/df_train.Sentiment.count())

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64


In [5]:
df_train.Phrase.str.len().max()

283

#### 由于只给了一个数据，要将数据集划分为训练集和测试集

In [6]:
X=df_train['Phrase']
y=df_train['Sentiment']
ceshi_data=df_test['Phrase']
all_data=list(X)
all_lables=list(y)

In [7]:
len(all_data)

156060

In [8]:
all_data[:20]

['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
 'A series of escapades demonstrating the adage that what is good for the goose',
 'A series',
 'A',
 'series',
 'of escapades demonstrating the adage that what is good for the goose',
 'of',
 'escapades demonstrating the adage that what is good for the goose',
 'escapades',
 'demonstrating the adage that what is good for the goose',
 'demonstrating the adage',
 'demonstrating',
 'the adage',
 'the',
 'adage',
 'that what is good for the goose',
 'that',
 'what is good for the goose',
 'what',
 'is good for the goose']

#### 分词与词典：采用2-gram词袋模型

In [9]:
def get_tokenized_sentiment(data):
    def tokenizer(text):
        return[tok.lower() for tok in text.split(' ')]
    tokenized_data=[tokenizer(review) for review in data]
    def ngram(tokenized_data):
        data=[]
        for text in tokenized_data:
            if len(text)==1:
                data.append(text)
            else:
                ng=[(a+' '+b)for a,b in zip(text[:-1],text[1:])]
                data.append(ng)
        return data
    ngram_data=ngram(tokenized_data)
    return ngram_data
ngram_data=get_tokenized_sentiment(all_data)

In [10]:
def get_vocab(ngram_data):
    counter=collections.Counter([tk for st in ngram_data for tk in st])
    idx_to_char=[item[0] for item in counter.items()]
    char_to_idx=dict([(char,idx) for idx,char in enumerate(idx_to_char)])
    return idx_to_char,char_to_idx
idx_to_char,char_to_idx=get_vocab(ngram_data)
print('字典长度为:%d'%len(idx_to_char))

字典长度为:100664


In [11]:
c=list(zip(all_data,all_lables))
random.shuffle(c)
all_data[:],all_lables[:]=zip(*c)
length_train=int(len(all_data)*0.8)
train_data=all_data[:length_train]
train_lables=all_lables[:length_train]
test_data=all_data[length_train:]
test_lables=all_lables[length_train:]
len(train_data),len(train_lables),len(test_data),len(test_lables)

(124848, 124848, 31212, 31212)

In [12]:
ngram_train=get_tokenized_sentiment(train_data)
ngram_test=get_tokenized_sentiment(test_data)

In [16]:
def sentence2idx(sentence,char_to_idx):
    try:
        return[char_to_idx[token]for token in sentence]
    except (KeyError,TypeError):
        print(sentence)

In [92]:
def dataloader(train_data,label,batch_size):
    data=[]
    batch_num=len(train_data)//batch_size
    #print(batch_num)
    for i in range(batch_num):
        x=train_data[max(i*batch_size,0):min((i+1)*batch_size,len(train_data))]
        y=label[max(i*batch_size,0):min((i+1)*batch_size,len(train_data))]
        ngram=[sentence2idx(sentence,char_to_idx) for sentence in x]
        data.append((ngram,y))
    return data

测试迭代器

In [93]:
batch_size=16
train_iter=dataloader(ngram_train,train_lables,batch_size)
test_iter=dataloader(ngram_test,test_lables,batch_size)

In [94]:
for x,y in train_iter:
    print(x,y,len(x),len(y))
    break

[[95702], [73172], [8798, 65928, 65929], [11927], [32635, 45577], [81190], [95018, 95019], [10735, 59234, 4975], [80529, 80530], [311, 50950], [68204, 90006, 90007, 90008, 24947, 24948, 1817, 90009, 90010, 53022, 2049, 10928, 90011, 90012, 90013, 90014, 90015], [96796], [4405, 56836, 56837], [18898, 6650, 5903, 64888, 64889, 9136, 64890, 64891], [7298, 8840], [2015, 27036, 27037, 27038, 27039, 27040, 5130, 27041, 27042]] [2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 4, 2, 3, 1, 2, 3] 16 16


至此，数据集已经创建完毕，每次通过next(iter)调用出的数据已经是2gram词索引序列和对应的label了

### 第二步：建立模型

feature的维度为vocab_size=100664，分类的n_class=5，选择softmax regression分类器，以交叉熵损失函数作为loss，采用随机梯度下降SGD

In [109]:
from tqdm import tqdm
def softmax(x):
    x_exp=np.exp(x)
    partion=np.sum(x_exp,axis=1,keepdims=True)
    return x_exp/partion
def feature(x):
    batch_size=len(x)
    feature_size=len(idx_to_char)
    inputs=np.zeros((batch_size,feature_size))
    #print(x)
    #print(inputs)
    for b,i in enumerate(x):
        for idx in i:
            inputs[b][idx]=1
    return inputs
def train(train_data,test_data,lr,num_epoch,W,b,batch_size):
    for epoch in range(num_epoch): 
        l_sum,start,n=0.0,time.time(),0
        train_iter=iter(train_data)
        test_iter=iter(test_data)
        for x,y in tqdm(train_iter):
            x=feature(x) #[batch_size,feature]
            probability=softmax(np.matmul(x,W)+b) #[batch_size,n_class]
            #print(probability.shape,len(y))
            loss= np.sum(-np.log(probability[range(probability.shape[0]), y]))
            grad_w,grad_b=backward(x,probability,y)
            #print(x.shape,probability.shape,loss.shape,grad_w.shape,grad_b.shape)
            #print(grad_w,grad_b)
            W=W-lr*grad_w
            b=b-lr*grad_b
            l_sum+=loss
            n+=1
            #print(loss)
        print("epoch %d ,loss %.3f ,test_acc %.2f,time %.2f"%(epoch+1,l_sum/n,evaluate(test_iter,W,b),time.time()-start))  
    return W,b
def backward(x,probability,y):
    probability[range(probability.shape[0]), y]-=1
    dw=x.T.dot(probability)/batch_size #feature_size*n_class
    db=np.sum(probability,axis=0)/batch_size #n_class
    return dw,db
def evaluate(test_iter,W,b):
    right=0.0
    n=0.0
    for x,y in test_iter:
        n+=batch_size
        x=feature(x)
        probability=softmax(np.matmul(x,W)+b)
        right+=np.sum(np.argmax(probability,axis=1)==y)
    return right/n

In [110]:
batch_size=64
train_iter=dataloader(ngram_train,train_lables,batch_size)
test_iter=dataloader(ngram_test,test_lables,batch_size)
feature_size=len(idx_to_char)
n_class=5
W=np.random.normal(0,0.01,(feature_size,n_class))
b=np.zeros(n_class)
lr,num_epoch=0.01,10

In [111]:
train(train_iter,test_iter,lr,num_epoch,W,b,batch_size)

1950it [01:00, 32.14it/s]
3it [00:00, 29.20it/s]

epoch 1 ,loss 84.937 ,test_acc 0.51,time 68.37


1950it [01:00, 32.14it/s]
4it [00:00, 32.61it/s]

epoch 2 ,loss 81.675 ,test_acc 0.51,time 68.32


1950it [01:00, 32.11it/s]
4it [00:00, 32.09it/s]

epoch 3 ,loss 81.237 ,test_acc 0.51,time 68.43


1950it [01:00, 32.23it/s]
4it [00:00, 31.09it/s]

epoch 4 ,loss 80.892 ,test_acc 0.51,time 68.18


1950it [01:00, 32.16it/s]
4it [00:00, 33.15it/s]

epoch 5 ,loss 80.584 ,test_acc 0.51,time 68.24


1950it [01:01, 31.81it/s]
4it [00:00, 30.38it/s]

epoch 6 ,loss 80.302 ,test_acc 0.51,time 68.90


1950it [01:00, 32.16it/s]
4it [00:00, 33.70it/s]

epoch 7 ,loss 80.040 ,test_acc 0.51,time 68.32


1950it [01:00, 31.97it/s]
4it [00:00, 33.15it/s]

epoch 8 ,loss 79.795 ,test_acc 0.51,time 68.66


1950it [01:00, 32.21it/s]
4it [00:00, 32.34it/s]

epoch 9 ,loss 79.563 ,test_acc 0.51,time 68.14


1950it [01:00, 32.46it/s]


epoch 10 ,loss 79.343 ,test_acc 0.52,time 68.10


一开始Loss有所下降，然后就几乎不变，测试集的准确率没有改变，经过一轮epoch就陷入了局部最优，也可能是模型过于简单导致的