In [1]:
# import libs
import pandas as pd
import numpy as np
import sklearn as sk
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext.legacy.data as data
from torchtext.vocab import Vectors

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

import jieba
import os
import re

# Preprocessing:

In [2]:
# read dataset
train = pd.read_csv('./train.csv')

In [3]:
test = pd.read_csv('./test.csv')

In [4]:
unlabeled = pd.read_csv('./Unlabeled.csv')

In [6]:
train.head(2)

Unnamed: 0,Ofiicial Account Name,Title,News Url,Image Url,Report Content,label
0,环球人物,中国反腐风刮到阿根廷，这个美到让人瘫痪的女总统，因为8个本子摊上大事了,http://mp.weixin.qq.com/s?__biz=MTAzNDI4MDc2MQ...,http://mmbiz.qpic.cn/mmbiz_jpg/hpcO6kWnPm6cX3M...,内容不符,0
1,西湖之声,腾讯为《如懿传》道歉？这部3亿大剧上映第一天遭网友狂吐槽：愣是拍成村头恋曲...,http://mp.weixin.qq.com/s?__biz=MTA2Mjk0MTE2MA...,http://mmbiz.qpic.cn/mmbiz_jpg/vQCGoQzHAbaAXRr...,满口胡言,0


In [30]:
test.head(2)

Unnamed: 0,Ofiicial Account Name,Title,News Url,Image Url,Report Content,label
0,私家车第一广播,国务院宣布：生孩子有补助了！明年1月起实施，浙江属于这档！,http://mp.weixin.qq.com/s?__biz=MTA1NTc0MjE0MA...,http://mmbiz.qpic.cn/mmbiz_jpg/j27ttKHs7TlFAL5...,国务院没有发布过类似信息,0
1,杭州交通918,"4个年轻帅小伙突然人没了, 身亡真相惊呆所有人! 太可惜了",http://mp.weixin.qq.com/s?__biz=MTA5Mzc3MDQyMA...,http://mmbiz.qpic.cn/mmbiz_jpg/0y9ibmULDTbDuCt...,？？？？,0


In [8]:
unlabeled.head(2)

Unnamed: 0,Image Url,News Url,Ofiicial Account Name,Report Content,Title
0,http://mmbiz.qpic.cn/mmbiz_jpg/hNIfUeDqtnzpxX5...,http://mp.weixin.qq.com/s?__biz=MTAyNTI4NDgyMQ...,电子竞技,所属内容不实,直言不讳 | 为什么要包容RNG？
1,http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3qFM10...,http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...,腾讯大秦网,欺诈,31省份最低工资排行出炉：上海2420最高，陕西是……


In [9]:
# cut dfs to fit size of dfs in paper

In [10]:
train_cut = pd.concat([train[train["label"] == 0][:2000], train[train["label"] == 1][:2000]])
train_cut["label"].value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [11]:
test_cut = pd.concat([test[test["label"] == 0][:1600], test[test["label"] == 1][:1400]])
test_cut["label"].value_counts()

0    1600
1    1400
Name: label, dtype: int64

In [12]:
unlabeled_cut = unlabeled[:30000]
unlabeled_cut.shape

(30000, 5)

In [13]:
# drop useless columns for feature extraction
temp = train_cut[["Report Content", "label"]]
temp.to_csv("train_annotator.csv", index = False)
temp = test_cut[["Report Content", "label"]]
temp.to_csv("test_annotator.csv", index = False)
temp = unlabeled_cut[["Report Content"]]
temp.to_csv("unlabeled_annotator.csv", index = False)

# split dataset for supervised learning of neural nets
temp = train_cut[["Title", "label"]]
temp.to_csv("train_supervise.csv", index = False)
temp = test_cut[["Title", "label"]]
temp.to_csv("test_supervise.csv", index = False)
# temp = unlabeled_cut[["Report Content"]]
# temp.to_csv("unlabeled_annotator.csv", index = False)

# Logisitic annotator

In [14]:
# 
regex = re.compile(r'[^\u4e00-\u9fa5aA-Za-z0-9]')


def word_cut(text):
    text = regex.sub(' ', text)
    return [word for word in jieba.cut(text) if word.strip()]


def clean_str(string):
    string = re.sub(r"[0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

In [124]:
train_cut["Report Content"] + train_cut["Title"]

0                 内容不符中国反腐风刮到阿根廷，这个美到让人瘫痪的女总统，因为8个本子摊上大事了
1            满口胡言腾讯为《如懿传》道歉？这部3亿大剧上映第一天遭网友狂吐槽：愣是拍成村头恋曲...
2                            ？ 顺风车司机奸杀20岁女乘客，落网视频曝光！滴滴道歉…
3        领个屁证，过你妹的七夕，几天前的图在今天拿来博眼球偶遇鹿晗关晓彤旅行过七夕，小情侣是真滴甜...
4                           事件不实。赵丽颖和冯绍峰即将公布恋情？网友：曝不曝没区别啊
                              ...                        
9839                                          不实倪萍大姐终于走了！
9840      标题与内容不符##骗点击##欺骗阅读者。马容竟然去了非诚匆扰，一出场24盏灯全灭，孟爷爷都笑了
9841             主题于内容不符，欺骗读者##信息不实杨幂证实已离婚独自带娃！刘恺威终于正面回应！
9842                  内容##题目虚假41岁刘涛出轨选择离婚，震惊娱乐圈，她将何去何从...
9843    标题不实，为了打广告##刘涛怎么外遇了，说那么多不就是想让大家买你那化妆品吗41岁刘涛外遇选...
Length: 4000, dtype: object

In [163]:
train_cut_label = train_cut["label"]
train_cut_corpus = train_cut["Report Content"]
y_test = test_cut.label
X_test = test_cut["Report Content"]
X_train, X_val, y_train, y_val = train_test_split(train_cut_corpus, train_cut_label, test_size=0.2, random_state=42)

In [164]:
logit_vec = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = logit_vec.fit_transform(X_train)
X_val_tfidf = logit_vec.transform(X_val)
X_test_tfidf = logit_vec.transform(X_test)

In [165]:
logit_annotator = LogisticRegression()
logit_annotator.fit(X_train_tfidf, y_train)
print(classification_report(y_test, logit_annotator.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.6843    0.7113    0.6975      1600
           1     0.6545    0.6250    0.6394      1400

    accuracy                         0.6710      3000
   macro avg     0.6694    0.6681    0.6685      3000
weighted avg     0.6704    0.6710    0.6704      3000



# Logisitic annotator concat

In [135]:
train_cut_label = train_cut["label"]
train_cut_corpus = train_cut["Report Content"] + train_cut["Title"]
y_test = test_cut.label
X_test = test_cut["Report Content"] + test_cut["Title"]
X_train, X_val, y_train, y_val = train_test_split(train_cut_corpus, train_cut_label, test_size=0.2, random_state=42)

In [144]:
logit_con = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = logit_con.fit_transform(X_train)
X_val_tfidf = logit_con.transform(X_val)
X_test_tfidf = logit_con.transform(X_test)

In [142]:
logit_concat = LogisticRegression()
logit_concat.fit(X_train_tfidf, y_train)
print(classification_report(y_test, logit_concat.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.7265    0.8850    0.7980      1600
           1     0.8249    0.6193    0.7075      1400

    accuracy                         0.7610      3000
   macro avg     0.7757    0.7521    0.7527      3000
weighted avg     0.7724    0.7610    0.7557      3000



# CNN feature extractor & Annotator

In [18]:
#Creating field for text and label
TEXT = data.Field(sequential=True, tokenize=word_cut)
LABEL = data.Field(sequential=False)

# TEXT.preprocessing = data.Pipeline(clean_str)

In [19]:
train_datafield = [('text', TEXT),  ('label', LABEL)]
train_cnn = data.TabularDataset(path ='./train_annotator.csv',  
                             format='csv',
                             skip_header=True,
                             fields=train_datafield)


#%%
test_datafield = [('text', TEXT),  ('label',LABEL)]

test_cnn = data.TabularDataset(path ='./test_annotator.csv', 
                       format='csv',
                       skip_header=True,
                       fields=test_datafield)

In [20]:
train_cnn[0].text

['内容', '不符']

In [21]:
train_cnn[0].label

'0'

In [22]:
# build vocab
TEXT.build_vocab(train_cnn)
LABEL.build_vocab(train_cnn)
vocab = TEXT.vocab

In [23]:
TEXT.vocab.stoi['内容']

5

In [24]:
train_iter = data.Iterator(
        train_cnn, 
        batch_size=64,
        device=torch.device('cuda'), 
        sort_within_batch=False,
        repeat=False)

test_iter = data.Iterator(test_cnn, batch_size=64, device=torch.device('cuda'), 
                     sort_within_batch=False, repeat=False)

In [25]:
# CNN module
# output binary decision of given input report contents
class textCNN(nn.Module):
    
    def __init__(self, vocab_built, emb_dim, dim_channel, kernel_wins, num_class):
        super(textCNN, self).__init__()
        #load pretrained embedding in embedding layer.
        self.embed = nn.Embedding(len(vocab_built), emb_dim)
        # self.embed.weight.data.copy_(vocab_built.vectors)
    
        #Convolutional Layers with different window size kernels
        self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, emb_dim)) for w in kernel_wins])
        #Dropout layer
        self.dropout = nn.Dropout(0.6)
        
        #FC layer
        self.fc = nn.Linear(len(kernel_wins)*dim_channel, num_class)
        
    def forward(self, x):
        emb_x = self.embed(x)
        emb_x = emb_x.unsqueeze(1)

        con_x = [conv(emb_x) for conv in self.convs]

        pool_x = [F.max_pool1d(x.squeeze(-1), x.size()[2]) for x in con_x]
        
        fc_x = torch.cat(pool_x, dim=1)
        
        fc_x = fc_x.squeeze(-1)

        fc_x = self.dropout(fc_x)
        logit = self.fc(fc_x)
        return logit

In [78]:
# train, eval for module
def train(model, device, train_itr, optimizer, epoch, max_epoch):
    model.train()
    corrects, train_loss = 0.0,0
    for batch in train_itr:
        text, target = batch.text, batch.label
        text = torch.transpose(text,0, 1)
        target.data.sub_(1)
        text, target = text.to(device), target.to(device)
        optimizer.zero_grad()
        logit = model(text)
        
        loss = F.cross_entropy(logit, target)
        loss.backward()
        optimizer.step()
        
        train_loss+= loss.item()
        result = torch.max(logit,1)[1]
        corrects += (result.view(target.size()).data == target.data).sum()
    
    size = len(train_itr.dataset)
    train_loss /= size 
    accuracy = 100.0 * corrects/size
  
    return train_loss, accuracy
    
def valid(model, device, test_itr):
    model.eval()
    corrects, test_loss = 0.0,0
    fake_tp, fake_target_truth, fake_predicted_truth = 0, 0, 0
    real_tp, real_target_truth, real_predicted_truth = 0, 0, 0
    for batch in test_itr:
        text, target = batch.text, batch.label
        text = torch.transpose(text,0, 1)
        target.data.sub_(1)
        text, target = text.to(device), target.to(device)
        logit = model(text)
        loss = F.cross_entropy(logit, target)

        
        test_loss += loss.item()
        # output prediction class with argmax
        result = torch.max(logit,1)[1]
        
        # get tp, fp, fn of each batch
        confusion_vector = result / target
        fake_target_truth += torch.sum(result == 0).item()
        real_target_truth += torch.sum(result == 1).item()
        fake_predicted_truth += torch.sum(target == 0).item()
        real_predicted_truth += torch.sum(target == 1).item()
        fake_tp += torch.sum(torch.isnan(confusion_vector)).item()
        real_tp += torch.sum(confusion_vector == 1).item()
        corrects += (result.view(target.size()).data == target.data).sum()
    size = len(test_itr.dataset)
    test_loss /= size 
    accuracy = 100.0 * corrects/size

    # calculate evaluation metrics
    # fake news as tp
    fake_recall = fake_tp/fake_target_truth
    fake_precision = fake_tp/fake_predicted_truth
    fake_f1 = 2 * fake_precision * fake_recall/(fake_precision + fake_recall)

    # real news as tp
    real_recall = real_tp/real_target_truth
    real_precision = real_tp/real_predicted_truth
    real_f1 = 2 * real_precision * real_recall/(real_precision + real_recall)
    stat = {
        'fake_recall': fake_recall,
        'fake_precision': fake_precision,
        'fake_f1': fake_f1,
        'real_recall': real_recall,
        'real_precision': real_precision,
        'real_f1': real_f1,
    }
    
    return test_loss, accuracy, stat

In [28]:
# fine tuning CNN annotator
annotator = textCNN(vocab, 128, 40, [1,2,3,4,5,6] , 2).to('cuda')
train_loss = []
train_acc = []
test_loss = []
test_acc = []
best_test_acc = -1

# Use GPU if it is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#optimizer
optimizer = optim.Adam(annotator.parameters(), lr=0.001)

# fine tuning
for epoch in range(1, 30):
    #train loss
    tr_loss, tr_acc = train(annotator, device, train_iter, optimizer, epoch, 100)
    print('Train Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, tr_loss, tr_acc))
    
    ts_loss, ts_acc, stat = valid(annotator, device, test_iter)
    print('Valid Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, ts_loss, ts_acc))
    
    if ts_acc > best_test_acc:
        best_test_acc = ts_acc
        print(stat)
        #save paras(snapshot)
        print("model saves at {} accuracy".format(best_test_acc))
        torch.save(annotator.state_dict(), "textCNN_best")
        
    train_loss.append(tr_loss)
    train_acc.append(tr_acc)
    test_loss.append(ts_loss)
    test_acc.append(ts_acc)

Train Epoch: 1 	 Loss: 0.011251639351248741 	 Accuracy: 58.72500228881836%
Valid Epoch: 1 	 Loss: 0.010385637879371643 	 Accuracy: 61.79999923706055%
{'fake_recall': 0.6603107344632768, 'fake_precision': 0.584375, 'fake_f1': 0.620026525198939, 'real_recall': 0.5801767676767676, 'real_precision': 0.6678571428571428, 'real_f1': 0.6209369715714471}
model saves at 61.79999923706055 accuracy
Train Epoch: 2 	 Loss: 0.009418256893754005 	 Accuracy: 68.1500015258789%
Valid Epoch: 2 	 Loss: 0.010040067176024119 	 Accuracy: 65.43333435058594%
{'fake_recall': 0.6749533871970168, 'fake_precision': 0.67875, 'fake_f1': 0.6768463695855407, 'real_recall': 0.6304816678648454, 'real_precision': 0.7757142857142857, 'real_f1': 0.6955981282678443}
model saves at 65.43333435058594 accuracy
Train Epoch: 3 	 Loss: 0.008765803605318069 	 Accuracy: 72.0%
Valid Epoch: 3 	 Loss: 0.009958785692850749 	 Accuracy: 64.43333435058594%
Train Epoch: 4 	 Loss: 0.008033710584044456 	 Accuracy: 74.92500305175781%
Valid Epo

# Supervised setting: Predict given labeled data

## split dataset & tokenize

In [30]:
train_cut_label = train_cut["label"]
train_cut_corpus = train_cut["Title"]
y_test = test_cut.label
X_test = test_cut["Title"]
X_train, X_val, y_train, y_val = train_test_split(train_cut_corpus, train_cut_label, test_size=0.2, random_state=42)

In [31]:
X_train.shape

(3200,)

In [32]:
X_val.shape

(800,)

In [33]:
# get tfidf
vec = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = vec.fit_transform(X_train)
X_val_tfidf = vec.transform(X_val)
X_test_tfidf = vec.transform(X_test)

## Linear Regression

In [43]:
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)
print(classification_report(y_test, lr.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.6977    0.8912    0.7827      1600
           1     0.8180    0.5586    0.6638      1400

    accuracy                         0.7360      3000
   macro avg     0.7578    0.7249    0.7232      3000
weighted avg     0.7538    0.7360    0.7272      3000



## Linear SVC

In [42]:
svc = LinearSVC()
svc.fit(X_train_tfidf, y_train)
print(classification_report(y_test, svc.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.7499    0.8844    0.8116      1600
           1     0.8338    0.6629    0.7386      1400

    accuracy                         0.7810      3000
   macro avg     0.7918    0.7736    0.7751      3000
weighted avg     0.7890    0.7810    0.7775      3000



## Random Forest

In [41]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
print(classification_report(y_test, rf.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.7261    0.9113    0.8082      1600
           1     0.8569    0.6071    0.7107      1400

    accuracy                         0.7693      3000
   macro avg     0.7915    0.7592    0.7595      3000
weighted avg     0.7871    0.7693    0.7627      3000



## CNN

In [79]:
# prepare data for nn
nn_text = data.Field(sequential=True, tokenize=word_cut)
nn_label = data.Field(sequential=False)
train_nn_datafield = [('text', nn_text),  ('label', nn_label)]
test_nn_datafield = [('text', nn_text),  ('label', nn_label)]
train_supervise = data.TabularDataset(path ='./train_supervise.csv',  
                             format='csv',
                             skip_header = True,
                             fields = train_nn_datafield)
test_supervise = data.TabularDataset(path ='./test_supervise.csv', 
                       format='csv',
                       skip_header = True,
                       fields=test_nn_datafield)
nn_text.build_vocab(train_supervise)
nn_label.build_vocab(train_supervise)
nn_vocab = nn_text.vocab

# set iterator for batch optimization
train_iter = data.Iterator(
        train_supervise, 
        batch_size=64,
        device=torch.device('cuda'), 
        sort_within_batch=False,
        repeat=False)

test_iter = data.Iterator(test_supervise, batch_size=64, device=torch.device('cuda'), 
                     sort_within_batch=False, repeat=False)

In [80]:
# train CNN
# fine tuning CNN model
model = textCNN(nn_vocab, 200, 40, [1,2,3,4,5,6] , 2).to('cuda')
train_loss = []
train_acc = []
test_loss = []
test_acc = []
best_test_acc = -1

# Use GPU if it is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# fine tuning
for epoch in range(1, 30):
    #train loss
    tr_loss, tr_acc = train(model, device, train_iter, optimizer, epoch, 100)
    print('Train Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, tr_loss, tr_acc))
    
    ts_loss, ts_acc, stat = valid(model, device, test_iter)
    print('Valid Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, ts_loss, ts_acc))
    
    if ts_acc > best_test_acc:
        best_test_acc = ts_acc
        print(stat)
        #save paras(snapshot)
        print("model saves at {} accuracy".format(best_test_acc))
        torch.save(model.state_dict(), "textCNN_supervise_best")
        
    train_loss.append(tr_loss)
    train_acc.append(tr_acc)
    test_loss.append(ts_loss)
    test_acc.append(ts_acc)

Train Epoch: 1 	 Loss: 0.008621974140405655 	 Accuracy: 70.4000015258789%
Valid Epoch: 1 	 Loss: 0.009188692440589268 	 Accuracy: 72.63333129882812%
{'fake_recall': 0.6780978509373571, 'fake_precision': 0.926875, 'fake_f1': 0.7832057037232638, 'real_recall': 0.8560885608856088, 'real_precision': 0.49714285714285716, 'real_f1': 0.6290103931314958}
model saves at 72.63333129882812 accuracy
Train Epoch: 2 	 Loss: 0.004617636788636446 	 Accuracy: 87.92500305175781%
Valid Epoch: 2 	 Loss: 0.010179349114497502 	 Accuracy: 75.23332977294922%
{'fake_recall': 0.6932792061344158, 'fake_precision': 0.960625, 'fake_f1': 0.8053445113963846, 'real_recall': 0.9195402298850575, 'real_precision': 0.5142857142857142, 'real_f1': 0.6596426935409986}
model saves at 75.23332977294922 accuracy
Train Epoch: 3 	 Loss: 0.002801218992099166 	 Accuracy: 93.32500457763672%
Valid Epoch: 3 	 Loss: 0.01182908237973849 	 Accuracy: 76.46666717529297%
{'fake_recall': 0.7073283858998145, 'fake_precision': 0.953125, 'fake

# Weekly supervised:

## Using previously trained annotator to generate weak label

In [44]:
# generate weak label using annotator
def predict(model, device, unlabeled):
    model.eval()
    for index, row in unlabeled.iterrows():
        text = TEXT.preprocess(row['Report Content'])
        text = [[TEXT.vocab.stoi[x] for x in text]]
        if len(text[0]) <= 6:
            for j in range(6 - len(text[0])):
                text[0].append(1)
        text = torch.Tensor(text).long()
        # text = torch.transpose(text, 0, 1)
        text = text.to(device)

        # target.data.sub_(1)
        logit = model(text)

        # output prediction class with argmax
        result = torch.max(logit,1)[1]

        # assign label to row
        unlabeled.loc[index, "weak label"] = result[0].item()

In [45]:
unlabeled["weak label"] = 0

In [46]:
predict(annotator, device, unlabeled)

In [146]:
unlabeled.head()

Unnamed: 0,Image Url,News Url,Ofiicial Account Name,Report Content,Title,weak label,lr label,lr prob
0,http://mmbiz.qpic.cn/mmbiz_jpg/hNIfUeDqtnzpxX5...,http://mp.weixin.qq.com/s?__biz=MTAyNTI4NDgyMQ...,电子竞技,所属内容不实,直言不讳 | 为什么要包容RNG？,0,0,0.531563
1,http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3qFM10...,http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...,腾讯大秦网,欺诈,31省份最低工资排行出炉：上海2420最高，陕西是……,1,1,0.680933
2,http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3pia6u...,http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...,腾讯大秦网,清谷田园并未使用青岛工厂提供的原料##该品牌果汁的原料跟发生烂苹果事件的工厂无任何关系。##...,可怕！国产果汁潜规则曝光，2毛一斤腐烂果被加工成高端果汁！,0,0,0.685111
3,http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3pia6u...,http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...,腾讯大秦网,鱼化寨要拆了不实信息,鱼化寨要拆了？再见了，西安“小香港”？,0,1,0.520913
4,http://mmbiz.qpic.cn/mmbiz_jpg/2EVtnKem0SUGprk...,http://mp.weixin.qq.com/s?__biz=MTA0MzM2MTc4MQ...,不弄头发就闹心,我是云南人，没听过这种陋习,云南摸-奶节真实体验 场面不忍直视,0,0,0.617848


## label with concat annotator

In [150]:
unlabeled_tfidf = logit_con.transform(unlabeled_cut["Report Content"] + unlabeled_cut["Title"])
prediction = logit_concat.predict(unlabeled_tfidf)
unlabeled_con = unlabeled_cut
unlabeled_con["con label"] = prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [154]:
unlabeled_label = unlabeled_con["con label"]
unlabeled_title = unlabeled_con["Title"]
train_label = train_cut["label"]
train_title = train_cut["Title"]

In [155]:
train_weak_label = unlabeled_label
train_weak_corpus = unlabeled_title
y_test = test_cut.label
X_test = test_cut["Report Content"] + test_cut["Title"]
X_train, X_val, y_train, y_val = train_test_split(train_weak_corpus, train_weak_label, test_size=0.2, random_state=42)

In [156]:
# get tfidf
vec = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = vec.fit_transform(X_train)
X_val_tfidf = vec.transform(X_val)
X_test_tfidf = vec.transform(X_test)

In [157]:
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)
print(classification_report(y_test, lr.predict(X_test_tfidf), digits = 4))

              precision    recall  f1-score   support

           0     0.6196    0.9744    0.7575      1600
           1     0.9153    0.3164    0.4703      1400

    accuracy                         0.6673      3000
   macro avg     0.7675    0.6454    0.6139      3000
weighted avg     0.7576    0.6673    0.6235      3000



In [158]:
svc = LinearSVC()
svc.fit(X_train_tfidf, y_train)
print(classification_report(y_test, svc.predict(X_test_tfidf), digits = 4))

              precision    recall  f1-score   support

           0     0.6514    0.9425    0.7704      1600
           1     0.8657    0.4236    0.5688      1400

    accuracy                         0.7003      3000
   macro avg     0.7585    0.6830    0.6696      3000
weighted avg     0.7514    0.7003    0.6763      3000



In [159]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
print(classification_report(y_test, rf.predict(X_test_tfidf), digits = 4))

              precision    recall  f1-score   support

           0     0.6185    0.9637    0.7535      1600
           1     0.8856    0.3207    0.4709      1400

    accuracy                         0.6637      3000
   macro avg     0.7521    0.6422    0.6122      3000
weighted avg     0.7432    0.6637    0.6216      3000



## label with logit annotator

In [48]:
unlabeled_tfidf = logit_vec.transform(unlabeled["Report Content"])
prediction = logit_annotator.predict(unlabeled_tfidf)

In [49]:
prob = logit_annotator.predict_proba(unlabeled_tfidf)

In [50]:
unlabeled_lr = unlabeled

In [51]:
unlabeled_lr['lr label'] = prediction
unlabeled_lr['lr prob'] = np.amax(prob, axis = 1)

## Perform detection on train and unlabeled dataset combined

### With CNN annotator

In [53]:
unlabeled_label = unlabeled["weak label"][:30000]
unlabeled_title = unlabeled["Title"][:30000]
train_label = train_cut["label"]
train_title = train_cut["Title"]

In [54]:
# split training and testing set
# train_weak_label = pd.concat([train_label, unlabeled_label], ignore_index=True)
# train_weak_corpus = pd.concat([train_title, unlabeled_title], ignore_index=True)
train_weak_label = unlabeled_label
train_weak_corpus = unlabeled_title
y_test = test_cut.label
X_test = test_cut["Title"]
X_train, X_val, y_train, y_val = train_test_split(train_weak_corpus, train_weak_label, test_size=0.2, random_state=42)

In [55]:
X_train.shape

(24000,)

In [56]:
# get tfidf
vec = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = vec.fit_transform(X_train)
X_val_tfidf = vec.transform(X_val)
X_test_tfidf = vec.transform(X_test)

In [61]:
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)
print(classification_report(y_test, lr.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.5911    0.9531    0.7297      1600
           1     0.8214    0.2464    0.3791      1400

    accuracy                         0.6233      3000
   macro avg     0.7063    0.5998    0.5544      3000
weighted avg     0.6986    0.6233    0.5661      3000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [62]:
svc = LinearSVC()
svc.fit(X_train_tfidf, y_train)
print(classification_report(y_test, svc.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.6257    0.8369    0.7160      1600
           1     0.6965    0.4279    0.5301      1400

    accuracy                         0.6460      3000
   macro avg     0.6611    0.6324    0.6231      3000
weighted avg     0.6587    0.6460    0.6293      3000



In [60]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
print(classification_report(y_test, rf.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.6094    0.8794    0.7199      1600
           1     0.7207    0.3557    0.4763      1400

    accuracy                         0.6350      3000
   macro avg     0.6650    0.6175    0.5981      3000
weighted avg     0.6613    0.6350    0.6062      3000



In [63]:
# convert weak supervised training data to csv file
train_weak_cnn = pd.DataFrame(columns=["Title", "label"])
train_weak_cnn["Title"] = train_weak_corpus
train_weak_cnn["label"] = train_weak_label
train_weak_cnn.to_csv("./train_weak.csv", index = False)

In [64]:
# prepare data for nn
nn_text = data.Field(sequential=True, tokenize=word_cut)
nn_label = data.Field(sequential=False)
train_nn_datafield = [('text', nn_text),  ('label', nn_label)]
test_nn_datafield = [('text', nn_text),  ('label', nn_label)]
train_supervise = data.TabularDataset(path ='./train_weak.csv',  
                             format='csv',
                             skip_header = True,
                             fields = train_nn_datafield)
test_supervise = data.TabularDataset(path ='./test_supervise.csv', 
                       format='csv',
                       skip_header = True,
                       fields=test_nn_datafield)
nn_text.build_vocab(train_supervise)
nn_label.build_vocab(train_supervise)
nn_vocab = nn_text.vocab

# set iterator for batch optimization
train_iter = data.Iterator(
        train_supervise, 
        batch_size=64,
        device=torch.device('cuda'), 
        sort_within_batch=False,
        repeat=False)

test_iter = data.Iterator(test_supervise, batch_size=64, device=torch.device('cuda'), 
                     sort_within_batch=False, repeat=False)

In [None]:
# train CNN
# fine tuning CNN model
model = textCNN(nn_vocab, 200, 40, [1,2,3,4,5,6] , 2).to('cuda')
train_loss = []
train_acc = []
test_loss = []
test_acc = []
best_test_acc = -1

# Use GPU if it is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# fine tuning
for epoch in range(1, 30):
    #train loss
    tr_loss, tr_acc = train(model, device, train_iter, optimizer, epoch, 100)
    print('Train Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, tr_loss, tr_acc))
    
    ts_loss, ts_acc, stat = valid(model, device, test_iter)
    print('Valid Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, ts_loss, ts_acc))
    
    if ts_acc > best_test_acc:
        best_test_acc = ts_acc
        print(stat)
        #save paras(snapshot)
        print("model saves at {} accuracy".format(best_test_acc))
        torch.save(model.state_dict(), "textCNN_supervise_best")
        
    train_loss.append(tr_loss)
    train_acc.append(tr_acc)
    test_loss.append(ts_loss)
    test_acc.append(ts_acc)

### With LR annotator

In [82]:
unlabeled_label = unlabeled["lr label"][:30000]
unlabeled_title = unlabeled["Title"][:30000]
train_label = train_cut["label"]
train_title = train_cut["Title"]

In [83]:
# split training and testing set
# train_weak_label = pd.concat([train_label, unlabeled_label], ignore_index=True)
# train_weak_corpus = pd.concat([train_title, unlabeled_title], ignore_index=True)
train_weak_label = unlabeled_label
train_weak_corpus = unlabeled_title
y_test = test_cut.label
X_test = test_cut["Title"]
X_train, X_val, y_train, y_val = train_test_split(train_weak_corpus, train_weak_label, test_size=0.2, random_state=42)

In [69]:
# get tfidf
vec = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = vec.fit_transform(X_train)
X_val_tfidf = vec.transform(X_val)
X_test_tfidf = vec.transform(X_test)

In [71]:
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)
print(classification_report(y_test, lr.predict(X_test_tfidf), digits = 4))

              precision    recall  f1-score   support

           0     0.6506    0.9087    0.7583      1600
           1     0.8092    0.4421    0.5718      1400

    accuracy                         0.6910      3000
   macro avg     0.7299    0.6754    0.6651      3000
weighted avg     0.7246    0.6910    0.6713      3000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [72]:
svc = LinearSVC()
svc.fit(X_train_tfidf, y_train)
print(classification_report(y_test, svc.predict(X_test_tfidf), digits = 4))

              precision    recall  f1-score   support

           0     0.6630    0.7819    0.7175      1600
           1     0.6864    0.5457    0.6080      1400

    accuracy                         0.6717      3000
   macro avg     0.6747    0.6638    0.6628      3000
weighted avg     0.6739    0.6717    0.6664      3000



In [73]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
print(classification_report(y_test, rf.predict(X_test_tfidf), digits = 4))

              precision    recall  f1-score   support

           0     0.6346    0.8694    0.7336      1600
           1     0.7413    0.4279    0.5426      1400

    accuracy                         0.6633      3000
   macro avg     0.6880    0.6486    0.6381      3000
weighted avg     0.6844    0.6633    0.6445      3000



In [84]:
# convert weak supervised training data to csv file
train_weak_cnn = pd.DataFrame(columns=["Title", "label"])
train_weak_cnn["Title"] = train_weak_corpus
train_weak_cnn["label"] = train_weak_label
train_weak_cnn.to_csv("./train_weak.csv", index = False)

In [85]:
# prepare data for nn
nn_text = data.Field(sequential=True, tokenize=word_cut)
nn_label = data.Field(sequential=False)
train_nn_datafield = [('text', nn_text),  ('label', nn_label)]
test_nn_datafield = [('text', nn_text),  ('label', nn_label)]
train_supervise = data.TabularDataset(path ='./train_weak.csv',  
                             format='csv',
                             skip_header = True,
                             fields = train_nn_datafield)
test_supervise = data.TabularDataset(path ='./test_supervise.csv', 
                       format='csv',
                       skip_header = True,
                       fields=test_nn_datafield)
nn_text.build_vocab(train_supervise)
nn_label.build_vocab(train_supervise)
nn_vocab = nn_text.vocab

# set iterator for batch optimization
train_iter = data.Iterator(
        train_supervise, 
        batch_size=64,
        device=torch.device('cuda'), 
        sort_within_batch=False,
        repeat=False)

test_iter = data.Iterator(test_supervise, batch_size=64, device=torch.device('cuda'), 
                     sort_within_batch=False, repeat=False)

In [86]:
# train CNN
# fine tuning CNN model
model = textCNN(nn_vocab, 200, 40, [1,2,3,4,5,6] , 2).to('cuda')
train_loss = []
train_acc = []
test_loss = []
test_acc = []
best_test_acc = -1

# Use GPU if it is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# fine tuning
for epoch in range(1, 30):
    #train loss
    tr_loss, tr_acc = train(model, device, train_iter, optimizer, epoch, 100)
    print('Train Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, tr_loss, tr_acc))
    
    ts_loss, ts_acc, stat = valid(model, device, test_iter)
    print('Valid Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, ts_loss, ts_acc))
    
    if ts_acc > best_test_acc:
        best_test_acc = ts_acc
        print(stat)
        #save paras(snapshot)
        print("model saves at {} accuracy".format(best_test_acc))
        torch.save(model.state_dict(), "textCNN_supervise_best")
        
    train_loss.append(tr_loss)
    train_acc.append(tr_acc)
    test_loss.append(ts_loss)
    test_acc.append(ts_acc)

Train Epoch: 1 	 Loss: 0.0103516497194767 	 Accuracy: 64.01000213623047%
Valid Epoch: 1 	 Loss: 0.010906264046827952 	 Accuracy: 54.46666717529297%
{'fake_recall': 0.539795918367347, 'fake_precision': 0.991875, 'fake_f1': 0.6991189427312775, 'real_recall': 0.7833333333333333, 'real_precision': 0.03357142857142857, 'real_f1': 0.06438356164383562}
model saves at 54.46666717529297 accuracy
Train Epoch: 2 	 Loss: 0.009653223114212354 	 Accuracy: 67.09333038330078%
Valid Epoch: 2 	 Loss: 0.010121845960617066 	 Accuracy: 62.666664123535156%
{'fake_recall': 0.5909090909090909, 'fake_precision': 0.975, 'fake_f1': 0.7358490566037736, 'real_recall': 0.8888888888888888, 'real_precision': 0.22857142857142856, 'real_f1': 0.3636363636363636}
model saves at 62.666664123535156 accuracy
Train Epoch: 3 	 Loss: 0.009187939013044039 	 Accuracy: 69.49666595458984%
Valid Epoch: 3 	 Loss: 0.009624328861633936 	 Accuracy: 66.83333587646484%
{'fake_recall': 0.6370638876302673, 'fake_precision': 0.87875, 'fake_

# Automatically annotated

In [87]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
import random
import math

In [108]:
# compute cosine similarity matrix
feature = logit_vec.transform(unlabeled_lr["Title"][:30000])

In [109]:
cos_sim = cosine_similarity(feature)
cos_sim.shape

(30000, 30000)

In [110]:
# detector
train_cut_label = train_cut["label"]
train_cut_corpus = train_cut["Title"]
y_test = test_cut.label
X_test = test_cut["Title"]
X_train, X_val, y_train, y_val = train_test_split(train_cut_corpus, train_cut_label, test_size=0.2, random_state=42)

vec = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = vec.fit_transform(X_train)
X_val_tfidf = vec.transform(X_val)
X_test_tfidf = vec.transform(X_test)

lr_automatic = LogisticRegression()
lr_automatic.fit(X_train_tfidf, y_train)
print(classification_report(y_val, lr_automatic.predict(X_val_tfidf)))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       422
           1       0.93      0.92      0.92       378

    accuracy                           0.93       800
   macro avg       0.93      0.93      0.93       800
weighted avg       0.93      0.93      0.93       800



In [112]:
# define reinforcement learning pipeline
def action(w1, state):
    # calculate prob given state and weight
    prob = sigmoid(np.dot(w1, state))
    # randomly choose action from gaussian distribution
    x = random.uniform(0,1)
    act = 0
    if x <= prob:
        act = 1
    return act, prob
def get_state(index, row, chosen, prev_state):
    feature = vec.transform([row["Title"]])
    prob = lr_automatic.predict_proba(feature)

    # get log prob from annotator and detector
    prob_annotator = row["lr prob"]
    prob_detector = np.amax(prob, axis = 1)[0]

    # cosine similarity between chosen samples and current one
    cos = max(cos_sim[index][chosen])

    # weak label
    label = row["lr label"]

    # combined
    temp = [prob_annotator, prob_detector, cos, label]

    # taking average on the previously chosen states
    avg = np.mean(prev_state, axis=0)
    
    # return concat state
    return temp, np.concatenate((temp, avg), axis = None)

def sigmoid(x):
    return 1/(1 + math.exp(-x))

def relu(x):
    return np.maximum(0, x)

In [113]:
# main loop of reinforcement learning
# stop until
alpha = 0.001
count = 0
bag_size = 100
bag_count = 0
teach = 0
chosen = [0]
p = []
act_total = []
state_total = []
prev_state = [[0,0,0,0]]
w1 = np.random.randn(8)
best_acc = 0
w_best = None
acc = accuracy_score(y_test, lr_automatic.predict(X_test_tfidf))
for index, row in unlabeled_lr[:30000].iterrows():
    # store the values of prob and action for gradients
    cur_state, all_state = get_state(index, row, chosen, prev_state)
    act, prob = action(w1, all_state)
        
    # record state informatin if chosen
    if act == 1:
        chosen.append(index)
        prev_state.append(cur_state)
        p.append(prob)
    else:
        p.append(1 - prob)
    act_total.append(act)
    state_total.append(all_state)

    # update weights & switch until bag size reached
    if bag_count >= bag_size:
        # compute policy gradient with the gradient of log sigmoid
        # (a - p) * x
        sub = np.array([act_total]) - np.array([p])
        gradient = np.dot(sub, state_total)

        # compute R with acc and acc_k
        selected = unlabeled_lr.iloc[chosen]
        selected_label = selected["lr label"]
        selected_corpus = selected["Title"]
        train_label = pd.concat([y_train, selected_label], ignore_index=True)
        train_corpus = pd.concat([X_train, selected_corpus], ignore_index=True)
        X_train_tfidf = vec.transform(train_corpus)

        # retrain the LR
        lr_temp = LogisticRegression().fit(X_train_tfidf, train_label)
        acc_k = accuracy_score(y_test, lr_temp.predict(X_test_tfidf))
        print("Currently at step " + str(count) + ": ")
        print(acc_k)


        # update weights
        w1 = w1 + alpha * gradient * (acc_k - acc)
        if acc_k > best_acc:
            w_best = w1
        # update parameters
        teach += 1
        bag_count = 0
        chosen = [0]
        prev_state = [[0,0,0,0]]
        p = []
        act_total = []
        state_total = []
        continue
    bag_count += 1
    count += 1

Currently at step 100: 
0.7406666666666667
Currently at step 200: 
0.7373333333333333
Currently at step 300: 
0.751
Currently at step 400: 
0.7346666666666667
Currently at step 500: 
0.733
Currently at step 600: 
0.7436666666666667
Currently at step 700: 
0.7356666666666667
Currently at step 800: 
0.7416666666666667
Currently at step 900: 
0.7363333333333333
Currently at step 1000: 
0.7356666666666667
Currently at step 1100: 
0.7463333333333333
Currently at step 1200: 
0.744
Currently at step 1300: 
0.7413333333333333
Currently at step 1400: 
0.7456666666666667
Currently at step 1500: 
0.7406666666666667
Currently at step 1600: 
0.739
Currently at step 1700: 
0.742
Currently at step 1800: 
0.7416666666666667
Currently at step 1900: 
0.7336666666666667
Currently at step 2000: 
0.7423333333333333
Currently at step 2100: 
0.7556666666666667
Currently at step 2200: 
0.7386666666666667
Currently at step 2300: 
0.7363333333333333
Currently at step 2400: 
0.742
Currently at step 2500: 
0.7493

In [114]:
# Using trained policy network to select samples
chosen = [0]
total_chosen = []
prev_state = [[0,0,0,0]]
acc = accuracy_score(y_test, lr_automatic.predict(X_test_tfidf))
for index, row in unlabeled_lr[:30000].iterrows():
    # store the values of prob and action for gradients
    cur_state, all_state = get_state(index, row, chosen, prev_state)
    act, prob = action(w_best, all_state)
        
    # record state informatin if chosen
    if act == 1:
        chosen.append(index)
        total_chosen.append(index)
        prev_state.append(cur_state)

    # update weights & switch until bag size reached
    if bag_count >= bag_size:
        # update parameters
        bag_count = 0
        chosen = [0]
        prev_state = [[0,0,0,0]]
        continue
    bag_count += 1
    count += 1
    # teacher force

In [115]:
len(total_chosen)

24495

# Without training samples

In [121]:
# get rid of first place holder
selected = unlabeled_lr.iloc[total_chosen[1:]]
selected_label = selected["lr label"]
selected_corpus = selected["Title"]
# train_label = pd.concat([y_train, selected_label], ignore_index=True)
# train_corpus = pd.concat([X_train, selected_corpus], ignore_index=True)
train_label = selected_label
train_corpus = selected_corpus
final_vec = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = final_vec.fit_transform(train_corpus)
X_test_tfidf = final_vec.transform(X_test)

# retrain the LR
lr_final = LogisticRegression().fit(X_train_tfidf, train_label)
print(classification_report(y_test, lr_final.predict(X_test_tfidf), digits = 4))

              precision    recall  f1-score   support

           0     0.6451    0.9281    0.7611      1600
           1     0.8352    0.4164    0.5558      1400

    accuracy                         0.6893      3000
   macro avg     0.7402    0.6723    0.6585      3000
weighted avg     0.7338    0.6893    0.6653      3000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [122]:
svc = LinearSVC()
svc.fit(X_train_tfidf, train_label)
print(classification_report(y_test, svc.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.6639    0.8000    0.7256      1600
           1     0.7015    0.5371    0.6084      1400

    accuracy                         0.6773      3000
   macro avg     0.6827    0.6686    0.6670      3000
weighted avg     0.6814    0.6773    0.6709      3000



In [123]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, train_label)
print(classification_report(y_test, rf.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.6340    0.8781    0.7364      1600
           1     0.7513    0.4207    0.5394      1400

    accuracy                         0.6647      3000
   macro avg     0.6927    0.6494    0.6379      3000
weighted avg     0.6887    0.6647    0.6444      3000



## With training samples

In [117]:
selected = unlabeled_lr.iloc[total_chosen[1:]]
selected_label = selected["lr label"]
selected_corpus = selected["Title"]
train_label = pd.concat([y_train, selected_label], ignore_index=True)
train_corpus = pd.concat([X_train, selected_corpus], ignore_index=True)
final_vec = TfidfVectorizer(tokenizer=word_cut)
X_train_tfidf = final_vec.fit_transform(train_corpus)
X_test_tfidf = final_vec.transform(X_test)

# retrain the LR
lr_final = LogisticRegression().fit(X_train_tfidf, train_label)
print(classification_report(y_test, lr_final.predict(X_test_tfidf), digits = 4))

              precision    recall  f1-score   support

           0     0.7242    0.9175    0.8095      1600
           1     0.8643    0.6007    0.7088      1400

    accuracy                         0.7697      3000
   macro avg     0.7943    0.7591    0.7591      3000
weighted avg     0.7896    0.7697    0.7625      3000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [118]:
svc = LinearSVC()
svc.fit(X_train_tfidf, train_label)
print(classification_report(y_test, svc.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.7510    0.8163    0.7823      1600
           1     0.7669    0.6907    0.7268      1400

    accuracy                         0.7577      3000
   macro avg     0.7589    0.7535    0.7545      3000
weighted avg     0.7584    0.7577    0.7564      3000



In [119]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, train_label)
print(classification_report(y_test, rf.predict(X_test_tfidf), digits=4))

              precision    recall  f1-score   support

           0     0.7347    0.8844    0.8026      1600
           1     0.8277    0.6350    0.7187      1400

    accuracy                         0.7680      3000
   macro avg     0.7812    0.7597    0.7606      3000
weighted avg     0.7781    0.7680    0.7634      3000



In [103]:
# convert weak supervised training data to csv file
train_weak_cnn = pd.DataFrame(columns=["Title", "label"])
train_weak_cnn["Title"] = train_corpus
train_weak_cnn["label"] = train_label
train_weak_cnn.to_csv("./train_auto.csv", index = False)

In [104]:
# prepare data for nn
nn_text = data.Field(sequential=True, tokenize=word_cut)
nn_label = data.Field(sequential=False)
train_nn_datafield = [('text', nn_text),  ('label', nn_label)]
test_nn_datafield = [('text', nn_text),  ('label', nn_label)]
train_supervise = data.TabularDataset(path ='./train_auto.csv',  
                             format='csv',
                             skip_header = True,
                             fields = train_nn_datafield)
test_supervise = data.TabularDataset(path ='./test_supervise.csv', 
                       format='csv',
                       skip_header = True,
                       fields=test_nn_datafield)
nn_text.build_vocab(train_supervise)
nn_label.build_vocab(train_supervise)
nn_vocab = nn_text.vocab

# set iterator for batch optimization
train_iter = data.Iterator(
        train_supervise, 
        batch_size=64,
        device=torch.device('cuda'), 
        sort_within_batch=False,
        repeat=False)

test_iter = data.Iterator(test_supervise, batch_size=64, device=torch.device('cuda'), 
                     sort_within_batch=False, repeat=False)

In [105]:
# train CNN
# fine tuning CNN model
model = textCNN(nn_vocab, 200, 40, [1,2,3,4,5,6] , 2).to('cuda')
train_loss = []
train_acc = []
test_loss = []
test_acc = []
best_test_acc = -1

# Use GPU if it is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# fine tuning
for epoch in range(1, 30):
    #train loss
    tr_loss, tr_acc = train(model, device, train_iter, optimizer, epoch, 100)
    print('Train Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, tr_loss, tr_acc))
    
    ts_loss, ts_acc, stat = valid(model, device, test_iter)
    print('Valid Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, ts_loss, ts_acc))
    
    if ts_acc > best_test_acc:
        best_test_acc = ts_acc
        print(stat)
        #save paras(snapshot)
        print("model saves at {} accuracy".format(best_test_acc))
        torch.save(model.state_dict(), "textCNN_supervise_best")
        
    train_loss.append(tr_loss)
    train_acc.append(tr_acc)
    test_loss.append(ts_loss)
    test_acc.append(ts_acc)

Train Epoch: 1 	 Loss: 0.009201257225148208 	 Accuracy: 71.47295379638672%
Valid Epoch: 1 	 Loss: 0.009107313921054204 	 Accuracy: 69.83333587646484%
{'fake_recall': 0.6483141271873666, 'fake_precision': 0.949375, 'fake_f1': 0.7704793304590414, 'real_recall': 0.8767123287671232, 'real_precision': 0.4114285714285714, 'real_f1': 0.5600388915896937}
model saves at 69.83333587646484 accuracy
Train Epoch: 2 	 Loss: 0.0069722345948929065 	 Accuracy: 80.7274169921875%
Valid Epoch: 2 	 Loss: 0.00834387515981992 	 Accuracy: 74.03333282470703%
{'fake_recall': 0.6955693187232015, 'fake_precision': 0.9125, 'fake_f1': 0.7894025412273586, 'real_recall': 0.8446170921198668, 'real_precision': 0.5435714285714286, 'real_f1': 0.661451542807475}
model saves at 74.03333282470703 accuracy
Train Epoch: 3 	 Loss: 0.005671867929589955 	 Accuracy: 85.21874237060547%
Valid Epoch: 3 	 Loss: 0.008752356012662253 	 Accuracy: 73.5999984741211%
Train Epoch: 4 	 Loss: 0.004938977327628855 	 Accuracy: 87.08258056640625