In [1]:
import sys
from collections import defaultdict
from collections import Counter
#import ftfy
import re
import numpy as np


In [2]:
class Word:
    """A single token plus its optional annotations.

    Optional annotations (lemma, xpos, feats, misc) that are missing or
    otherwise falsy are stored as the placeholder string "_".
    """

    def __init__(self, word, upos, lemma=None, xpos=None, feats=None, misc=None):
        placeholder = "_"

        # Surface form and its normalized variant (see strong_normalize).
        self.word = word
        self.norm = strong_normalize(word)

        self.lemma = lemma or placeholder
        self.upos = upos
        self.xpos = xpos or placeholder
        # Combined tag of the form "<upos>|<xpos>".
        self.xupos = "|".join((self.upos, self.xpos))
        self.feats = feats or placeholder
        # Features are not parsed; a fixed placeholder is stored instead.
        self.feats_set = placeholder
        self.misc = misc or placeholder
    

class FakeNews:
    """One corpus record together with its space-tokenized title and content."""

    def __init__(self, id, source, label, cite, author, lang_code, title, content, date, references):
        # Raw fields, stored exactly as they were passed in.
        self.id, self.source, self.label = id, source, label
        self.cite, self.author, self.lang_code = cite, author, lang_code
        self.title, self.content = title, content
        self.date, self.references = date, references

        # Populated by tokenization(); empty until it is called.
        self.title_tokenized = []
        self.content_tokenized = []

    def show(self):
        """Print the id, source, label, title and content, one per line."""
        rows = (
            ("ID: ", self.id),
            ("Source: ", self.source),
            ("Label: ", self.label),
            ("Title: ", self.title),
            ("Content: ", self.content),
        )
        for caption, value in rows:
            print(caption, value)

    def tokenization(self):
        """Split title and content on single spaces and wrap each piece in a Word.

        Every token receives the fixed tag "Noun".
        """
        def as_words(text):
            return [Word(piece, "Noun") for piece in text.split(" ")]

        self.title_tokenized = as_words(self.title)
        self.content_tokenized = as_words(self.content)
        

In [3]:
# Pytorch Models
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

class Encode_model(nn.Module):
    """LSTM encoder over one token sequence followed by a linear classifier.

    Word ids (and, when ``dim_pos > 0``, POS ids) are embedded, concatenated,
    run through an LSTM with batch size 1, and the final hidden state of the
    last LSTM layer is projected to ``num_label`` logits.

    Args:
        num_word: vocabulary size; the embedding table has ``num_word + 3`` rows.
        dim_word: word embedding size.
        num_extWord, dim_extWord: accepted for interface compatibility; unused.
        num_pos: number of POS tags; the table has ``num_pos + 3`` rows.
        dim_pos: POS embedding size; ``0`` disables POS embeddings.
        num_label: number of output logits.
        dropout_ratio: accepted for interface compatibility; unused.
        dim_lstm_hidden: LSTM hidden size.
        num_lstm_layer: number of stacked LSTM layers.
        cuda_device: accepted for interface compatibility; unused here.
    """

    def __init__(self,
                 num_word,
                 dim_word,
                 num_extWord,
                 dim_extWord,
                 num_pos,
                 dim_pos,
                 num_label,
                 dropout_ratio,
                 dim_lstm_hidden,
                 num_lstm_layer,
                 cuda_device
                ):
        super(Encode_model, self).__init__()

        self.dim_pos = dim_pos
        self.dim_lstm_hidden = dim_lstm_hidden
        self.num_lstm_layer = num_lstm_layer

        self.word_emb = nn.Embedding(num_embeddings=num_word + 3, embedding_dim=dim_word)
        self.pos_emb = nn.Embedding(num_embeddings=num_pos + 3, embedding_dim=dim_pos) if dim_pos > 0 else None

        self.input_size = dim_word + dim_pos
        # num_layers must agree with init_hidden(), which sizes the initial
        # state by num_lstm_layer.
        self.enc_lstm = nn.LSTM(input_size=self.input_size,
                                hidden_size=dim_lstm_hidden,
                                num_layers=num_lstm_layer)
        self.enc_lstm_hidden = self.init_hidden()
        self.linear_classifier = nn.Linear(in_features=dim_lstm_hidden, out_features=num_label)

    def forward(self, word_seqs, pos_seqs=None):
        """Return the logits for one sequence.

        Args:
            word_seqs: 1-D sequence (list or tensor) of word ids.
            pos_seqs: 1-D sequence of POS ids, same length as ``word_seqs``.
                Required when the model was built with ``dim_pos > 0``.

        Returns:
            Tensor of ``num_label`` logits (squeezed).

        Raises:
            ValueError: if ``word_seqs`` is empty, or ``pos_seqs`` is missing
                while POS embeddings are enabled.
        """
        # Build the inputs on the same device as the parameters.
        device = self.word_emb.weight.device
        word_seqs = torch.as_tensor(word_seqs, dtype=torch.long, device=device)
        if word_seqs.numel() == 0:
            raise ValueError("word_seqs must contain at least one token id")
        word_vecs = self.word_emb(word_seqs)

        if self.dim_pos > 0:
            if pos_seqs is None:
                raise ValueError("pos_seqs is required when dim_pos > 0")
            pos_seqs = torch.as_tensor(pos_seqs, dtype=torch.long, device=device)
            pos_vecs = self.pos_emb(pos_seqs)
            # Concatenate along the feature axis: (seq, dim_word + dim_pos).
            input_vecs = torch.cat((word_vecs, pos_vecs), dim=-1)
        else:
            input_vecs = word_vecs

        # The stored initial state is a plain attribute, so it does not follow
        # the module when it is moved to another device.
        hidden = tuple(state.to(device) for state in self.enc_lstm_hidden)
        # Reshape to (seq_len, batch=1, features) as expected by nn.LSTM.
        _, (h_n, _) = self.enc_lstm(input_vecs.view(len(input_vecs), 1, -1), hidden)
        # h_n is (num_layers, 1, hidden); classify from the top layer only.
        lin_out = self.linear_classifier(h_n[-1])

        return lin_out.squeeze()

    def init_hidden(self):
        """Return the all-zero initial (h_0, c_0) pair.

        Each tensor has shape (num_layers, minibatch_size=1, hidden_dim).
        """
        return (torch.zeros(self.num_lstm_layer, 1, self.dim_lstm_hidden),
                torch.zeros(self.num_lstm_layer, 1, self.dim_lstm_hidden))


In [4]:
def read_fakenews(file, max_records=11):
    """Read a tab-separated corpus file into a list of tokenized FakeNews records.

    Each non-blank line must hold at least 10 tab-separated fields, which are
    passed positionally to ``FakeNews``; extra fields are ignored. The line
    terminator is stripped so it does not end up in the last field.

    Args:
        file: path of the corpus file (read as UTF-8).
        max_records: stop after this many records. The default of 11 keeps the
            previous hard-coded limit; pass ``None`` to read the whole file.

    Returns:
        List of ``FakeNews`` objects on which ``tokenization()`` was called.

    Raises:
        ValueError: if ``max_records`` is smaller than 1, or a non-blank line
            has fewer than 10 fields.
    """
    if max_records is not None and max_records < 1:
        raise ValueError("max_records must be a positive integer or None")

    print("Read file: ", file)
    data_list = []

    # Use a distinct name for the handle so the path in `file` is not shadowed.
    with open(file=file, encoding="UTF-8") as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            if not raw_line.strip():
                # Blank lines carry no record.
                continue

            fields = raw_line.rstrip("\r\n").split("\t")
            if len(fields) < 10:
                raise ValueError(
                    "{}:{}: expected 10 tab-separated fields, got {}".format(
                        file, line_no, len(fields)))

            data = FakeNews(*fields[:10])
            data.tokenization()
            data_list.append(data)

            if max_records is not None and len(data_list) >= max_records:
                break
        else:
            # Reported only when the end of the file was reached, i.e. the
            # record limit did not cut the read short.
            print("Finish to read: ", len(data_list), " sets")
    return data_list


def build_voca(data_list, cutoff=1):
    """Collect word, character, POS and label inventories from tokenized records.

    Args:
        data_list: iterable of records exposing ``title_tokenized``,
            ``content_tokenized`` (lists of tokens with ``norm``, ``word`` and
            ``upos`` attributes) and ``label``.
        cutoff: minimum frequency a normalized word needs to stay in the
            vocabulary.

    Returns:
        Dict with the keys ``vocab``, ``wordfreq``, ``charset``, ``charfreq``,
        ``upos``, ``uposfreq`` and ``label``. The ``*freq`` entries are
        ``Counter`` objects; the others are lists in first-seen order.
    """
    print("build vocaburaries")

    wordsCount = Counter()
    charsCount = Counter()
    uposCount = Counter()
    labelCount = Counter()

    for data in data_list:
        # Title tokens first, then content tokens, so first-seen order is stable.
        tokens = list(data.title_tokenized) + list(data.content_tokenized)

        wordsCount.update(token.norm for token in tokens)
        for token in tokens:
            # Updating with a string counts its individual characters.
            charsCount.update(token.word)
        uposCount.update(token.upos for token in tokens)

        labelCount.update([data.label])

    # Drop words that occur fewer than `cutoff` times.
    wordsCount = Counter({w: i for w, i in wordsCount.items() if i >= cutoff})
    print("Vocab containing {} words".format(len(wordsCount)))
    print("Charset containing {} chars".format(len(charsCount)))
    print("UPOS containing {} tags".format(len(uposCount)), uposCount)
    print("LABEL containing {} tags".format(len(labelCount)), labelCount)

    full_dictionary = {
        "vocab": list(wordsCount.keys()),
        "wordfreq": wordsCount,
        "charset": list(charsCount.keys()),
        "charfreq": charsCount,
        "upos": list(uposCount.keys()),
        # Store the Counter itself, like wordfreq/charfreq, not just its keys.
        "uposfreq": uposCount,
        "label": list(labelCount.keys())
    }

    return full_dictionary

    
def strong_normalize(word):
    """Lower-case a token and apply a fixed sequence of regex rewrites.

    The rules run in the listed order, each on the result of the previous one.
    """
    rewrite_rules = (
        (r".+@.+", "*EMAIL*"),                  # text with '@' between other characters
        (r"@\w+", "*AT*"),                      # '@' followed by word characters
        (r"(https?://|www\.).*", "*url*"),      # from a URL prefix to the end
        (r"([^\d])\1{2,}", r"\1\1"),            # 3+ repeats of a non-digit -> 2
        (r"([^\d][^\d])\1{2,}", r"\1\1"),       # 3+ repeats of a non-digit pair -> 2
        (r"``", '"'),                           # opening double backtick -> quote
        (r"''", '"'),                           # doubled apostrophe -> quote
        (r"\d", "0"),                           # every digit -> 0
    )

    normalized = word.lower()
    for pattern, replacement in rewrite_rules:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

In [5]:
# Load the training corpus into a list of tokenized FakeNews records.
data_list = read_fakenews("./corpus/train.txt")

Read file:  ./corpus/train.txt


In [6]:
# Build the word/char/POS/label inventories from the loaded records.
full_dic = build_voca(data_list)
# Display the POS tag of the first title token of the first record.
data_list[0].title_tokenized[0].upos

build vocaburaries
Vocab containing 3088 words
Charset containing 90 chars
UPOS containing 1 tags Counter({'Noun': 10387})
LABEL containing 2 tags Counter({'trusted': 6, 'fakeNews': 5})


'Noun'

In [7]:
class Fakefinder:
    """Trains and evaluates an ``Encode_model`` classifier on the news corpus.

    The constructor reads the training and test files, builds the vocabulary
    from the training records, and creates the model, an SGD optimizer and a
    cross-entropy criterion.

    Args:
        train_file: path of the training corpus. A falsy value falls back to
            "./corpus/train.txt".
        test_file: path of the evaluation corpus. A falsy value falls back to
            "./corpus/dev.txt".
        cuda_device: CUDA device index; a negative value keeps the model on
            the CPU.
    """

    def __init__(self, train_file, test_file, cuda_device=-1):

        # Honour the caller's paths; the former hard-coded paths remain as
        # fallbacks for callers that pass nothing useful.
        self.train_file = train_file or "./corpus/train.txt"
        self.test_file = test_file or "./corpus/dev.txt"
        self.train_data_list = read_fakenews(self.train_file)
        self.test_data_list = read_fakenews(self.test_file)
        self.full_dic = build_voca(self.train_data_list)
        self._set_vocab(self.full_dic)

        self.num_word = len(self._vocab)
        self.dim_word = 100
        self.num_extWord = len(self._vocab)
        self.dim_extWord = 100
        self.num_pos = len(self._upos)
        self.dim_pos = 0
        # At least 3 outputs as before, but never fewer than the number of
        # labels seen in training (the loss would reject out-of-range targets).
        self.num_label = max(3, len(self._label))

        self.learning_rate = 0.01
        self.dropout_ratio = 0.33
        self.dim_lstm_hidden = 200
        self.num_lstm_layer = 1
        self.cuda_device = cuda_device

        self.enc_model = Encode_model(
                                    num_word = self.num_word,
                                    dim_word = self.dim_word,
                                    num_extWord = self.num_extWord,
                                    dim_extWord = self.dim_extWord,
                                    num_pos = self.num_pos ,
                                    dim_pos = self.dim_pos,
                                    num_label = self.num_label,
                                    dropout_ratio = self.dropout_ratio,
                                    dim_lstm_hidden = self.dim_lstm_hidden,
                                    num_lstm_layer = self.num_lstm_layer,
                                    cuda_device = self.cuda_device)

        self.optimizer = torch.optim.SGD(self.enc_model.parameters(), lr=self.learning_rate)
        self.criterion = nn.CrossEntropyLoss()

        if torch.cuda.is_available() and self.cuda_device >= 0:
            # The model attribute is `enc_model`.
            self.enc_model.cuda(self.cuda_device)

    def _set_vocab(self, vocab):
        """Derive the id lookup tables from the dictionary built by build_voca.

        Word and character ids start at 3; id 0 is reserved for '*pad*' and
        id 1 for '*root*'. POS tags and labels are numbered from 0.
        """
        self._fullvocab = vocab
        self._upos = {p: i for i, p in enumerate(vocab["upos"])}
        self._iupos = vocab["upos"]
        self._vocab = {w: i + 3 for i, w in enumerate(vocab["vocab"])}
        self._wordfreq = vocab["wordfreq"]
        self._charset = {c: i + 3 for i, c in enumerate(vocab["charset"])}
        self._charfreq = vocab["charfreq"]
        self._label = {w: i for i, w in enumerate(vocab["label"])}

        self._vocab['*pad*'] = 0
        self._charset['*pad*'] = 0

        self._vocab['*root*'] = 1
        self._charset['*root*'] = 1

    def _encode(self, data):
        """Map one record to (word id list, single-element gold label list)."""
        # Words missing from the vocabulary map to id 0 ('*pad*').
        word_seqs = [self._vocab.get(token.norm, 0) for token in data.content_tokenized]
        # Labels unseen in training map to None.
        label = [self._label.get(data.label)]
        return word_seqs, label

    def train(self, batch_size=32):
        """Run one pass over the training records, updating after each record.

        ``batch_size`` is accepted but not used: each record is its own step.
        Prints the summed loss, the record/correct counts and the accuracy.
        """
        print("Start running train sets")
        self.enc_model.train()
        num_train = 0
        num_accurate = 0
        total_loss = 0

        for data in self.train_data_list:
            self.enc_model.zero_grad()
            num_train += 1

            word_seqs, label = self._encode(data)
            logists = self.enc_model(word_seqs)
            loss = self.compute_loss(logists, label)
            # Keep the running count a plain int whatever compute_accuracy returns.
            num_accurate += int(self.compute_accuracy(logists, label))

            total_loss += loss.item()
            loss.backward()
            self.optimizer.step()

        accuracy = round((num_accurate / num_train) * 100, 2) if num_train else 0.0
        print("Training loss: ", round(total_loss,2),  "(",num_train,"/",num_accurate,")", "  accuracy: ", accuracy )

    def compute_loss(self, pred_logist, gold):
        """Return the cross-entropy loss of one logit vector against ``gold``.

        Args:
            pred_logist: 1-D tensor of logits.
            gold: single-element list holding the gold label id.
        """
        # Keep the target on the same device as the prediction.
        gold = torch.LongTensor(gold).to(pred_logist.device)
        # The criterion expects a batch axis: (1, num_label).
        pred = pred_logist.unsqueeze(dim=0)
        loss = self.criterion(pred, gold)
        return loss

    def compute_accuracy(self, pred_logist, gold):
        """Return whether the arg-max of ``pred_logist`` equals ``gold[0]``.

        Returns 0 when the gold label is None, otherwise the result of
        comparing the predicted index tensor with the gold id.
        """
        pred = pred_logist.data.max(0)[1].cpu()
        predicted = (pred == gold[0]) if gold[0] is not None else 0
        return predicted

    def test(self, test_file=None):
        """Evaluate the model and print the record/correct counts and accuracy.

        Args:
            test_file: optional corpus path to evaluate on; when omitted, the
                test records loaded by the constructor are used.
        """
        print("Start running test sets")
        # Evaluation mode; train() switches back at the start of each pass.
        self.enc_model.eval()

        num_test = 0
        num_accurate = 0

        test_data_list = read_fakenews(test_file) if test_file is not None else self.test_data_list

        # No gradients are needed while evaluating.
        with torch.no_grad():
            for data in test_data_list:
                num_test += 1

                word_seqs, label = self._encode(data)
                logists = self.enc_model(word_seqs)
                num_accurate += int(self.compute_accuracy(logists, label))

        accuracy = round((num_accurate / num_test) * 100, 2) if num_test else 0.0
        print("Testing results: ", "(",num_test,"/",num_accurate,")", "  accuracy: ", accuracy )

        

    

In [8]:
# Create the classifier wrapper, passing the train and dev corpus paths.
fakefinder = Fakefinder("./corpus/train.txt","./corpus/dev.txt")

Read file:  ./corpus/train.txt
Read file:  ./corpus/dev.txt
build vocaburaries
Vocab containing 3088 words
Charset containing 90 chars
UPOS containing 1 tags Counter({'Noun': 10387})
LABEL containing 2 tags Counter({'trusted': 6, 'fakeNews': 5})


In [9]:

# Train for 10 epochs, evaluating after each one. A `for` loop is required:
# `while range(10)` tests a non-empty range, which is always truthy, so it
# would never terminate.
for epoch in range(10):
    fakefinder.train()
    fakefinder.test()

Start running train sets
Training loss:  11.84 ( 11 / 5 )   accuracy:  45.45
Start running test sets
Testing results:  ( 11 / 7 )   accuracy:  63.64
Start running train sets
Training loss:  11.27 ( 11 / 8 )   accuracy:  72.73
Start running test sets
Testing results:  ( 11 / 7 )   accuracy:  63.64
Start running train sets
Training loss:  10.74 ( 11 / 9 )   accuracy:  81.82
Start running test sets
Testing results:  ( 11 / 5 )   accuracy:  45.45
Start running train sets
Training loss:  10.23 ( 11 / 11 )   accuracy:  100.0
Start running test sets
Testing results:  ( 11 / 4 )   accuracy:  36.36
Start running train sets
Training loss:  9.75 ( 11 / 11 )   accuracy:  100.0
Start running test sets
Testing results:  ( 11 / 4 )   accuracy:  36.36
Start running train sets
Training loss:  9.28 ( 11 / 11 )   accuracy:  100.0
Start running test sets
Testing results:  ( 11 / 4 )   accuracy:  36.36
Start running train sets


KeyboardInterrupt: 