In [1]:
import sys
from collections import defaultdict
from collections import Counter
#import ftfy
import re
import numpy as np


In [2]:
class Word:
    def __init__(self, word, upos, lemma=None, xpos=None, feats=None, misc=None):
        self.word = word
        self.norm = strong_normalize(word)
        self.lemma = lemma if lemma else "_"
        self.upos = upos
        self.xpos = xpos if xpos else "_"
        self.xupos = self.upos + "|" + self.xpos
        self.feats = feats if feats else "_"
        self.feats_set = "_" #parse_features(self.feats)
        self.misc = misc if misc else "_"
        #self.lang = lang if lang else "_"
    

class FakeNews:
    def __init__(self, id, source, label, cite, author, lang_code, title, content, date, references):
        self.id = id
        self.source = source
        self.label = label
        self.cite = cite
        self.author = author
        self.lang_code = lang_code
        self.title = title
        self.content = content
        self.date = date
        self.references = references
        
        self.title_tokenized = []
        self.content_tokenized = []
        
        
    def show(self):
        print("ID: ", self.id)
        print("Source: ", self.source)
        print("Label: ", self.label)
        print("Title: ", self.title)
        print("Content: ", self.content)
        
    def tokenization(self):
        tokenized = self.title.split(" ")
        self.title_tokenized = [Word(token, "Noun") for token in tokenized]
        tokenized = self.content.split(" ")
        self.content_tokenized = [Word(token, "Noun") for token in tokenized]
        #print(token_list)
        

In [3]:
def read_fakenews(file, limit=1000000):
    print("Read file: ", file)
    data_list = []
    
    with open(file=file, encoding="UTF-8") as file:
        idx = 0
        while True:
            idx+=1
            line = file.readline().split("\t")
            
            if line[0] is "":
                print("Finish to read: ", len(data_list), " sets")
                break
            else:
                data = FakeNews(line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7],line[8],line[9])
                data.tokenization()
                #data.tagging()
                #data.parsing()
                data_list.append(data)
                if idx > limit:
                    break
    return data_list


def build_voca(data_list, cutoff=1):
    print("build vocaburaries")
    
    wordsCount = Counter()
    charsCount = Counter()
    uposCount = Counter()
    labelCount = Counter()
    
    for data in data_list:
        wordsCount.update([token.norm for token in data.title_tokenized])
        wordsCount.update([token.norm for token in data.content_tokenized])
        for token in data.title_tokenized:
            charsCount.update(token.word)
        for token in data.content_tokenized:
            charsCount.update(token.word)
        
        uposCount.update([token.upos for token in data.title_tokenized])
        uposCount.update([token.upos for token in data.content_tokenized])
        
        labelCount.update([data.label])
    
    wordsCount = Counter({w: i for w, i in wordsCount.items() if i >= cutoff})
    print("Vocab containing {} words".format(len(wordsCount)))
    print("Charset containing {} chars".format(len(charsCount)))
    print("UPOS containing {} tags".format(len(uposCount)), uposCount)
    print("LABEL containing {} tags".format(len(labelCount)), labelCount)
 
    full_dictionary = {
        "vocab": list(wordsCount.keys()),
        "wordfreq": wordsCount,
        "charset": list(charsCount.keys()),
        "charfreq": charsCount,
        "upos": list(uposCount.keys()),
        "uposfreq": uposCount.keys(),
        "label": list(labelCount.keys())
    }
    
    return full_dictionary

    
def strong_normalize(word):
    w = word.lower()
    w = re.sub(r".+@.+", "*EMAIL*", w)
    w = re.sub(r"@\w+", "*AT*", w)
    w = re.sub(r"(https?://|www\.).*", "*url*", w)
    w = re.sub(r"([^\d])\1{2,}", r"\1\1", w)
    w = re.sub(r"([^\d][^\d])\1{2,}", r"\1\1", w)
    w = re.sub(r"``", '"', w)
    w = re.sub(r"''", '"', w)
    w = re.sub(r"\d", "0", w)
    return w

In [4]:
#data_list = read_fakenews("./corpus/train.txt")

In [5]:
#full_dic = build_voca(data_list)
#data_list[0].title_tokenized[0].upos

In [6]:
# Pytorch Models
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

class Encode_model(nn.Module):
    def __init__(self, 
                 num_word,
                 dim_word,# = 100,
                 num_extWord,
                 dim_extWord,# = 100,
                 num_pos,
                 dim_pos,# = 50,
                 num_label,
                 dropout_ratio,# = 0.33,
                 dim_lstm_hidden,# = 50,
                 num_lstm_layer,# = 50,
                 cuda_device# = 0
                ):
        super(Encode_model, self).__init__()
        
        self.dim_pos = dim_pos
        self.dim_lstm_hidden = dim_lstm_hidden
        self.num_lstm_layer = num_lstm_layer
        
        self.word_emb = nn.Embedding(num_embeddings=num_word + 3, embedding_dim=dim_word)
        self.pos_emb = nn.Embedding(num_embeddings=num_pos + 3, embedding_dim=dim_pos) if dim_pos > 0 else None
        
        self.input_size = dim_word + dim_pos
        self.enc_lstm = nn.LSTM(input_size=self.input_size, hidden_size=dim_lstm_hidden)
        self.enc_lstm_hidden = self.init_hidden()
        self.linear_classifier = nn.Linear(in_features=dim_lstm_hidden, out_features=num_label)
        
    def forward(self, word_seqs):
        
        word_seqs = torch.LongTensor(word_seqs)
        word_vecs = self.word_emb(word_seqs)
        if self.dim_pos > 0:
            pos_vecs = self.pos_emb(pos_seq)
            input_vecs = torch.cat(word_vecs, pos_vecs)
        else:
            input_vecs = word_vecs
            
        lstm_out, lstm_hidden = self.enc_lstm(word_vecs.view(len(word_vecs), 1, -1), self.enc_lstm_hidden)
        #print(lstm_out)
        #print(lstm_hidden)
        lin_out = self.linear_classifier(lstm_hidden[0])
        #lin_out2 = self.linear_classifier(lstm_out.view(len(word_seqs), -1))

        return lin_out.squeeze()
    
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(self.num_lstm_layer, 1, self.dim_lstm_hidden),
                torch.zeros(self.num_lstm_layer, 1, self.dim_lstm_hidden))

In [7]:
class Fakefinder:
    def __init__(self, train_file, test_file, n_read, cuda_device=-1):
        
        self.train_file = "./corpus/train.txt"
        self.test_file = "./corpus/dev.txt"
        self.train_data_list = read_fakenews(self.train_file, limit=n_read)
        self.test_data_list = read_fakenews(self.test_file, limit=n_read)
        self.full_dic = build_voca(self.train_data_list)
        self._set_vocab(self.full_dic)
        
        self.num_word = len(self._vocab)
        self.dim_word = 100
        self.num_extWord = len(self._vocab)
        self.dim_extWord = 100
        self.num_pos = len(self._upos)
        self.dim_pos = 0
        self.num_label = 3

        self.learning_rate = 0.01
        self.dropout_ratio = 0.33
        self.dim_lstm_hidden = 200
        self.num_lstm_layer = 1
        self.cuda_device = cuda_device
        
        self.enc_model = Encode_model(
                                    num_word = self.num_word,
                                    dim_word = self.dim_word,
                                    num_extWord = self.num_extWord,
                                    dim_extWord = self.dim_extWord,
                                    num_pos = self.num_pos ,
                                    dim_pos = self.dim_pos,
                                    num_label = self.num_label,
                                    dropout_ratio = self.dropout_ratio,
                                    dim_lstm_hidden = self.dim_lstm_hidden,
                                    num_lstm_layer = self.num_lstm_layer,
                                    cuda_device = self.cuda_device)
        
        self.optimizer = torch.optim.SGD(self.enc_model.parameters(), lr=self.learning_rate)
        self.criterion = nn.CrossEntropyLoss()
        
        if torch.cuda.is_available() and self.cuda_device >= 0:
            self.encode_model.cuda(self.cuda_device)
    
    def _set_vocab(self, vocab):
    
        self._fullvocab = vocab
        self._upos = {p: i for i, p in enumerate(vocab["upos"])}
        self._iupos = vocab["upos"]
        self._vocab = {w: i + 3 for i, w in enumerate(vocab["vocab"])}
        self._wordfreq = vocab["wordfreq"]
        self._charset = {c: i + 3 for i, c in enumerate(vocab["charset"])}
        self._charfreq = vocab["charfreq"]
        self._label = {w: i for i, w in enumerate(vocab["label"])}
        self._ilabel = vocab["label"]
        
        self._vocab['*pad*'] = 0
        self._charset['*pad*'] = 0
        
        self._vocab['*root*'] = 1
        self._charset['*root*'] = 1

    def train(self, batch_size=32):
        
        print("Running train sets")    
        self.enc_model.train()
        num_train = 0
        num_accurate = 0
        total_loss = 0
        
        for data in self.train_data_list:
            self.enc_model.zero_grad()
            num_train+=1
            
            words = [token.norm for token in data.content_tokenized]
            word_seqs = [self._vocab.get(token.norm, 0) for token in data.content_tokenized]
            label = [self._label.get(data.label)]
            #print(words[0])
            #print(word_seqs)
            #print(label_seq)
            logists = self.enc_model(word_seqs)
            #print(logists)
            loss = self.compute_loss(logists, label)
            accurate = self.compute_accuracy(logists, label) 
            num_accurate += accurate
            
            #print("accuracy", accuracy)
            #print("loss", loss.data[0])
            total_loss += loss.item()
            loss.backward()
            self.optimizer.step()
        print("Training loss: ", round(total_loss,2),  "(",num_train,"/",num_accurate.item(),")", "  accuracy: ", round((num_accurate.item()/num_train)*100,2) )

            
    def compute_loss(self, pred_logist, gold):
        
        #gold = gold[0]
        #gold = np.eye(3)[gold]
        gold = torch.LongTensor(gold)
        #print(gold)
        #print(pred_logist.squeeze())
        #print(F.softmax(pred_logist.squeeze()))
        pred = pred_logist.unsqueeze(dim=0)
        #print(pred)
        #loss = F.cross_entropy(pred_logist.squeeze(), F.softmax(gold))
        loss = self.criterion(pred, gold)
        return loss
    
    def compute_accuracy(self, pred_logist, gold):
        
        #value, predicted = torch.max(pred_logist.data, 1)
        #print(pred_logist.max())
        #print(pred_logist.max(0))
        pred = pred_logist.data.max(0)[1].cpu()
        #print("predicted " ,pred.data)
        #print("predicted2 " ,pred.data[0].cpu())
        #print("gold " ,gold[0])
        predicted = (pred == gold[0]) if gold[0] is not None else 0
        return predicted

    def test(self, test_file=None):
        print("Running test sets")
        self.enc_model.train()
        
        num_test = 0
        num_accurate = 0
        
        test_data_list = read_fakenews(test_file) if self.test_data_list is None else self.test_data_list
        #print(test_data_list[0].show())
        for data in test_data_list:
            num_test+=1
            
            words = [token.norm for token in data.content_tokenized]
            word_seqs = [self._vocab.get(token.norm, 0) for token in data.content_tokenized]
            label = [self._label.get(data.label)]
            #print(label)

            logists = self.enc_model(word_seqs)
            #print(logists)
            accurate = self.compute_accuracy(logists, label)
            #print(accurate)
            num_accurate += accurate

        print("Testing results: ", "(",num_test,"/",num_accurate.item(),")", "  accuracy: ", round((num_accurate.item()/num_test)*100,2) )

        
    def test_input(self, title, content):
        print("Running with an input text")
        self.enc_model.eval()
        
        num_test = 0
        num_accurate = 0
        #input_data = FakeNews(line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7],line[8],line[9])
        input_data = FakeNews("","","","","","","",content,title,"")
        input_data.tokenization()
        
        test_data_list = [input_data]
        print(test_data_list[0].show())
        for data in test_data_list:
            num_test+=1
            
            words = [token.norm for token in data.content_tokenized]
            word_seqs = [self._vocab.get(token.norm, 0) for token in data.content_tokenized]

            logists = self.enc_model(word_seqs)
            print("logists", logists)
            softLog = F.softmax(logists)
            print("예측값:", softLog)
            #print(softLog.cpu()[0])
            #print(softLog[1].item())
            for idx, label in enumerate(self._ilabel):
                print(label,":", round(softLog[idx].item(),3))

     
    

In [8]:
fakefinder = Fakefinder("./corpus/train.txt","./corpus/dev.txt", n_read=40)

Read file:  ./corpus/train.txt
Read file:  ./corpus/dev.txt
build vocaburaries
Vocab containing 9038 words
Charset containing 101 chars
UPOS containing 1 tags Counter({'Noun': 42187})
LABEL containing 2 tags Counter({'trusted': 24, 'fakeNews': 17})


In [9]:

while range(100):
    fakefinder.train()
    fakefinder.test()
    fakefinder.test_input("GSK makes Emma Walmsley most powerful woman in FTSE 100.",
                          "GlaxoSmithKline has appointed Emma Walmsley as chief executive, making her the most powerful woman in the industry and the UK’s FTSE 100 index. Britain’s biggest drugs company said Walmsley would take over from Sir Andrew Witty when he retires at the end of March after eight years at the helm. Walmsley joined GSK in 2010 and runs its £6bn consumer healthcare business, whose products include Sensodyne toothpaste, Horlicks malted drinks and Panadol painkillers. Walmsley will be one of just seven female chief executives in the FTSE 100. EasyJet, Imperial Brands, Whitbread, Royal Mail, Kingfisher and Severn Trent are also run by women. GSK, with a market value of £80bn, is more than twice as big as the next largest female-led company, the tobacco company Imperial Brands, which is valued at £37bn. Sir Philip Hampton, GSK’s chairman, is leading a government-commissioned review into increasing the number of women senior executives at Britain’s top companies. Part of the plan is to get more women into senior management positions who can go on to lead Britain’s biggest companies. Walmsley, 47, has been a member of GSK’s top executive team since 2011. Before joining GSK, she was at L’Oréal, the French cosmetics company, for 17 years and has worked in the UK, Europe, China and the US. Ketan Patel, a fund manager at EdenTree Investment Management, hailed the appointment as a “watershed moment” for the sector, with Walmsley becoming the first female CEO of a major global pharmaceuticals company. Patel said: “The sector has scored surprisingly poorly on diversity at the CEO level.”. The move came as a surprise to many investors, and as a disappointment to those who had hoped for the appointment of an outsider to push through a break-up of the company. GSK shares dipped 0.2% on the news but ended the day broadly flat. Simon Gergel, the UK equities chief investment officer at Allianz Global Investors, which has a 6% stake in GSK, said: “For those investors who have seen the company’s move towards a more diversified platform in a positive light, this continuation is encouraging. However, it may disappoint investors who have been calling for GSK to sharpen its focus and demerge its consumer health arm.”. Other challenges awaiting Walmsley include declining returns from research and development (R&D), increased competition from generic drugmakers and pricing pressure from healthcare providers, especially in the US. Hampton said: “Emma is an outstanding leader with highly valuable experience of building and running major global businesses and a strong track record of delivering growth and driving performance in healthcare.”. Panmure Gordon analyst David Cox said: “Being an internal appointment it’s less likely to bring any huge shake-ups, as external hires often like to put their stamp on things... She may face a steep learning curve on the pharma side since she has a consumer focused background, but will be surrounded by talented management. Fantastic to see a female at the top of such a significant UK company.”. GSK has been looking for a new chief executive since Whitty announced his departure in March. He had been under pressure to speed up turnaround plans for GSK following several years of weak financial performance and a damaging corruption scandal in China. The company has been hit by expiring patents on its top-selling Advair inhaler but reported strong second-quarter results in July. While the rest of the industry has focused on beefing up its drug portfolios, Witty has pursued a different strategy, reducing the company’s reliance on blockbuster drugs in favour of high-volume, lower-priced sales of toothpaste, painkillers and other consumer products, along with vaccines. GSK struck only one big deal during Witty’s tenure, the three-part, £15bn transaction with Swiss rival Novartis in 2014 to pool consumer healthcare assets and exchange cancer and vaccine businesses, which was seen as an innovative move in the industry. Walmsley has overseen the creation of the consumer joint venture with Novartis. She stressed that developing new drugs remained key to the business in an in-house video interview. “Obviously, R&D is the beating heart of our company and our success is and will continue to be defined most fundamentally by the strength of our pipeline... The number one priority, focus for me over coming months and years is going to be really making sure we are investing appropriately and strongly in our R&D organisation.”. GSK is reviewing its pay policy and declined to disclose details of how much Walmsley will be paid. Witty collected a pay and shares package of £6.7m last year. Citi analyst Andrew Baum said of Walmsley, a classics and modern languages graduate from Oxford: “A lack of an R&D background or a postgraduate science background does not preclude Ms Walmsley’s ability to materially improve R&D returns so long as she has the appetite and intent to critically assess GSK’s R&D outcomes and add senior external pharmaceutical and R&D hires.”. GSK is yet to appoint a successor for Moncef Slaoui, the chairman of its vaccines business, who will leave next June.")

Running train sets
Training loss:  44.58 ( 41 / 18 )   accuracy:  43.9
Running test sets
Testing results:  ( 41 / 18 )   accuracy:  43.9
Running with an input text
ID:  
Source:  
Label:  
Title:  
Content:  GlaxoSmithKline has appointed Emma Walmsley as chief executive, making her the most powerful woman in the industry and the UK’s FTSE 100 index. Britain’s biggest drugs company said Walmsley would take over from Sir Andrew Witty when he retires at the end of March after eight years at the helm. Walmsley joined GSK in 2010 and runs its £6bn consumer healthcare business, whose products include Sensodyne toothpaste, Horlicks malted drinks and Panadol painkillers. Walmsley will be one of just seven female chief executives in the FTSE 100. EasyJet, Imperial Brands, Whitbread, Royal Mail, Kingfisher and Severn Trent are also run by women. GSK, with a market value of £80bn, is more than twice as big as the next largest female-led company, the tobacco company Imperial Brands, which is value



Running train sets
Training loss:  40.03 ( 41 / 25 )   accuracy:  60.98
Running test sets
Testing results:  ( 41 / 19 )   accuracy:  46.34
Running with an input text
ID:  
Source:  
Label:  
Title:  
Content:  GlaxoSmithKline has appointed Emma Walmsley as chief executive, making her the most powerful woman in the industry and the UK’s FTSE 100 index. Britain’s biggest drugs company said Walmsley would take over from Sir Andrew Witty when he retires at the end of March after eight years at the helm. Walmsley joined GSK in 2010 and runs its £6bn consumer healthcare business, whose products include Sensodyne toothpaste, Horlicks malted drinks and Panadol painkillers. Walmsley will be one of just seven female chief executives in the FTSE 100. EasyJet, Imperial Brands, Whitbread, Royal Mail, Kingfisher and Severn Trent are also run by women. GSK, with a market value of £80bn, is more than twice as big as the next largest female-led company, the tobacco company Imperial Brands, which is val

Training loss:  33.65 ( 41 / 29 )   accuracy:  70.73
Running test sets
Testing results:  ( 41 / 21 )   accuracy:  51.22
Running with an input text
ID:  
Source:  
Label:  
Title:  
Content:  GlaxoSmithKline has appointed Emma Walmsley as chief executive, making her the most powerful woman in the industry and the UK’s FTSE 100 index. Britain’s biggest drugs company said Walmsley would take over from Sir Andrew Witty when he retires at the end of March after eight years at the helm. Walmsley joined GSK in 2010 and runs its £6bn consumer healthcare business, whose products include Sensodyne toothpaste, Horlicks malted drinks and Panadol painkillers. Walmsley will be one of just seven female chief executives in the FTSE 100. EasyJet, Imperial Brands, Whitbread, Royal Mail, Kingfisher and Severn Trent are also run by women. GSK, with a market value of £80bn, is more than twice as big as the next largest female-led company, the tobacco company Imperial Brands, which is valued at £37bn. Sir P

KeyboardInterrupt: 