# Rozwiązanie konkursu w ramach kursu  **[Przetwarzanie Języka Naturalnego](https://dataworkshop.eu/pl/nlp)** organizowanego przez DataWorkshop

#### [Konkurs](https://www.kaggle.com/c/fake-news-detect)
+ private leaderboard: 0.76990 (trzecie miejsce)
+ public leaderboard: 0.75638 (piąte miejsce)

Instalacja niezbędnych bibliotek:

In [None]:
!pip install flair
!pip install keras_bert
!pip install transformers
!pip install texthero
!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz#egg=en_core_web_lg==2.2.5

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, train_test_split, RepeatedStratifiedKFold, StratifiedKFold, KFold
from sklearn.metrics import f1_score, auc, roc_auc_score

from flair.data import Sentence, Dictionary
from flair.models import SequenceTagger, TextClassifier

from flair.datasets import ColumnCorpus, ClassificationCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings, DocumentRNNEmbeddings, FlairEmbeddings, TransformerDocumentEmbeddings, DocumentPoolEmbeddings
from flair.trainers import ModelTrainer
from torch.optim.adam import Adam
from flair.visual.training_curves import Plotter

from pathlib import Path

import spacy
import en_core_web_lg
import texthero as hero

import re
from collections import Counter

from segtok.segmenter import split_single

import xgboost as xgb

from tqdm.notebook import tqdm

from keras_bert import load_trained_model_from_checkpoint
from transformers import BertTokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Project root; the input/ and output/ paths used throughout are built from it.
# NOTE(review): looks like a Google Drive mount inside Colab - confirm before running elsewhere.
project_path = '/content/drive/My Drive/Colab Notebooks/DW_fake_news_detection'

In [7]:
# Read the train and test splits, then stack them (train rows first) so that
# every feature below is computed once for both splits.
train_fake, test_fake = (
    pd.read_csv(f'{project_path}/input/{split}_fake.csv')
    for split in ('train', 'test')
)

fake_df = pd.concat([train_fake, test_fake])
fake_df.sample(4)

Unnamed: 0.1,Unnamed: 0,id,title,text,is_fake
4057,4057,8009,"baseball,recreation,sports",Thirty U.S. cities have Triple A baseball team...,
609,609,1224,Kurdish-Led Forces Eliminate 12 Daesh Militant...,09.04.2017Get short URL 0 85 12 Daesh terroris...,1.0
3179,3179,6291,crime,Georgia has the most restrictive ballot access...,
3562,3562,7064,BREAKING: Hilary Clinton Filed For Divorce In ...,,


Liczę ilość wystąpień wielkich liter:

In [9]:
def count_uppercase_letters(text):
    """Return the number of uppercase characters in ``text``.

    Args:
        text: An iterable of characters (normally a ``str``).

    Returns:
        A plain ``int``. The built-in ``sum`` is used instead of ``np.sum``
        because ``np.sum`` over an empty list yields the float ``0.0``, which
        made the return type depend on the input.
    """
    return sum(1 for char in text if char.isupper())

def starts_with_lower(text):
    """Return 1 if ``text`` is a string whose first character is in a-z, else 0.

    Non-string values (for example NaN from pandas) yield 0.
    """
    if not isinstance(text, str):
        return 0
    return 0 if re.match(r'^[a-z]', text) is None else 1

# Uppercase-letter counts for the title and the body; non-string cells count as 0.
for _col in ('title', 'text'):
    fake_df[f'{_col}_upper_cnt'] = fake_df[_col].map(
        lambda value: count_uppercase_letters(value) if isinstance(value, str) else 0
    )

In [None]:
def _string_length(value):
    """Return the character length of ``value``; non-string cells count as 0."""
    return len(value) if isinstance(value, str) else 0


fake_df['title_len'] = fake_df['title'].map(_string_length)
fake_df['text_len'] = fake_df['text'].map(_string_length)
# 1 when the title starts with a-z, otherwise 0.
fake_df['title_starts_with_lower'] = fake_df['title'].map(starts_with_lower)

Liczę ilość słów:

In [16]:
def _title_word_count(title, starts_lower):
    """Count the items of a title; missing (non-string) titles count as 0.

    Titles flagged as starting with a lowercase letter are split on commas,
    all other titles on single spaces. The flag comes from
    ``starts_with_lower`` and is therefore always 0 or 1.
    """
    if not isinstance(title, str):
        return 0
    separator = ',' if starts_lower == 1 else ' '
    return len(title.split(separator))


word_counts_title = [
    _title_word_count(title, flag)
    for title, flag in zip(fake_df['title'], fake_df['title_starts_with_lower'])
]
fake_df['title_wrd_cnt'] = word_counts_title

# Body word count: number of pieces after splitting on single spaces.
word_counts_text = [
    len(text.split(' ')) if isinstance(text, str) else 0
    for text in fake_df['text']
]
fake_df['text_wrd_cnt'] = word_counts_text

#### Buduję statystyki występowania `ner` w dokumentach

Wykorzystujemy model `SequenceTagger` z biblioteki `flair` do określenia, które z wyrażeń (właściwie *tokenów*) stanowią nazwane jednostki (`ner`):

In [None]:
# Load flair's 'ner' sequence tagger once; get_flair_ner and
# get_flair_ner_ents read it as a module-level global.
tagger = SequenceTagger.load('ner')

def get_flair_ner(text):
    """Run the global ``tagger`` on ``text`` and return flair's tagged-string form."""
    tagged = Sentence(text)
    tagger.predict(tagged)
    return tagged.to_tagged_string()

def get_flair_ner_ents(text):
    """Return the list of 'ner' spans the global ``tagger`` finds in ``text``."""
    sentence = Sentence(text, use_tokenizer=True)
    tagger.predict(sentence)
    return list(sentence.get_spans('ner'))

2020-11-13 20:31:48,301 loading file /root/.flair/models/en-ner-conll03-v0.4.pt


In [None]:
%time fake_df['title_flair_ner_ents'] = fake_df['title'].apply(lambda x: f'{get_flair_ner_ents(x)}' if isinstance(x, str) else "")

CPU times: user 2min 46s, sys: 1min 28s, total: 4min 15s
Wall time: 4min 15s


In [None]:
%time fake_df['text_flair_ner_ents'] = fake_df['text'].apply(lambda x: f'{get_flair_ner_ents(x)}' if isinstance(x, str) else "")

CPU times: user 12min 26s, sys: 8min 46s, total: 21min 13s
Wall time: 21min 13s


Tworzymy listę `ner` (*LOC*, *PER*, *ORG*, etc.) dla każdej wiadomości (zarówno dla tytułów, jak i treści):

In [None]:
def get_ner_list(text):
    """Extract entity tags from the stringified span list produced earlier.

    Square brackets are stripped, the string is split on ``'>, <'`` and, for
    each piece, the text preceding ``-span`` is kept (e.g. ``'LOC'``). A piece
    without ``-span`` is returned stripped but otherwise unchanged, so an
    empty input yields ``['']``.
    """
    without_brackets = re.sub(r'[\[\]]', "", text)
    return [
        re.sub(r'(<*)(.*)(-span)(.*>*)', r'\2', str(piece)).strip()
        for piece in without_brackets.split(r'>, <')
    ]

# Turn the stringified span lists into lists of entity tags (body first, then
# title); non-string cells are passed through untouched.
for _part in ('text', 'title'):
    fake_df[f'{_part}_flair_ners_list'] = fake_df[f'{_part}_flair_ner_ents'].apply(
        lambda cell: get_ner_list(cell) if isinstance(cell, str) else cell
    )

Podsumujmy ile mamy `ner`'ów każdego typu we wszystkich dokumentach (tytuły i treść):

In [None]:
# Corpus-wide tally of entity tags, separately for bodies and titles.
# Cells that are not lists are skipped.
cnt_text_ners = Counter(
    tag
    for tags in fake_df['text_flair_ners_list']
    if isinstance(tags, list)
    for tag in tags
)

cnt_title_ners = Counter(
    tag
    for tags in fake_df['title_flair_ners_list']
    if isinstance(tags, list)
    for tag in tags
)

cnt_text_ners, cnt_title_ners

(Counter({'': 3785, 'LOC': 13828, 'MISC': 7471, 'ORG': 7761, 'PER': 5116}),
 Counter({'': 7338, 'LOC': 1348, 'MISC': 936, 'ORG': 1085, 'PER': 837}))

In [19]:
def count_ners(ner_list, ner_type='LOC'):
    """Return how many entries of ``ner_list`` equal ``str(ner_type)``.

    Anything that is not a ``list`` counts as zero occurrences.
    """
    if not isinstance(ner_list, list):
        return 0
    wanted = str(ner_type)
    return sum(1 for tag in ner_list if tag == wanted)

In [None]:
# Replace missing (non-string) titles and bodies with a placeholder token.
fake_df['title'] = fake_df['title'].map(lambda x: x if isinstance(x, str) else "<UNK>")
fake_df['text'] = fake_df['text'].map(lambda x: x if isinstance(x, str) else "<UNK>")

# One count column per entity tag. The tags are the keys of the Counters
# `cnt_title_ners` / `cnt_text_ners`; the names `title_ners` / `text_ners`
# used here before are never defined and raised NameError.
# The empty tag (documents without entities) is skipped.
for ner in cnt_title_ners:
    if ner:
        fake_df[f'flair_title_{ner}'] = fake_df['title_flair_ners_list'].map(
            lambda tags, tag=ner: count_ners(tags, tag)
        )

for ner in cnt_text_ners:
    if ner:
        fake_df[f'flair_text_{ner}'] = fake_df['text_flair_ners_list'].map(
            lambda tags, tag=ner: count_ners(tags, tag)
        )



Liczymy częstość występowania `ner`'ów:

In [None]:
# Entity frequency = entity count / word count, for bodies first and then titles.
# Rows whose word count is 0 are not special-cased.
for _part in ('text', 'title'):
    for _tag in ('LOC', 'PER', 'ORG', 'MISC'):
        fake_df[f'flair_{_part}_{_tag}_freq'] = (
            fake_df[f'flair_{_part}_{_tag}'] / fake_df[f'{_part}_wrd_cnt']
        )

Wykorzystując bibliotekę `spaCy` wyszukuję tagi POS (part of speech):

In [None]:
# spaCy pipeline shared by get_pos and the token-count features below.
nlp = en_core_web_lg.load()

def get_pos(text):
    """Return the ``pos_`` tag of every token of ``text`` using the global ``nlp``.

    Non-string input yields an empty string rather than a list.
    """
    if not isinstance(text, str):
        return ""
    return [token.pos_ for token in nlp(text)]

# POS tag sequence of every title and body.
fake_df['title_pos'] = fake_df['title'].apply(get_pos)
fake_df['text_pos'] = fake_df['text'].apply(get_pos)

# Distinct POS tags seen in titles and in bodies, respectively.
pos_cntr_tit = {tag for tags in fake_df['title_pos'] for tag in tags}
pos_cntr_txt = {tag for tags in fake_df['text_pos'] for tag in tags}

Obliczam częstotliwość wystąpień poszczególnych *POS* w dokumentach:

In [None]:
def count_pos(text_pos, pos_tag=None):
    """Return how many entries of ``text_pos`` equal ``pos_tag``.

    Args:
        text_pos: List of POS tags for one document; any non-list value
            counts as zero occurrences.
        pos_tag: Tag to count. When omitted, the module-level loop variable
            ``pos`` is used, which is how this function was called before the
            tag became an explicit argument.

    Returns:
        The number of matching tags as an ``int``.
    """
    if pos_tag is None:
        pos_tag = pos  # legacy fallback to the global loop variable
    if not isinstance(text_pos, list):
        return 0
    return sum(1 for elem in text_pos if elem == pos_tag)


# Token counts: the stored POS lists hold one tag per spaCy token, so their
# length equals len(nlp(x)) without running the pipeline over the corpus again.
fake_df['cnt_token_txt'] = fake_df['text_pos'].map(len)
fake_df['cnt_token_tit'] = fake_df['title_pos'].map(len)

# Per-tag count and per-token frequency for titles ...
for pos in pos_cntr_tit:
    fake_df[f'{pos}_tit'] = fake_df.title_pos.apply(count_pos, pos_tag=pos)
    fake_df[f'{pos}_tit_freq'] = fake_df[f'{pos}_tit'] / fake_df['cnt_token_tit']

# ... and for bodies.
for pos in pos_cntr_txt:
    fake_df[f'{pos}_txt'] = fake_df.text_pos.apply(count_pos, pos_tag=pos)
    fake_df[f'{pos}_txt_freq'] = fake_df[f'{pos}_txt'] / fake_df['cnt_token_txt']

Obliczam reprezentację wektorową tytułów i treści wykorzystując model BERT (dzięki uprzejmości **Piotra**, który udostępnił swój [kod](https://practicalmlcourse.slack.com/files/UDJGW2XMY/F01DT5M5ACT/predict_bert.ipynb) na slack'u):

Określanie zmiennych parametrów modelu:

In [9]:
# Known BERT checkpoints: archive name, release-date folder and casing flag.
berts = dict(
    model_large_uncased=dict(
        bert_model_name='uncased_L-24_H-1024_A-16',
        bert_model_date='2018_10_18',
        uncased=True,
    ),
    model_base_uncased=dict(
        bert_model_name='uncased_L-12_H-768_A-12',
        bert_model_date='2020_02_20',
        uncased=True,
    ),
)


def get_bert_model(model = 'model_base_uncased', models = berts):
    """Return ``(bert_model_name, bert_model_date, uncased)`` for ``model``.

    ``models`` maps a model key to a dict holding those three entries.
    """
    spec = models.get(model)
    name, date, is_uncased = (
        spec.get(key) for key in ('bert_model_name', 'bert_model_date', 'uncased')
    )
    return name, date, is_uncased

# Selected checkpoint and the URL of its zip archive.
bert_model_name, bert_model_date, uncased = get_bert_model()
bert_model_path = f'https://storage.googleapis.com/bert_models/{bert_model_date}/{bert_model_name}.zip'

# Local directory the archive is unpacked into.
model_dir = f'./{bert_model_name}'

# Input data; note that train_dir / test_dir are CSV file paths, not directories.
data_dir = f'{project_path}/input'
train_dir = f'{data_dir}/train_fake.csv'
test_dir = f'{data_dir}/test_fake.csv'

# Destination for the saved embeddings.
out_dir = f'{project_path}/output'

In [23]:
!wget {bert_model_path}
!unzip -q {bert_model_name}.zip -d {model_dir}

--2020-11-14 11:37:19--  https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.63.128, 142.250.31.128, 172.217.164.144, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.63.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 408102251 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-11-14 11:37:21 (203 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [408102251/408102251]



Funkcje pomocnicze:

In [9]:
def init_tokenizer_and_load_bert_model(model_dir, model_name, model_trainable=True, lowercase=True):
    """Build the tokenizer and load the BERT checkpoint stored in ``model_dir``.

    Args:
        model_dir: Directory holding ``vocab.txt``, ``bert_config.json`` and
            ``bert_model.ckpt``.
        model_name: Only used in the printed summary.
        model_trainable: Forwarded as ``trainable`` to the checkpoint loader.
        lowercase: Forwarded as ``do_lower_case`` to the tokenizer.

    Returns:
        ``(tokenizer, model)``.
    """
    vocab_path, config_path, checkpoint_path = (
        f'{model_dir}/{file_name}'
        for file_name in ('vocab.txt', 'bert_config.json', 'bert_model.ckpt')
    )

    tokenizer = BertTokenizer(vocab_path, do_lower_case=lowercase)
    model = load_trained_model_from_checkpoint(
        config_path, checkpoint_path, trainable=model_trainable
    )

    print('vocab_size:', len(tokenizer.vocab))
    print('loaded model: ', model_name)

    return tokenizer, model

def get_bert_vectors(tokenizer, df, col_name, vector_length=512):
    """Tokenize ``df[col_name]`` and return the stacked model inputs.

    Every value is encoded with ``tokenizer.encode_plus``, padded and
    truncated to ``vector_length``. As a side effect four columns are added
    to ``df``: ``<col>_tokens``, ``<col>_input_ids``, ``<col>_token_type_ids``
    and ``<col>_attention_mask``.

    Returns:
        Dict with keys ``input_ids``, ``token_type_ids`` and
        ``attention_mask``, each the ``np.stack`` of the matching column.
    """
    def encode(sentence):
        return tokenizer.encode_plus(
            sentence, max_length=vector_length, padding='max_length', truncation=True
        )

    tokens_col = f'{col_name}_tokens'
    df[tokens_col] = df[col_name].map(encode)

    vectors = {}
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        field_col = f'{col_name}_{field}'
        df[field_col] = df[tokens_col].map(lambda encoded, key=field: encoded[key])
        vectors[field] = np.stack(df[field_col])

    return vectors

def bert_predict_in_batches(vectors, num_batches, output_shape, model=None):
    """Run the BERT model over ``vectors`` in batches and collect one vector per row.

    Args:
        vectors: Dict with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` arrays, as returned by ``get_bert_vectors``.
        num_batches: Number of roughly equal chunks (``np.array_split``).
        output_shape: Width of the model output, i.e. columns of the result.
        model: Object exposing ``predict``. Defaults to the module-level
            ``bert_model`` so existing three-argument calls keep working.

    Returns:
        Array of shape ``(n_rows, output_shape)`` holding, for every row, the
        model output at sequence position 0.
    """
    if model is None:
        model = bert_model  # legacy fallback to the notebook-level global

    batches = zip(
        np.array_split(vectors['input_ids'], num_batches),
        np.array_split(vectors['token_type_ids'], num_batches),
        np.array_split(vectors['attention_mask'], num_batches),
    )

    # Seed with an empty float64 array so the result keeps the dtype and shape
    # it had before; batches are joined once at the end instead of re-copying
    # the growing result on every iteration.
    chunks = [np.empty((0, output_shape))]
    total_rows = 0

    for input_ids, token_type_ids, attention_mask in batches:
        all_vectors = (input_ids, token_type_ids, attention_mask)
        predictions = model.predict(all_vectors, verbose=1)

        X_batch = predictions[:, 0, :]  # keep only sequence position 0
        print('current predictions shape: ', X_batch.shape)
        chunks.append(X_batch)
        total_rows += X_batch.shape[0]
        print('all predictions shape: ', (total_rows, output_shape))

    return np.concatenate(chunks)

Wczytywanie danych

In [10]:
# Reload the raw splits for the BERT pipeline; the training label becomes int8.
train_fake = pd.read_csv(train_dir).astype({'is_fake': 'int8'})
test_fake = pd.read_csv(test_dir)

# Not the last expression of the cell, so nothing is displayed for it.
train_fake.shape, test_fake.shape

# Missing values in every column become the literal string 'unknown'.
for _frame in (train_fake, test_fake):
    _frame.fillna('unknown', inplace=True)

In [10]:
# Build the tokenizer and load the checkpoint unpacked into model_dir.
# NOTE(review): the model is loaded with model_trainable=True although the cells
# below only call predict on it - confirm whether trainable weights are needed.
tokenizer, bert_model = init_tokenizer_and_load_bert_model(model_dir, bert_model_name, model_trainable=True, lowercase=uncased)

vocab_size: 30522
loaded model:  uncased_L-12_H-768_A-12


Tokenizacja:

In [12]:
%%time
train_title_vectors = get_bert_vectors(tokenizer, train_fake, 'title', vector_length=512)
[vec.shape for vec in train_title_vectors.values()]

CPU times: user 2.01 s, sys: 11.6 ms, total: 2.03 s
Wall time: 2.03 s


In [13]:
%%time
train_text_vectors = get_bert_vectors(tokenizer, train_fake, 'text', vector_length=512)
[vec.shape for vec in train_text_vectors.values()]

CPU times: user 5.4 s, sys: 14.7 ms, total: 5.41 s
Wall time: 5.42 s


In [14]:
%%time
test_title_vectors = get_bert_vectors(tokenizer, test_fake, 'title', vector_length=512)
[vec.shape for vec in test_title_vectors.values()]

CPU times: user 2 s, sys: 50.9 ms, total: 2.05 s
Wall time: 2.05 s


In [15]:
%%time
test_text_vectors = get_bert_vectors(tokenizer, test_fake, 'text', vector_length=512)
[vec.shape for vec in test_text_vectors.values()]

CPU times: user 5.12 s, sys: 57.2 ms, total: 5.18 s
Wall time: 5.18 s


Uruchamiamy model:

In [16]:
# Width of the last layer's output; used as the number of columns of the result.
bert_output_shape = bert_model.layers[-1].output_shape[2]
# Embed the training titles in 5 batches and save the matrix to out_dir.
train_X_title = bert_predict_in_batches(train_title_vectors, 5, bert_output_shape)
np.save(f'{out_dir}/train_X_title_{bert_model_name}.npy', train_X_title)

current predictions shape:  (973, 768)
all predictions shape:  (973, 768)
current predictions shape:  (973, 768)
all predictions shape:  (1946, 768)
current predictions shape:  (972, 768)
all predictions shape:  (2918, 768)
current predictions shape:  (972, 768)
all predictions shape:  (3890, 768)
current predictions shape:  (972, 768)
all predictions shape:  (4862, 768)


In [17]:
# Embed the training bodies in 15 batches and save the matrix to out_dir.
# NOTE(review): this file name ends with '_' before '.npy', unlike the other
# three saved arrays - check which name the loading code expects.
train_X_text = bert_predict_in_batches(train_text_vectors, 15, bert_output_shape)
np.save(f'{out_dir}/train_X_text_{bert_model_name}_.npy', train_X_text)

current predictions shape:  (325, 768)
all predictions shape:  (325, 768)
current predictions shape:  (325, 768)
all predictions shape:  (650, 768)
current predictions shape:  (324, 768)
all predictions shape:  (974, 768)
current predictions shape:  (324, 768)
all predictions shape:  (1298, 768)
current predictions shape:  (324, 768)
all predictions shape:  (1622, 768)
current predictions shape:  (324, 768)
all predictions shape:  (1946, 768)
current predictions shape:  (324, 768)
all predictions shape:  (2270, 768)
current predictions shape:  (324, 768)
all predictions shape:  (2594, 768)
current predictions shape:  (324, 768)
all predictions shape:  (2918, 768)
current predictions shape:  (324, 768)
all predictions shape:  (3242, 768)
current predictions shape:  (324, 768)
all predictions shape:  (3566, 768)
current predictions shape:  (324, 768)
all predictions shape:  (3890, 768)
current predictions shape:  (324, 768)
all predictions shape:  (4214, 768)
current predictions shape:  

In [18]:
# Embed the test titles in 5 batches and save the matrix to out_dir.
test_X_title = bert_predict_in_batches(test_title_vectors, 5, bert_output_shape)
np.save(f'{out_dir}/test_X_title_{bert_model_name}.npy', test_X_title)

current predictions shape:  (973, 768)
all predictions shape:  (973, 768)
current predictions shape:  (973, 768)
all predictions shape:  (1946, 768)
current predictions shape:  (973, 768)
all predictions shape:  (2919, 768)
current predictions shape:  (972, 768)
all predictions shape:  (3891, 768)
current predictions shape:  (972, 768)
all predictions shape:  (4863, 768)


In [19]:
# Embed the test bodies in 15 batches and save the matrix to out_dir.
test_X_text = bert_predict_in_batches(test_text_vectors, 15, bert_output_shape)
np.save(f'{out_dir}/test_X_text_{bert_model_name}.npy', test_X_text)

current predictions shape:  (325, 768)
all predictions shape:  (325, 768)
current predictions shape:  (325, 768)
all predictions shape:  (650, 768)
current predictions shape:  (325, 768)
all predictions shape:  (975, 768)
current predictions shape:  (324, 768)
all predictions shape:  (1299, 768)
current predictions shape:  (324, 768)
all predictions shape:  (1623, 768)
current predictions shape:  (324, 768)
all predictions shape:  (1947, 768)
current predictions shape:  (324, 768)
all predictions shape:  (2271, 768)
current predictions shape:  (324, 768)
all predictions shape:  (2595, 768)
current predictions shape:  (324, 768)
all predictions shape:  (2919, 768)
current predictions shape:  (324, 768)
all predictions shape:  (3243, 768)
current predictions shape:  (324, 768)
all predictions shape:  (3567, 768)
current predictions shape:  (324, 768)
all predictions shape:  (3891, 768)
current predictions shape:  (324, 768)
all predictions shape:  (4215, 768)
current predictions shape:  

## Klasyfikacja tekstu z wykorzystaniem flair i ...


#### ...distilbert:

In [None]:
def save_like_fasttext(df, text_feat, target_feat, dir_path):
    """Write ``df`` as FastText-style ``train.txt`` / ``test.txt`` / ``dev.txt``.

    Each output line is ``__label__<target>`` and the text, separated by a tab.
    The split is positional and unshuffled: the first 80% of rows go to train,
    the next 10% to test and the remaining rows to dev.

    Args:
        df: Source frame. A ``label`` column is added to it in place.
        text_feat: Name of the column holding the text.
        target_feat: Name of the column holding the class. This argument used
            to be ignored in favour of a hard-coded ``'is_fake'``.
        dir_path: Existing directory that receives the three files.
    """
    df['label'] = '__label__' + df[target_feat].astype(str)

    train_end = int(len(df) * 0.8)
    test_end = int(len(df) * 0.9)
    rows = df[['label', text_feat]]

    splits = (
        ('train.txt', rows.iloc[0:train_end]),
        ('test.txt', rows.iloc[train_end:test_end]),
        ('dev.txt', rows.iloc[test_end:]),
    )
    for file_name, part in splits:
        part.to_csv(f'{dir_path}/{file_name}', sep='\t', index=False, header=False)
    
    
    
# Export the title and body corpora into input/title_train and input/text_train.
for _feature in ("title", "text"):
    save_like_fasttext(train_fake, _feature, "is_fake", f"{project_path}/input/{_feature}_train")

dla *tekstu*:

In [36]:
# Body corpus written by save_like_fasttext.
data_folder_text = Path(f'{project_path}/input/text_train').resolve()

corpus_txt = ClassificationCorpus(
    data_folder_text,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

print(corpus_txt.obtain_statistics())

2020-11-14 11:46:02,010 Reading data from /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train
2020-11-14 11:46:02,012 Train: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/train.txt
2020-11-14 11:46:02,013 Dev: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/dev.txt
2020-11-14 11:46:02,015 Test: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/test.txt
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 2892,
        "number_of_documents_per_class": {
            "0": 1618,
            "1": 1274
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 163584,
            "min": 3,
            "max": 1428,
            "avg": 56.56431535269709
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 361,
        "number_of_documents_per_class": {
        

In [37]:
# Classifier over article bodies: fine-tuned distilbert document embeddings
# feeding a single-label TextClassifier, trained with Adam.
label_dict = corpus_txt.make_label_dictionary()
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)
classifier_txt = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
trainer_txt = ModelTrainer(classifier_txt, corpus_txt, optimizer=Adam)

2020-11-14 11:46:21,370 Computing label dictionary. Progress:


100%|██████████| 3253/3253 [00:06<00:00, 533.83it/s]

2020-11-14 11:46:28,038 [b'0', b'1']





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [38]:
# Train the body classifier; output goes to output/final/text_train
# (the plotting cell reads loss.tsv from that directory).
trainer_txt.train(f'{project_path}/output/final/text_train',
              learning_rate=3e-5, # use very small learning rate
              mini_batch_size=16,
              #mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
              max_epochs=5, # terminate after 5 epochs
              )

2020-11-14 11:46:40,075 ----------------------------------------------------------------------------------------------------
2020-11-14 11:46:40,077 Model: "TextClassifier(
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in

{'dev_loss_history': [0.6241855025291443,
  0.6403789520263672,
  0.9891240000724792,
  1.4031282663345337,
  1.7087633609771729],
 'dev_score_history': [0.6369, 0.6816, 0.6257, 0.6341, 0.6704],
 'test_score': 0.6371,
 'train_loss_history': [0.6811314542978508,
  0.5596533140260211,
  0.2925370243167021,
  0.08809913441896418,
  0.03085332238665288]}

In [None]:
# Plot the curves logged in the body classifier's training directory.
plotter = Plotter()
plotter.plot_training_curves(f'{project_path}/output/final/text_train/loss.tsv')

dla *tytułu*:

In [20]:
# Title corpus written by save_like_fasttext.
data_folder_title = Path(f'{project_path}/input/title_train').resolve()

corpus_tit = ClassificationCorpus(
    data_folder_title,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

print(corpus_tit.obtain_statistics())

2020-11-14 12:23:13,275 Reading data from /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train
2020-11-14 12:23:13,277 Train: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/train.txt
2020-11-14 12:23:13,280 Dev: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/dev.txt
2020-11-14 12:23:13,281 Test: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/test.txt
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 3889,
        "number_of_documents_per_class": {
            "0": 2354,
            "1": 1535
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 16924,
            "min": 1,
            "max": 59,
            "avg": 4.351761378246336
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 486,
        "number_of_documents_per_class": {
       

In [21]:
# Classifier over titles: same setup as for the bodies (fine-tuned distilbert
# document embeddings, single-label TextClassifier, Adam).
# label_dict and document_embeddings are rebound here.
label_dict = corpus_tit.make_label_dictionary()
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)
classifier_tit = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
trainer_tit = ModelTrainer(classifier_tit, corpus_tit, optimizer=Adam)

2020-11-14 12:23:22,851 Computing label dictionary. Progress:


100%|██████████| 4375/4375 [00:01<00:00, 2257.54it/s]

2020-11-14 12:23:25,863 [b'0', b'1']





In [22]:
# Train the title classifier; output goes to output/final/title_train.
trainer_tit.train(f'{project_path}/output/final/title_train',
              learning_rate=3e-5, # use very small learning rate
              mini_batch_size=16,
              #mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
              max_epochs=5, # terminate after 5 epochs
              )

2020-11-14 12:23:38,884 ----------------------------------------------------------------------------------------------------
2020-11-14 12:23:38,887 Model: "TextClassifier(
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in

{'dev_loss_history': [0.5874093770980835,
  0.5968873500823975,
  0.6339479088783264,
  0.7761823534965515,
  0.7203871607780457],
 'dev_score_history': [0.6468, 0.655, 0.6653, 0.6304, 0.6427],
 'test_score': 0.6831,
 'train_loss_history': [0.6204379251013037,
  0.5390320489885377,
  0.4972720181599992,
  0.4621662915852226,
  0.45212755462185283]}

In [None]:
# Plot the curves logged in the title classifier's training directory.
# The trainer was given output/final/title_train as its base path, so loss.tsv
# lives there; the previous path had 'title_train' and 'final' swapped.
plotter = Plotter()
plotter.plot_training_curves(f'{project_path}/output/final/title_train/loss.tsv')

In [None]:
def make_pred(txt, classifier=None):
    """Return the predicted label of ``txt`` as an ``int``.

    Args:
        txt: Text to classify; it is converted with ``str`` first.
        classifier: Model exposing ``predict``, e.g. ``classifier_txt`` or
            ``classifier_tit``. When omitted, a module-level variable named
            ``classifier`` is used if one exists, which is what the function
            relied on implicitly before.

    Raises:
        NameError: If no classifier is passed and no global one is defined.
    """
    if classifier is None:
        classifier = globals().get('classifier')
        if classifier is None:
            raise NameError(
                "make_pred() needs a classifier: pass it explicitly or define a global 'classifier'"
            )
    sent = Sentence(str(txt))
    classifier.predict(sent)
    # Value of the first label attached to the sentence by the classifier.
    return int(sent.labels[0].value)

def make_pred_proba(txt, classifier):
    """Classify ``txt`` with ``classifier`` and return the resulting labels.

    Args:
        txt: Value to classify; it is converted with ``str()`` first.
        classifier: Object exposing ``predict(sentence)`` that annotates the
            sentence in place.

    Returns:
        The ``labels`` attribute of the sentence after prediction.
    """
    sentence = Sentence(str(txt))
    classifier.predict(sentence)
    predicted_labels = sentence.labels
    return predicted_labels

In [None]:
# Label every test article body with `classifier_txt`; each cell of `is_fake`
# holds the label list returned by make_pred_proba. Export id + labels.
test_fake['is_fake'] = test_fake['text'].map(lambda x: make_pred_proba(x, classifier_txt))
test_fake[['id', 'is_fake']].to_csv(f'{project_path}/output/final/distill_text_test_5epch.csv', index=False)

In [None]:
# Same as for the text, but on titles with `classifier_tit`.
# Note: this overwrites the `is_fake` column written by the text cell.
test_fake['is_fake'] = test_fake['title'].map(lambda x: make_pred_proba(x, classifier_tit))
test_fake[['id', 'is_fake']].to_csv(f'{project_path}/output/final/distill_title_test_5epch.csv', index=False)

In [None]:
# Label the training article bodies as well and export them together with the
# gold `is_fake` column; the result is stored in a separate column so the
# target is not overwritten.
train_fake['is_fake_txt_proba'] = train_fake['text'].map(lambda x: make_pred_proba(x, classifier_txt))
train_fake[ ['id', 'is_fake_txt_proba', 'is_fake'] ].to_csv(f'{project_path}/output/final/distill_text_train_5epch.csv', index=False)

In [None]:
# Label the training titles and export them together with the gold `is_fake`.
train_fake['is_fake_tit_proba'] = train_fake['title'].map(lambda x: make_pred_proba(x, classifier_tit))
train_fake[ ['id', 'is_fake_tit_proba', 'is_fake'] ].to_csv(f'{project_path}/output/final/distill_title_train_5epch.csv', index=False)

#### ...flair-news,
dla *tekstu*:

In [23]:
# Folder with the train/dev/test split files of the article texts.
data_folder_text = Path(f'{project_path}/input/text_train').resolve()


# Build the classification corpus from the three split files in that folder.
corpus_txt = ClassificationCorpus(
    data_folder_text,
    test_file='test.txt',
    dev_file='dev.txt',
    train_file='train.txt')


# Sanity check: document counts per class and token statistics per split.
print(corpus_txt.obtain_statistics())

2020-11-14 12:31:08,377 Reading data from /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train
2020-11-14 12:31:08,379 Train: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/train.txt
2020-11-14 12:31:08,380 Dev: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/dev.txt
2020-11-14 12:31:08,381 Test: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/test.txt
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 2892,
        "number_of_documents_per_class": {
            "0": 1618,
            "1": 1274
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 163584,
            "min": 3,
            "max": 1428,
            "avg": 56.56431535269709
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 361,
        "number_of_documents_per_class": {
        

In [24]:
# Text classifier on top of the 'news-forward' Flair embeddings: token
# embeddings go through a document RNN (hidden size 256) into a single-label
# classifier over the labels found in the corpus.
# Note: this rebinds `classifier_txt` / `trainer_txt`, so prediction cells run
# afterwards use this model.
label_dict = corpus_txt.make_label_dictionary()
word_embeddings = [FlairEmbeddings('news-forward')]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier_txt = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
trainer_txt = ModelTrainer(classifier_txt, corpus_txt, optimizer=Adam)

2020-11-14 12:31:23,659 Computing label dictionary. Progress:


100%|██████████| 3253/3253 [00:05<00:00, 625.62it/s]

2020-11-14 12:31:29,514 [b'0', b'1']





2020-11-14 12:31:29,959 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmp1w1c_fdq


100%|██████████| 73034624/73034624 [00:03<00:00, 23155801.32B/s]

2020-11-14 12:31:33,431 copying /tmp/tmp1w1c_fdq to cache at /root/.flair/embeddings/news-forward-0.4.1.pt





2020-11-14 12:31:33,573 removing temp file /tmp/tmp1w1c_fdq


In [None]:
# Train the flair-news text classifier for up to 20 epochs, using
# `output/text_train/flair_news` as the directory for this run.
# NOTE(review): the trainer was built with optimizer=Adam and is run here with
# learning_rate=0.1 - confirm this step size is intended for Adam.
trainer_txt.train(f'{project_path}/output/text_train/flair_news',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              shuffle=True,
              max_epochs=20)

2020-11-14 12:31:44,051 ----------------------------------------------------------------------------------------------------
2020-11-14 12:31:44,053 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2048, out_features=2048, bias=True)
    (rnn): GRU(2048, 256, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=256, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2020-11-14 12:31:44,055 ---------------------------------------------------------------------------------------

In [None]:
# Plot the curves of the flair-news text run (same directory as passed to
# trainer_txt.train); relies on the `plotter` object created in an earlier cell.
plotter.plot_training_curves(f'{project_path}/output/text_train/flair_news/loss.tsv')

...'flair-news' dla tytułów:

In [11]:
# Folder with the train/dev/test split files of the article titles.
data_folder_title = Path(f'{project_path}/input/title_train').resolve()


# Build the classification corpus from the three split files in that folder.
corpus_tit = ClassificationCorpus(
    data_folder_title,
    test_file='test.txt',
    dev_file='dev.txt',
    train_file='train.txt')


# Sanity check: document counts per class and token statistics per split.
print(corpus_tit.obtain_statistics())

2020-11-14 14:06:46,488 Reading data from /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train
2020-11-14 14:06:46,488 Train: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/train.txt
2020-11-14 14:06:46,489 Dev: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/dev.txt
2020-11-14 14:06:46,490 Test: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/test.txt
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 3889,
        "number_of_documents_per_class": {
            "0": 2354,
            "1": 1535
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 16924,
            "min": 1,
            "max": 59,
            "avg": 4.351761378246336
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 486,
        "number_of_documents_per_class": {
       

In [12]:
# Title classifier with the same architecture as the flair-news text model:
# 'news-forward' Flair embeddings -> document RNN (hidden size 256) ->
# single-label classifier.
# Note: this rebinds `classifier_tit` / `trainer_tit`.
label_dict = corpus_tit.make_label_dictionary()
word_embeddings = [FlairEmbeddings('news-forward')]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier_tit = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
trainer_tit = ModelTrainer(classifier_tit, corpus_tit, optimizer=Adam)

2020-11-14 14:06:53,046 Computing label dictionary. Progress:


100%|██████████| 4375/4375 [00:01<00:00, 3128.88it/s]

2020-11-14 14:06:54,580 [b'0', b'1']





2020-11-14 14:06:55,016 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmp8ul9uyug


100%|██████████| 73034624/73034624 [00:03<00:00, 21111782.92B/s]

2020-11-14 14:06:58,794 copying /tmp/tmp8ul9uyug to cache at /root/.flair/embeddings/news-forward-0.4.1.pt
2020-11-14 14:06:58,859 removing temp file /tmp/tmp8ul9uyug





In [13]:
# Train the flair-news title classifier for up to 20 epochs, using
# `output/title_train/flair_news` as the directory for this run.
# NOTE(review): optimizer=Adam is combined with learning_rate=0.1; the history
# printed below this cell shows dev scores fluctuating between roughly 0.42
# and 0.60 with no clear trend - a smaller learning rate is worth trying.
trainer_tit.train(f'{project_path}/output/title_train/flair_news',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              shuffle=True,
              max_epochs=20)

2020-11-14 14:07:20,262 ----------------------------------------------------------------------------------------------------
2020-11-14 14:07:20,264 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2048, out_features=2048, bias=True)
    (rnn): GRU(2048, 256, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=256, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2020-11-14 14:07:20,265 ---------------------------------------------------------------------------------------

{'dev_loss_history': [1.0374044179916382,
  1.2356168031692505,
  2.694348096847534,
  1.1304402351379395,
  1.4350321292877197,
  1.4485690593719482,
  1.2496886253356934,
  1.7898976802825928,
  0.9989030957221985,
  1.4439035654067993,
  0.9096037745475769,
  3.065790891647339,
  2.782984733581543,
  0.9415881037712097,
  1.9684253931045532,
  0.9624318480491638,
  0.7639271020889282,
  1.3455917835235596,
  1.0686461925506592,
  0.8424346446990967],
 'dev_score_history': [0.5832,
  0.5688,
  0.4867,
  0.5626,
  0.6016,
  0.5791,
  0.5975,
  0.4456,
  0.6037,
  0.5873,
  0.5031,
  0.4168,
  0.5832,
  0.5873,
  0.5914,
  0.4682,
  0.4969,
  0.5811,
  0.4312,
  0.5832],
 'test_score': 0.6049,
 'train_loss_history': [2.77339418061444,
  2.127983719110489,
  2.3825485491361773,
  2.9082503084276543,
  2.3060546136293256,
  2.701214604201864,
  2.9403194360068587,
  2.1404989952923823,
  2.3008467696729253,
  2.22692094230261,
  2.511417348853877,
  2.3301363729062627,
  1.92185890576878

In [None]:
# Plot the curves of the flair-news title run. The trainer was started with
# `output/title_train/flair_news` as its directory, so loss.tsv is read from
# that folder (the previous path omitted the `flair_news` sub-directory).
plotter = Plotter()
plotter.plot_training_curves(f'{project_path}/output/title_train/flair_news/loss.tsv')

In [None]:
# Label the test article bodies with the flair-news text classifier and export
# id + labels. The file goes to `output/final/` because combine_predictions()
# reads `output/final/{model}_text_test_{epch}epch.csv`.
test_fake['is_fake'] = test_fake['text'].map(lambda x: make_pred_proba(x, classifier_txt))
test_fake[['id', 'is_fake']].to_csv(f'{project_path}/output/final/flair_news_text_test_20epch.csv', index=False)

In [None]:
# Label the test titles with the flair-news title classifier and export
# id + labels (this overwrites the `is_fake` column of the text cell). The file
# goes to `output/final/` because combine_predictions() reads
# `output/final/{model}_title_test_{epch}epch.csv`.
test_fake['is_fake'] = test_fake['title'].map(lambda x: make_pred_proba(x, classifier_tit))
test_fake[['id', 'is_fake']].to_csv(f'{project_path}/output/final/flair_news_title_test_20epch.csv', index=False)

In [None]:
# Label the training article bodies with the flair-news text classifier and
# export them with the gold `is_fake`. The file goes to `output/final/` because
# combine_predictions() reads `output/final/{model}_text_train_{epch}epch.csv`.
train_fake['is_fake_txt_proba'] = train_fake['text'].map(lambda x: make_pred_proba(x, classifier_txt))
train_fake[ ['id', 'is_fake_txt_proba', 'is_fake'] ].to_csv(f'{project_path}/output/final/flair_news_text_train_20epch.csv', index=False)

In [None]:
# Label the training titles with the flair-news title classifier and export
# them with the gold `is_fake`. The file goes to `output/final/` because
# combine_predictions() reads `output/final/{model}_title_train_{epch}epch.csv`.
train_fake['is_fake_tit_proba'] = train_fake['title'].map(lambda x: make_pred_proba(x, classifier_tit))
train_fake[ ['id', 'is_fake_tit_proba', 'is_fake'] ].to_csv(f'{project_path}/output/final/flair_news_title_train_20epch.csv', index=False)

#### ... 'bert-base-uncased'
dla *tekstu*:

In [16]:
# Reload the text corpus (same split files as for the earlier text models).
data_folder_text = Path(f'{project_path}/input/text_train').resolve()


# Build the classification corpus from the three split files in that folder.
corpus_txt = ClassificationCorpus(
    data_folder_text,
    test_file='test.txt',
    dev_file='dev.txt',
    train_file='train.txt')


# Sanity check: document counts per class and token statistics per split.
print(corpus_txt.obtain_statistics())

2020-11-14 14:25:11,330 Reading data from /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train
2020-11-14 14:25:11,330 Train: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/train.txt
2020-11-14 14:25:11,331 Dev: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/dev.txt
2020-11-14 14:25:11,332 Test: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/text_train/test.txt
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 2892,
        "number_of_documents_per_class": {
            "0": 1618,
            "1": 1274
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 163584,
            "min": 3,
            "max": 1428,
            "avg": 56.56431535269709
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 361,
        "number_of_documents_per_class": {
        

In [17]:
# Text classifier on top of a fine-tuned 'bert-base-uncased' document
# embedding (fine_tune=True, so the transformer weights are trained too).
# Note: this rebinds `classifier_txt` / `trainer_txt`.
label_dict = corpus_txt.make_label_dictionary()
document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=True)
classifier_txt = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
trainer_txt = ModelTrainer(classifier_txt, corpus_txt, optimizer=Adam)

2020-11-14 14:25:31,305 Computing label dictionary. Progress:


100%|██████████| 3253/3253 [00:04<00:00, 650.71it/s]

2020-11-14 14:25:36,671 [b'0', b'1']





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [19]:
# Fine-tune the BERT text classifier for at most 5 epochs.
# NOTE(review): `output/final/text_train` looks like a directory shared with
# other runs (the title runs likewise reuse `output/final/title_train`) -
# confirm that overwriting earlier artefacts there is acceptable.
trainer_txt.train(f'{project_path}/output/final/text_train',
              learning_rate=3e-5, # use very small learning rate
              mini_batch_size=16,
              #mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
              max_epochs=5,
              )

2020-11-14 14:26:57,639 ----------------------------------------------------------------------------------------------------
2020-11-14 14:26:57,644 Model: "TextClassifier(
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
               

{'dev_loss_history': [0.6621371507644653,
  0.6686359643936157,
  1.1185706853866577,
  1.651928186416626],
 'dev_score_history': [0.6369, 0.6536, 0.662, 0.6341],
 'test_score': 0.6814,
 'train_loss_history': [0.6651822056888875,
  0.549201679657836,
  0.2573731599056589,
  0.09523815586834951]}

... dla tytułów:

In [20]:
# Reload the title corpus (same split files as for the earlier title models).
data_folder_title = Path(f'{project_path}/input/title_train').resolve()


# Build the classification corpus from the three split files in that folder.
corpus_tit = ClassificationCorpus(
    data_folder_title,
    test_file='test.txt',
    dev_file='dev.txt',
    train_file='train.txt')


# Sanity check: document counts per class and token statistics per split.
print(corpus_tit.obtain_statistics())

2020-11-14 14:36:12,626 Reading data from /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train
2020-11-14 14:36:12,629 Train: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/train.txt
2020-11-14 14:36:12,634 Dev: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/dev.txt
2020-11-14 14:36:12,636 Test: /content/drive/My Drive/Colab Notebooks/DW_fake_news_detection/input/title_train/test.txt
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 3889,
        "number_of_documents_per_class": {
            "0": 2354,
            "1": 1535
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 16924,
            "min": 1,
            "max": 59,
            "avg": 4.351761378246336
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 486,
        "number_of_documents_per_class": {
       

In [21]:
# Title classifier on top of a fine-tuned 'bert-base-uncased' document
# embedding. Note: this rebinds `classifier_tit` / `trainer_tit`.
label_dict = corpus_tit.make_label_dictionary()
document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=True)
classifier_tit = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
trainer_tit = ModelTrainer(classifier_tit, corpus_tit, optimizer=Adam)

2020-11-14 14:36:14,841 Computing label dictionary. Progress:


100%|██████████| 4375/4375 [00:02<00:00, 2125.72it/s]

2020-11-14 14:36:17,406 [b'0', b'1']





In [22]:
# Fine-tune the BERT title classifier for at most 5 epochs.
# NOTE(review): `output/final/title_train` is the same directory the earlier
# title run was given, so that run's artefacts in it are presumably
# overwritten - confirm this is acceptable.
trainer_tit.train(f'{project_path}/output/final/title_train',
              learning_rate=3e-5, # use very small learning rate
              mini_batch_size=16,
              #mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
              max_epochs=5,
              )

2020-11-14 14:36:32,689 ----------------------------------------------------------------------------------------------------
2020-11-14 14:36:32,695 Model: "TextClassifier(
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
               

{'dev_loss_history': [0.6284379363059998,
  0.5850365161895752,
  0.5897732973098755,
  0.688412606716156,
  0.8103304505348206],
 'dev_score_history': [0.614, 0.655, 0.6674, 0.6468, 0.6345],
 'test_score': 0.6317,
 'train_loss_history': [0.6184929433454317,
  0.5477736911690626,
  0.5155468625856228,
  0.4815388741429712,
  0.4497747525084214]}

In [None]:
# Label the test article bodies with the BERT text classifier; each cell of
# `is_fake` holds the label list returned by make_pred_proba. Export id + labels.
test_fake['is_fake'] = test_fake['text'].map(lambda x: make_pred_proba(x, classifier_txt))
test_fake[['id', 'is_fake']].to_csv(f'{project_path}/output/final/bert-base-uncased_text_test_5epch.csv', index=False)

In [None]:
# Label the test titles with the BERT title classifier and export id + labels.
# Note: this overwrites the `is_fake` column written by the text cell.
test_fake['is_fake'] = test_fake['title'].map(lambda x: make_pred_proba(x, classifier_tit))
test_fake[['id', 'is_fake']].to_csv(f'{project_path}/output/final/bert-base-uncased_title_test_5epch.csv', index=False)

In [None]:
# Label the training article bodies with the BERT text classifier and export
# them with the gold `is_fake`. The model was trained with max_epochs=5 and
# combine_predictions(epch=5, model='bert-base-uncased') reads the `_5epch`
# file, so the file is named accordingly (it used to say `_20epch`).
train_fake['is_fake_txt_proba'] = train_fake['text'].map(lambda x: make_pred_proba(x, classifier_txt))
train_fake[ ['id', 'is_fake_txt_proba', 'is_fake'] ].to_csv(f'{project_path}/output/final/bert-base-uncased_text_train_5epch.csv', index=False)

In [None]:
# Label the training titles with the BERT title classifier and export them
# with the gold `is_fake`. The model was trained with max_epochs=5 and
# combine_predictions(epch=5, model='bert-base-uncased') reads the `_5epch`
# file, so the file is named accordingly (it used to say `_20epch`).
train_fake['is_fake_tit_proba'] = train_fake['title'].map(lambda x: make_pred_proba(x, classifier_tit))
train_fake[ ['id', 'is_fake_tit_proba', 'is_fake'] ].to_csv(f'{project_path}/output/final/bert-base-uncased_title_train_5epch.csv', index=False)

### Obliczam dodatkową reprezentację wektorową, wykorzystując bibliotekę `flair`:

In [23]:
# Document embedding used as an extra feature vector: GloVe word embeddings
# and the 'news-forward-fast' Flair embeddings, mean-pooled over the tokens.
glove_embedding = WordEmbeddings('glove')
news_forward_embeddings = FlairEmbeddings('news-forward-fast')
embeddings = DocumentPoolEmbeddings(
    [glove_embedding, news_forward_embeddings],
    pooling='mean', #max, min
)

2020-11-14 15:01:01,336 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpnnnhoq0v


100%|██████████| 160000128/160000128 [00:06<00:00, 26091395.96B/s]

2020-11-14 15:01:07,999 copying /tmp/tmpnnnhoq0v to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2020-11-14 15:01:08,500 removing temp file /tmp/tmpnnnhoq0v
2020-11-14 15:01:09,474 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmptsesjheq


100%|██████████| 21494764/21494764 [00:01<00:00, 14595613.92B/s]

2020-11-14 15:01:11,268 copying /tmp/tmptsesjheq to cache at /root/.flair/embeddings/glove.gensim





2020-11-14 15:01:11,291 removing temp file /tmp/tmptsesjheq
2020-11-14 15:01:13,692 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmppsmrl22v


100%|██████████| 19689779/19689779 [00:01<00:00, 14741726.14B/s]

2020-11-14 15:01:15,351 copying /tmp/tmppsmrl22v to cache at /root/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2020-11-14 15:01:15,372 removing temp file /tmp/tmppsmrl22v





Oczyszczam tytuły i tekst za pomocą metody `clean` z biblioteki `texthero`:

In [24]:
# Cleaned copies of title and text produced by texthero's `clean`; the
# original columns are kept untouched.
fake_df['tit_cln'] = hero.clean(fake_df['title'])
fake_df['txt_cln'] = hero.clean(fake_df['text'])

Wyliczam wektory:

In [29]:
def calc_embeddings(text, embeddings):
    """Embed ``text`` as a single document vector.

    Args:
        text: Text to embed. Anything that is not a non-empty ``str`` is
            treated as missing.
        embeddings: Document embedding object exposing ``embed(sentence)``.

    Returns:
        A NumPy array with the document embedding, or ``None`` when ``text``
        is missing (callers replace ``None`` with a zero vector later on).
    """
    if not isinstance(text, str) or not text:
        return None

    sent = Sentence(text)
    embeddings.embed(sent)
    # The embedding is a torch tensor; NumPy cannot convert a tensor that lives
    # on the GPU or tracks gradients, so detach it and move it to the CPU first.
    return np.array(sent.embedding.detach().cpu())

In [None]:
# Embed every cleaned title (timed with %time) and persist the column.
# Cells are arrays or None, which is why the file is later loaded with
# allow_pickle=True.
%time fake_df['vectors_tit'] = fake_df.tit_cln.map(lambda text: calc_embeddings(text, embeddings))
np.save(f'{project_path}/input/vectors_tit.npy', fake_df['vectors_tit'])

In [None]:
# Embed every cleaned article body (timed with %time) and persist the column.
# Cells are arrays or None, which is why the file is later loaded with
# allow_pickle=True.
%time fake_df['vectors_txt'] = fake_df.txt_cln.map(lambda text: calc_embeddings(text, embeddings))
np.save(f'{project_path}/input/vectors_txt.npy', fake_df['vectors_txt'])

## Trenowanie ostatecznego modelu (XGBoost):
Wykorzystując wyniki wcześniejszych modeli klasyfikacji tekstu (flair), wyciągam prawdopodobieństwa, jakie te modele obliczyły dla swoich predykcji:

In [3]:
def combine_predictions(epch=5, model='distill'):
    """Collect one model's "is fake" probabilities for text and title.

    Reads the four prediction CSVs (text/title x train/test) exported for
    ``model`` after ``epch`` epochs from ``{project_path}/output/final``,
    parses the stringified labels and turns them into the probability of
    class 1.

    Args:
        epch: Epoch count that is part of the CSV file names.
        model: Model prefix that is part of the CSV file names.

    Returns:
        DataFrame with columns ``is_fake_proba_text`` and
        ``is_fake_proba_title``; train rows come first, then test rows, and
        the index is reset to 0..N-1.
    """
    # (csv path, column holding the stringified label), keyed by "<split>_<field>".
    sources = {
        'train_text': (f'{project_path}/output/final/{model}_text_train_{epch}epch.csv', 'is_fake_txt_proba'),
        'test_text': (f'{project_path}/output/final/{model}_text_test_{epch}epch.csv', 'is_fake'),
        'train_title': (f'{project_path}/output/final/{model}_title_train_{epch}epch.csv', 'is_fake_tit_proba'),
        'test_title': (f'{project_path}/output/final/{model}_title_test_{epch}epch.csv', 'is_fake'),
    }
    frames = {key: pd.read_csv(path) for key, (path, _) in sources.items()}

    def _label_of(raw):
        # Digit directly after the opening "[" is the predicted class.
        return float(re.match(r'(\[)(\d)', raw).group(2))

    def _confidence_of(raw):
        # Decimal number after the last "(" that is followed by one.
        return float(re.match(r'(.*\()(\d+\.\d+)', raw).group(2))

    for key, (_, raw_col) in sources.items():
        frame = frames[key]
        frame['pred'] = frame[raw_col].map(_label_of)
        frame['proba'] = frame[raw_col].map(_confidence_of)

    for frame in frames.values():
        # `proba` is the confidence of the predicted class; this maps it to
        # P(class 1): proba when pred == 1, 1 - proba when pred == 0.
        frame['is_fake_proba'] = np.abs(1 - frame['pred'] - frame['proba'])

    # Pair text and title predictions of the same article by id.
    merged_train = pd.merge(frames['train_text'], frames['train_title'],
                            on='id', suffixes=('_text', '_title'))
    merged_test = pd.merge(frames['test_text'], frames['test_title'],
                           on='id', suffixes=('_text', '_title'))

    stacked = pd.concat([merged_train, merged_test])
    wanted = stacked[['is_fake_proba_text', 'is_fake_proba_title']]
    return wanted.reset_index(drop=True)

Łączę wszystkie cechy:

In [6]:
# Names of the precomputed fake_df columns used as hand-crafted features for
# the final model. Judging by the names they cover entity-tag counts and
# frequencies, length/casing statistics and part-of-speech counts and
# frequencies for text (`_txt`) and title (`_tit`); the code that computes
# them is outside this section.
feats = ['flair_text_LOC',
       'flair_text_PER', 'flair_text_ORG', 'flair_text_MISC', 'flair_title_LOC', 'flair_title_PER',
       'flair_title_ORG', 'flair_title_MISC', 'title_len', 'text_len', 'title_upper_cnt',
       'text_upper_cnt', 'title_starts_with_lower', 'title_wrd_cnt',
       'text_wrd_cnt', 'flair_text_LOC_freq', 'flair_text_PER_freq',
       'flair_text_ORG_freq', 'flair_text_MISC_freq', 'flair_title_LOC_freq',
       'flair_title_PER_freq', 'flair_title_ORG_freq',
       'flair_title_MISC_freq', 'PRON_txt', 'AUX_txt', 'NUM_txt', 'ADJ_txt', 'NOUN_txt', 'ADP_txt', 
        'PROPN_txt', 'PUNCT_txt', 'CCONJ_txt', 'DET_txt', 'VERB_txt', 'SYM_txt', 'ADV_txt', 'SCONJ_txt',
         'PART_txt', 'X_txt', 'INTJ_txt', 'SPACE_txt', 'NOUN_tit', 'PUNCT_tit', 'ADV_tit', 'PART_tit', 
         'VERB_tit', 'ADP_tit', 'DET_tit', 'ADJ_tit', 'SCONJ_tit', 'CCONJ_tit', 'PROPN_tit', 'NUM_tit', 
         'AUX_tit', 'PRON_tit', 'SYM_tit', 'X_tit', 'INTJ_tit', 'SPACE_tit', 'len_txt', 'len_tit', 
         'PRON_txt_freq', 'AUX_txt_freq', 'NUM_txt_freq', 'ADJ_txt_freq', 'NOUN_txt_freq', 'ADP_txt_freq', 
         'PROPN_txt_freq', 'PUNCT_txt_freq', 'CCONJ_txt_freq', 'DET_txt_freq', 'VERB_txt_freq', 
         'SYM_txt_freq', 'ADV_txt_freq', 'SCONJ_txt_freq', 'PART_txt_freq', 'X_txt_freq', 'INTJ_txt_freq', 
         'SPACE_txt_freq', 'NOUN_tit_freq', 'PUNCT_tit_freq', 'ADV_tit_freq', 'PART_tit_freq', 
         'VERB_tit_freq', 'ADP_tit_freq', 'DET_tit_freq', 'ADJ_tit_freq', 'SCONJ_tit_freq', 
         'CCONJ_tit_freq', 'PROPN_tit_freq', 'NUM_tit_freq', 'AUX_tit_freq', 'PRON_tit_freq', 
         'SYM_tit_freq', 'X_tit_freq', 'INTJ_tit_freq', 'SPACE_tit_freq']

In [7]:
# Per-model "is fake" probabilities (text + title) for train and test rows.
dist_proba = combine_predictions(epch=5)
dist_proba_bert = combine_predictions(epch=5, model='bert-base-uncased')
dist_proba_news = combine_predictions(epch=20, model='flair_news')



# Precomputed feature matrices for train/test text and title.
# NOTE(review): `out_dir` is not defined in this section - presumably set in an
# earlier cell; the file names suggest BERT (uncased_L-12_H-768_A-12) features.
X_txt = np.load(f'{out_dir}/train_X_text_uncased_L-12_H-768_A-12.npy')
X_tit = np.load(f'{out_dir}/train_X_title_uncased_L-12_H-768_A-12.npy')

X_txt_test = np.load(f'{out_dir}/test_X_text_uncased_L-12_H-768_A-12.npy')
X_tit_test = np.load(f'{out_dir}/test_X_title_uncased_L-12_H-768_A-12.npy')

# Pooled flair document vectors saved earlier. Rows without a vector (None)
# are replaced by a zero vector of length 1124 so the result is a dense matrix
# (1124 is presumably the width of the pooled embedding - confirm).
X_txt_vecs = np.load(f'{project_path}/input/vectors_txt.npy', allow_pickle=True)
X_txt_vecs = np.array([x if x is not None else np.zeros(1124) for x in X_txt_vecs])

X_tit_vecs = np.load(f'{project_path}/input/vectors_tit.npy', allow_pickle=True)
X_tit_vecs = np.array([x if x is not None else np.zeros(1124) for x in X_tit_vecs])

# Split the vectors into train/test: rows with a known `is_fake` are train,
# rows where it is missing are test.
X_txt_vecs_train = X_txt_vecs[fake_df['is_fake'].notnull()]
X_txt_vecs_test = X_txt_vecs[fake_df['is_fake'].isnull()]
X_tit_vecs_train = X_tit_vecs[fake_df['is_fake'].notnull()]
X_tit_vecs_test = X_tit_vecs[fake_df['is_fake'].isnull()]


# Attach the model probabilities as columns of fake_df.
# NOTE(review): pandas aligns these assignments on the index. The frames from
# combine_predictions() have a fresh 0..N-1 index (train rows, then test rows),
# so this assumes fake_df has the same unique index and row order - confirm,
# otherwise rows get the wrong probabilities.
fake_df['is_fake_proba_text'] = dist_proba['is_fake_proba_text']
fake_df['is_fake_proba_title'] = dist_proba['is_fake_proba_title']
fake_df['is_fake_proba_text_bert'] = dist_proba_bert['is_fake_proba_text']
fake_df['is_fake_proba_title_bert'] = dist_proba_bert['is_fake_proba_title']
fake_df['is_fake_proba_text_news'] = dist_proba_news['is_fake_proba_text']
fake_df['is_fake_proba_title_news'] = dist_proba_news['is_fake_proba_title']



# Final training matrix: both precomputed matrices, both pooled flair vectors,
# the hand-crafted features and the six model probabilities, side by side.
# NOTE(review): the train-row probabilities presumably come from models that
# saw those rows during training, which would inflate the CV score - confirm.
proba_feats = ['is_fake_proba_text', 'is_fake_proba_title', 'is_fake_proba_text_bert', 'is_fake_proba_title_bert', 'is_fake_proba_text_news', 'is_fake_proba_title_news']
train_feats = feats + proba_feats
other_feats = fake_df[fake_df['is_fake'].notnull()][train_feats].values

X = np.hstack([X_txt, X_tit, X_txt_vecs_train, X_tit_vecs_train, other_feats])
y = fake_df[fake_df['is_fake'].notnull()]['is_fake'].values



Trenowanie modelu: (po wcześniejszym wykonaniu optymalizacji `hyperopt`)


In [None]:
# Hyper-parameters found beforehand with hyperopt.
xgbparams = {'colsample_bytree': 0.8347102424652894, 
             'learning_rate': 0.155758152117416, 
             'max_depth': 16, 
             'min_child_weight': 3.0, 
             'n_estimators': 90, 
             'reg_alpha': 1.3602305184888648, 
             'reg_lambda': 0.932480406974228, 
             'seed': 8250, 
             'subsample': 0.7603205093434772}

# 10-fold stratified CV repeated 3 times; used only to estimate ROC AUC.
cvrpt = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
model = xgb.XGBClassifier(**xgbparams)

scores = []

for train_idx, test_idx in cvrpt.split(X, y):
    model.fit(X[train_idx], y[train_idx])
    y_pred = model.predict_proba(X[test_idx])[:, 1]
    score = roc_auc_score(y[test_idx], y_pred)
    scores.append(score)

# After the loop `model` only holds the fit from the last CV fold (90% of the
# rows). Refit on the complete training set so the submission uses all of the
# labelled data.
model.fit(X, y)

test_fake = fake_df[fake_df['is_fake'].isnull()].copy()
other_feats_test = fake_df[fake_df['is_fake'].isnull()][train_feats].values

# Same column layout as the training matrix X.
X_test = np.hstack([X_txt_test, X_tit_test, X_txt_vecs_test, X_tit_vecs_test, other_feats_test])
test_fake['is_fake'] = model.predict_proba(X_test)[:, 1]


In [15]:
# Mean and standard deviation of the per-fold ROC AUC scores collected above.
np.mean(scores), np.std(scores)

(0.9818255971571602, 0.006162104282424518)

In [None]:
# Write the submission file: article id plus the `is_fake` value predicted for
# the test rows.
test_fake[['id', 'is_fake']].to_csv(f'{project_path}/output/final/combined_proba_misc_and_pos_ner_hopt.csv', index=False)