## Detekcia falošných správ zameraných na tému Covid-19 - vybalansovaná trénovacia množina, vymazané stop slová

In [1]:
# načítanie potrebných knižníc
import pandas as pd
import csv
import json
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer, word_tokenize
from bs4 import BeautifulSoup
import re
import nltk
import gensim
from gensim.models import Word2Vec
import os
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, Model, load_model
from keras.layers import Input, concatenate, Activation, Dense, Dropout, Flatten, LSTM, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, MaxPooling1D, SpatialDropout1D, GlobalAveragePooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,roc_auc_score

from sklearn.model_selection import GridSearchCV, KFold
from keras.wrappers.scikit_learn import KerasClassifier
import collections

Using TensorFlow backend.


In [2]:
# Load the Covid-19 news dataset from a CSV file in the working directory.
korona=pd.read_csv("detekcia_Covid19.csv")

In [3]:
# Remove duplicate articles: rows sharing the same `url` are collapsed,
# keeping the last occurrence.
korona=korona.drop_duplicates(subset='url', keep="last")

In [4]:
# Preview the first rows of the dataset.
korona.head()

Unnamed: 0,name,url,body,perex,published
187,redaktor maria miz,https://www.hlavnespravy.sk/ceska-vlada-predlz...,Praha 9. apríla 2020 (SITA/HSP/Foto:SITA/AP-Ef...,<p>Praha 9. apríla 2020 (SITA/HSP/Foto:SITA/AP...,2020-04-09T17:40:29.000000+0200
189,ta3.com,https://www.ta3.com/clanok/1190300/pracuju-aj-...,,"Veterinári, ošetrovatelia zvierat aj zamestnan...",2020-08-17T10:37:25.000000+0200
190,hnonline.sk,https://finweb.hnonline.sk/financie-a-burzy/21...,,Trhy sú naďalej znepokojené ekonomickými dôsle...,2020-03-11T19:37:00.000000+0100
191,ta3.com,https://www.ta3.com/clanok/1178311/taliansko-z...,,V Taliansku zatvoria v súvislosti s rýchlym ší...,2020-03-11T22:19:40.000000+0100
194,Martina Max,https://www.hlavnespravy.sk/naozaj-koronavirus...,Bratislava 12. apríla 2020 (HSP/Foto:Pixabay)\...,<p>Bratislava 12. apríla 2020 (HSP/Foto:Pixaba...,2020-04-12T12:31:47.000000+0200


In [5]:
# List the authors/sources (`name`) and how many articles each contributed.
korona.name.value_counts()

ta3.com                      1964
redaktor janka papcunova     1691
Martina Max                  1569
redaktor maria dutkova       1539
redaktor maria miz           1528
aktuality.sk                 1522
hnonline.sk                  1364
tomas zajaros                1172
sme.sk                        476
zemavek.sk                    313
hlavnespravy.sk               310
redaktor jaroslav             234
maria palastova               202
slobodnyvysielac.sk           176
Redakcia                       59
redakcia                       29
redaktor ivana                 21
admin                          19
redaktor ivana pl              10
redaktor zuzana                10
lenzena                         7
redaktor renata karbanova       5
bajecnezeny.sk                  2
Martin Kočiš                    2
::prop                          1
Verrny                          1
Name: name, dtype: int64

In [6]:
# Drop the sources considered irrelevant for this task.
excluded_names = ['sme.sk', 'redaktor renata karbanova']
korona = korona[~korona["name"].isin(excluded_names)]

# Target attribute: a row gets label 0 when its `name` matches any of the
# fragments below, otherwise label 1. The fragments are used as regular
# expressions by `str.contains`, so they are joined with `|` (alternation).
label_zero_fragments = [
    "ta3",
    "aktuality",
    "hnonline",
    "maria dutkova",
    "janka papcunova",
    "maria miz",
    "Martina Max",
    "tomas zajaros",
    "hlavnespravy.sk",
    "maria palastova",
    "ivana",
    "Martin Kočiš",
]
is_label_zero = korona["name"].str.contains("|".join(label_zero_fragments))
korona["label"] = np.where(is_label_zero, 0, 1)

In [7]:
# Class balance of the target attribute.
korona.label.value_counts()

0    12894
1      851
Name: label, dtype: int64

In [8]:
# Number of missing values per column.
korona.isna().sum()

name            0
url             0
body         3401
perex           8
published       0
label           0
dtype: int64

In [9]:
# Many rows have no `body`; for those rows the `perex` text is used as the
# body instead (rows that already have a body are left untouched).
korona["body"]=korona["body"].fillna(korona["perex"])

In [10]:
# Re-check the missing-value counts after filling `body` from `perex`.
korona.isna().sum()

name         0
url          0
body         5
perex        8
published    0
label        0
dtype: int64

In [11]:
# Drop every row that still has a missing value in any column.
korona=korona.dropna()

In [12]:
# Seed shared by the stochastic steps of the notebook.
SEED = 2000

# `x` keeps name, body and label (name and label are needed later for
# balancing); `y` holds only the label column.
feature_columns = ['name', 'body', 'label']
x = korona[feature_columns]
y = korona[['label']]

# 70:30 train/test split.
split_parts = train_test_split(x, y, test_size=0.3, random_state=SEED)
x_train, x_test, y_train, y_test = split_parts

In [13]:
# Class balance of the training split.
x_train.label.value_counts()

0    8984
1     631
Name: label, dtype: int64

In [14]:
# Class balance of the test split.
x_test.label.value_counts()

0    3902
1     220
Name: label, dtype: int64

In [15]:
# Balance the training set by randomly undersampling the label-0 authors.
# The per-author removal counts below add up to 8352 rows, which brings the
# 8984 label-0 training rows down to 632 (vs. 631 label-1 rows).
#
# A dedicated, seeded generator is used so that "Restart & Run All" always
# removes the same rows; the unseeded global `np.random.choice` produced a
# different training set (and different model results) on every run.
# NOTE: this cell is not idempotent - re-running it without re-running the
# split would try to remove the same number of rows again.
rows_to_remove_per_author = [
    ('ta3.com', 1136),
    ('Martina Max', 1060),
    ('redaktor maria miz', 998),
    ('redaktor maria dutkova', 1045),
    ('redaktor janka papcunova', 1137),
    ('maria palastova', 100),
    ('aktuality.sk', 990),
    ('hnonline.sk', 870),
    ('tomas zajaros', 821),
    ('hlavnespravy.sk', 195),
]

balance_rng = np.random.RandomState(SEED)
for author_name, n_remove in rows_to_remove_per_author:
    author_index = x_train[x_train['name'] == author_name].index
    # replace=False: every selected row is distinct; raises ValueError if the
    # author has fewer than `n_remove` rows in the training split.
    to_remove = balance_rng.choice(author_index, size=n_remove, replace=False)
    x_train = x_train.drop(to_remove)

In [16]:
# Rebuild y_train from the balanced x_train so both contain the same rows.
y_train=x_train[['label']]

In [17]:
# Class balance of x_train after undersampling.
x_train.label.value_counts()

0    632
1    631
Name: label, dtype: int64

In [18]:
# Class balance of y_train after undersampling (should match x_train).
y_train.label.value_counts()

0    632
1    631
Name: label, dtype: int64

In [19]:
# The author name is dropped from both splits: detection is meant to work
# from the article text alone.
x_train = x_train.drop('name', axis=1)
x_test = x_test.drop('name', axis=1)

In [20]:
# Persist the splits as CSV and read them back. The original index is written
# as an unnamed first column; after reloading it is dropped, so the reloaded
# frames carry a fresh 0..n-1 index.

# --- test split ---
x_test.to_csv('x_test_clanky_korona.csv', encoding='utf-8')
y_test.to_csv('y_test_label_korona.csv', encoding='utf-8')
x_test = pd.read_csv('x_test_clanky_korona.csv', encoding='utf-8').drop(columns=["Unnamed: 0"])
y_test = pd.read_csv('y_test_label_korona.csv', encoding='utf-8').drop(columns=["Unnamed: 0"])

# --- balanced training split (y_train is written but not reloaded) ---
x_train.to_csv('x_train_clanky_korona_balans.csv', encoding='utf-8')
y_train.to_csv('y_train_label_korona_balans.csv', encoding='utf-8')
x_train = pd.read_csv('x_train_clanky_korona_balans.csv', encoding='utf-8').drop(columns=["Unnamed: 0"])

In [21]:
# Tokenizer and regular expressions describing what is stripped from the text.
tok = WordPunctTokenizer()

pat1 = r'@[A-Za-z0-9_]+'  # @mentions
pat2 = r'https?://[^ ]+'  # http:// and https:// links
combined_pat = r'|'.join((pat1, pat2))

# The dot is escaped so only a literal "www." prefix matches; the unescaped
# form `www.` also matched "www" followed by any character.
www_pat = r'www\.[^ ]+'  # www links
# Was `http?://`, which makes the "p" optional (matching "htt://") rather than
# matching plain http links.
pat3 = r'http://[^ ]+'  # http links
combined_pat2 = r'|'.join((www_pat, pat3))

pat4 = r'img src=[^ ]+'  # image references
pat5 = r'\(SITA/[^)]*\)'  # parenthesised source credit starting with SITA/
combined_pat3 = r'|'.join((pat4, pat5))
pat6 = r'\(HSP/[^)]*\)'  # parenthesised source credit starting with HSP/
pat7 = r'\(TASR/[^)]*\)'  # parenthesised source credit starting with TASR/
combined_pat4 = r'|'.join((pat6, pat7))

In [22]:
def cleaner(text):
    """Return a cleaned, lower-cased, space-joined version of an article text.

    Steps: strip HTML with BeautifulSoup, replace a known mis-decoded byte
    sequence, remove mentions/links/image references/source credits using the
    module-level `combined_pat*` patterns, lower-case, replace every character
    that is not a (Latin, incl. accented) letter with a space, tokenize and
    drop tokens of length 1.

    Non-string input (e.g. NaN produced when an empty field is read back from
    CSV) yields an empty string instead of raising inside BeautifulSoup.
    """
    if not isinstance(text, str):
        return ""

    # Decode HTML into plain text.
    souped = BeautifulSoup(text, 'lxml').get_text()
    # Repair the marker left behind by a badly decoded character. str.replace
    # cannot fail on a str, so no try/except is needed here.
    bom_removed = souped.replace("ï¿½", "?")

    stripped = bom_removed
    for pattern in (combined_pat, combined_pat2, combined_pat3, combined_pat4):
        stripped = re.sub(pattern, '', stripped)

    lower_case = stripped.lower()
    # Raw string: the pattern contains a backslash escape (`\Æ`) that is not a
    # valid Python string escape; the regex received by `re` is unchanged.
    letters_only = re.sub(r"[^a-zA-Z\ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ]", " ", lower_case)
    # Keep only tokens longer than one character.
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

In [23]:
# Stop-word list used when tokenizing: mostly Slovak function words, plus the
# agency tag "tasr" and a few English words ("and", "for", "of", "the").
stop_words = frozenset(["tasr","a","aby","aj","ak","ako","ale","alebo","and","ani","áno","asi","až","bez","bude","budem","budeš","budeme","budete","budú","by","bol","bola","boli","bolo","byť","cez","čo","či","ďalší","ďalšia","ďalšie","dnes","do","ho","ešte","for","i","ja","je","jeho","jej","ich","iba","iné","iný","som","si","sme","sú","k","kam","každý","každá","každé","každí","kde","keď","kto","ktorá","ktoré","ktorou","ktorý","ktorí","ku","lebo","len","ma","mať","má","máte","medzi","mi","mna","mne","mnou","musieť","môcť","môj","môže","my","na","nad","nám","náš","naši","nie","nech","než","nič","niektorý","nové","nový","nová","noví","o","od","odo","of","on","ona","ono","oni","ony","po","pod","podľa","pokiaľ","potom","práve","pre","prečo","preto","pretože","prvý","prvá","prvé","prví","pred","predo","pri","pýta","s","sa","so","svoje","svoj","svojich","svojím","svojími","ta","tak","takže","táto","teda","te","tě","ten","tento","the","tieto","tým","týmto","tiež","to","toto","toho","tohoto","tom","tomto","tomuto","tu","tú","túto","tvoj","ty","tvojími","už","v","vám","váš","vaše","vo","viac","však","všetok","vy","z","za","zo","že","buď","ju","menej","moja","moje","späť","ste","tá","tam"])

In [24]:
# Run `cleaner` over every article body of the training and the test split.
testing_train = x_train["body"][:len(x_train)]
test_result_train = [cleaner(article) for article in testing_train]

testing_test = x_test["body"][:len(x_test)]
test_result_test = [cleaner(article) for article in testing_test]

In [25]:
# Wrap the cleaned texts in DataFrames and attach the labels. The label column
# is aligned by index with x_train / x_test.
clean_train = pd.DataFrame({'body': test_result_train})
clean_train['label'] = x_train['label']

clean_test = pd.DataFrame({'body': test_result_test})
clean_test['label'] = x_test['label']

In [26]:
# Save the cleaned text to CSV and read it back, using the first CSV column
# as the index.
csv_train = 'clean_train_korona_balans.csv'
csv_test = 'clean_test_korona.csv'

clean_train.to_csv(csv_train, encoding='utf-8')
df_train = pd.read_csv(csv_train, index_col=0)

clean_test.to_csv(csv_test, encoding='utf-8')
df_test = pd.read_csv(csv_test, index_col=0)

In [27]:
# Remove rows with missing values and renumber the index from 0.
df_train = df_train.dropna().reset_index(drop=True)
df_test = df_test.dropna().reset_index(drop=True)

In [28]:
# Preview the cleaned training data.
df_train.head()

Unnamed: 0,body,label
0,zatiaľ čo európu paralyzuje koronavírus stále ...,0
1,peking februára čína redukuje clá na dovoz usa...,0
2,lausanne marca prezident európskej atletickej ...,0
3,brusel mája európska únia vyjadrila podporu sv...,0
4,bratislava apríla redaktorka denníka sme býval...,1


In [29]:
# Class balance of the cleaned training data.
df_train.label.value_counts()

0    632
1    631
Name: label, dtype: int64

In [30]:
# Class balance of the cleaned test data.
df_test.label.value_counts()

0    3901
1     220
Name: label, dtype: int64

In [31]:
# Tokenize every document of the training and the test set and drop the
# stop words; the result is one list of tokens per document.
df_token_train = df_train['body'].values.tolist()
token_train = [
    [token for token in nltk.word_tokenize(document) if token not in stop_words]
    for document in df_token_train
]

df_token_test = df_test['body'].values.tolist()
token_test = [
    [token for token in nltk.word_tokenize(document) if token not in stop_words]
    for document in df_token_test
]

In [32]:
# Word embeddings: train Word2Vec on the training tokens only, export the
# vectors in the textual word2vec format and load them into a dict
# {word: vector}.
model = Word2Vec(token_train, min_count = 1)
vocabulary = model.wv.vocab

name = 'w2v.txt'
model.wv.save_word2vec_format(name, binary = False)

embeddings_index = {}
# `with` guarantees the file is closed even if parsing fails.
with open(name, encoding = "utf-8") as f:
    # The first line of the textual word2vec format is a
    # "<vocab_size> <vector_size>" header, not a word vector - skip it.
    next(f, None)
    for line in f:
        values = line.split()
        if len(values) < 2:
            continue  # blank or malformed line
        # Parse the coefficients as numbers instead of keeping them as strings.
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [33]:
# Length every sequence is padded/truncated to.
max_length = 3700

# Fit the Keras tokenizer on the training tokens only, then map both sets to
# integer sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_train)
word_index = tokenizer.word_index
print('Najdenych %s jedinecnych tokenov.' %len (word_index))

seq_train = tokenizer.texts_to_sequences(token_train)
seq_test = tokenizer.texts_to_sequences(token_test)

train_padding = pad_sequences(seq_train, maxlen=max_length)
test_padding = pad_sequences(seq_test, maxlen=max_length)

Najdenych 49667 jedinecnych tokenov.


In [34]:
# Build the weight matrix for the Embedding layers: row i holds the Word2Vec
# vector of the word with tokenizer index i. Row 0 and rows of words without
# a vector stay zero.
EMBEDDING_DIM = 100
num_words = len(word_index) + 1  # +1 because tokenizer indices start at 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word_train, ii in word_index.items():
    # Valid rows are 0..num_words-1, so the guard must be `>=`; with `>` an
    # index equal to num_words would slip through and overflow the matrix.
    if ii >= num_words:
        continue
    embedding_vector = embeddings_index.get(word_train)
    if embedding_vector is not None:
        embedding_matrix[ii] = embedding_vector
print(num_words)

49668


In [35]:
# Take the labels from the cleaned frames (as NumPy arrays) so they line up
# row-for-row with the padded sequences built from the same frames.
y_train = df_train['label'].values
y_test = df_test['label'].values

In [36]:
# CNN architecture: embedding (initialised from the Word2Vec matrix) ->
# 1-D convolution -> global max pooling -> dense -> sigmoid output.
# NOTE: `x` is rebound here to a Keras tensor; it no longer refers to the
# feature DataFrame created before the train/test split.
inputs = Input(shape=(max_length,))
x = Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix])(inputs)
x = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(256, activation='relu')(x)
# Single sigmoid unit: binary classification.
output = Dense(1, activation='sigmoid')(x)
model_cnn = Model(inputs=inputs, outputs=output)
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print(model_cnn.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3700)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 3700, 100)         4966800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 3699, 100)         20100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               25856     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 5,013,013
Trainable params: 5,013,013
Non-trainable params: 0
_________________________________________________

In [None]:
# Checkpoint callback for the CNN: monitors validation accuracy (`val_acc`)
# and, with save_best_only=True / mode='max', only writes the file when that
# value improves.
saved_model = "model_cnn_korona_balansbezstop.hdf5"
checkpoint = ModelCheckpoint(saved_model, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [None]:
# Train the CNN for 5 epochs; 10% of the training data is held out for
# validation and the checkpoint callback saves the best epoch.
history = model_cnn.fit(train_padding, y_train, epochs=5, batch_size=32, validation_split=0.1, callbacks=[checkpoint])

In [None]:
# Load the checkpointed CNN from disk.
model_cnn = load_model('model_cnn_korona_balansbezstop.hdf5')

# Predict on the test data and report ROC AUC, accuracy, the classification
# report (precision, recall, F1, ...) and the confusion matrix.
y_cnn = model_cnn.predict(test_padding)
auc_value = roc_auc_score(y_test, y_cnn)
print('Roc auc score is {}'.format(auc_value))

# Threshold the predicted probabilities at 0.5 (strictly greater -> class 1).
y_int = (y_cnn > 0.5).astype(y_cnn.dtype)
accuracy_value = accuracy_score(y_test, y_int)
print('Accuracy is {}'.format(accuracy_value))

print(classification_report(y_test, y_int, zero_division=0))
print(confusion_matrix(y_test, y_int))

|              | Precision | Recall | F1-score | Support |
|:------------:|:---------:|:------:|:--------:|:-------:|
|       0      |    0.98   |  0.82  |   0.89   |   3901  |
|       1      |    0.19   |  0.76  |   0.30   |   220   |
|              |           |        |          |         |
|   Accuracy   |           |        |   0.81   |   4121  |
|   Macro avg  |    0.59   |  0.79  |   0.60   |   4121  |
| Weighted avg |    0.94   |  0.81  |   0.86   |   4121  |

Accuracy : 0.813637

ROC : 0.863783

| Actual/Predicted | Fake news | True news |
|------------------|-----------|-----------|
| Fake news        |   TP-167  |   FN-53   |
| True news        |   FP-715  |  TN-3186  |

In [37]:
# LSTM architecture: embedding (initialised from the Word2Vec matrix) ->
# LSTM(128) -> dropout -> dense -> sigmoid output.
inputs = Input(shape=(max_length,))
x = Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix])(inputs)
x = LSTM(128)(x)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
# Single sigmoid unit: binary classification.
output = Dense(1, activation='sigmoid')(x)
model_lstm = Model(inputs=inputs, outputs=output)
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print(model_lstm.summary())

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 3700)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 3700, 100)         4966800   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 5,100,689
Trainable params: 5,100,689
Non-trainable params: 0
_________________________________________________

In [None]:
# Checkpoint callback for the LSTM: monitors validation accuracy (`val_acc`)
# and only writes the file when that value improves.
saved_model = "model_lstm_korona_balansbezstop.hdf5"
checkpoint = ModelCheckpoint(saved_model, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [None]:
# Train the LSTM for 5 epochs; 10% of the training data is held out for
# validation and the checkpoint callback saves the best epoch.
history = model_lstm.fit(train_padding, y_train, epochs=5, batch_size=32, validation_split=0.1, callbacks=[checkpoint])

In [None]:
# Load the checkpointed LSTM from disk.
model_lstm = load_model('model_lstm_korona_balansbezstop.hdf5')

# Predict on the test data and report ROC AUC, accuracy, the classification
# report (precision, recall, F1, ...) and the confusion matrix.
# NOTE: the variable is still called `y_cnn`, but it holds the LSTM predictions.
y_cnn = model_lstm.predict(test_padding)
auc_value = roc_auc_score(y_test, y_cnn)
print('Roc auc score is {}'.format(auc_value))

# Threshold the predicted probabilities at 0.5 (strictly greater -> class 1).
y_int = (y_cnn > 0.5).astype(y_cnn.dtype)
accuracy_value = accuracy_score(y_test, y_int)
print('Accuracy is {}'.format(accuracy_value))

print(classification_report(y_test, y_int, zero_division=0))
print(confusion_matrix(y_test, y_int))

|              | Precision | Recall | F1-score | Support |
|:------------:|:---------:|:------:|:--------:|:-------:|
|       0      |    0.99   |  0.82  |   0.90   |   3901  |
|       1      |    0.22   |  0.89  |   0.35   |   220   |
|              |           |        |          |         |
|   Accuracy   |           |        |   0.83   |   4121  |
|   Macro avg  |    0.61   |  0.85  |   0.63   |   4121  |
| Weighted avg |    0.95   |  0.83  |   0.87   |   4121  |

Accuracy : 0.826256

ROC : 0.930680

| Actual/Predicted | Fake news | True news |
|------------------|-----------|-----------|
| Fake news        |   TP-195  |   FN-25   |
| True news        |   FP-691  |  TN-3210  |

In [38]:
# Bi-LSTM + CNN architecture: embedding (initialised from the Word2Vec
# matrix) -> spatial dropout -> bidirectional LSTM returning the full
# sequence -> 1-D convolution -> global max pooling -> dense -> dropout ->
# sigmoid output.
inputs = Input(shape=(max_length,))
x = Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix])(inputs)
x = SpatialDropout1D(0.2)(x)
# return_sequences=True keeps one output per time step so the convolution
# below can slide over the sequence.
x = Bidirectional(LSTM(64, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x = Conv1D(32, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
x = GlobalMaxPooling1D()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
# Single sigmoid unit: binary classification.
output = Dense(1, activation='sigmoid')(x)
model_bilstm = Model(inputs=inputs, outputs=output)
model_bilstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print(model_bilstm.summary())

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 3700)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 3700, 100)         4966800   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 3700, 100)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 3700, 128)         84480     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 3698, 32)          12320     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                2112

In [None]:
# Checkpoint callback for the Bi-LSTM + CNN: monitors validation accuracy
# (`val_acc`) and only writes the file when that value improves.
saved_model = "model_bilstm_korona_balansbezstop.hdf5"
checkpoint = ModelCheckpoint(saved_model, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [None]:
# Train the Bi-LSTM + CNN for 5 epochs; 10% of the training data is held out
# for validation and the checkpoint callback saves the best epoch.
history = model_bilstm.fit(train_padding, y_train, epochs=5, batch_size=32, validation_split=0.1, callbacks=[checkpoint])

In [None]:
# Load the checkpointed Bi-LSTM + CNN from disk.
model_bilstm = load_model('model_bilstm_korona_balansbezstop.hdf5')

# Predict on the test data and report ROC AUC, accuracy, the classification
# report (precision, recall, F1, ...) and the confusion matrix.
# NOTE: the variable is still called `y_cnn`, but it holds the Bi-LSTM + CNN
# predictions.
y_cnn = model_bilstm.predict(test_padding)
auc_value = roc_auc_score(y_test, y_cnn)
print('Roc auc score is {}'.format(auc_value))

# Threshold the predicted probabilities at 0.5 (strictly greater -> class 1).
y_int = (y_cnn > 0.5).astype(y_cnn.dtype)
accuracy_value = accuracy_score(y_test, y_int)
print('Accuracy is {}'.format(accuracy_value))

print(classification_report(y_test, y_int, zero_division=0))
print(confusion_matrix(y_test, y_int))

|              | Precision | Recall | F1-score | Support |
|:------------:|:---------:|:------:|:--------:|:-------:|
|       0      |    0.99   |  0.97  |   0.98   |   3901  |
|       1      |    0.63   |  0.79  |   0.70   |   220   |
|              |           |        |          |         |
|   Accuracy   |           |        |   0.96   |   4121  |
|   Macro avg  |    0.81   |  0.88  |   0.84   |   4121  |
| Weighted avg |    0.97   |  0.96  |   0.97   |   4121  |

Accuracy : 0.963601

ROC : 0.936874

| Actual/Predicted | Fake news | True news |
|------------------|-----------|-----------|
| Fake news        |   TP-173  |   FN-47   |
| True news        |   FP-103  |  TN-3798  |