## Import potrebných knižníc

In [1]:
# pandas
import pandas as pd

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer # web - https://scikit-learn.org/stable/index.html

# Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# KFold
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.model_selection import train_test_split

# metrics
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

# classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import MinMaxScaler # for multinomial
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# graphs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import time

import warnings
warnings.filterwarnings('always')  # "error", "ignore", "always", "default", "module" or "once"

# images for jupyter
from IPython.display import Image

2022-05-31 13:58:35.567091: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-31 13:58:35.567147: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
data = pd.read_csv('data/coref_news.csv', sep=';')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,without_coreference,with_coreference
0,0,fake,I woke up this morning to find a variation of ...,I woke up this morning to find a variation of ...
1,1,fake,Former President Bill Clinton and his Clinton ...,Former President Bill Clinton and Bill Clinton...
2,2,fake,After collapsing just before trying to step in...,After collapsing just before trying to step in...
3,3,fake,"Donald Trump is, well, deplorable. He’s sugges...","Donald Trump is, well, deplorable. Donald Trum..."
4,4,fake,Website is Down For Maintenance,Website is Down For Maintenance


In [4]:
data.tail()

Unnamed: 0.1,Unnamed: 0,label,without_coreference,with_coreference
400,402,real,"KALLSTADT, Germany — Few places in Germany are...","KALLSTADT, Germany — Few places in Germany are..."
401,403,real,Hollywood loses yet another one of their deare...,Hollywood loses yet another one of Hollywood d...
402,404,real,"As my 25th wedding anniversary approached, I t...","As my 25th wedding anniversary approached, I t..."
403,405,real,Story highlights Trump was sitting in a chair ...,Story highlights Trump was sitting in a chair ...
404,406,real,"Donald Trump Jr., a son of the Republican pres...","Donald Trump Jr., a son of the Republican pres..."


### Definovanie premenných s coreference a bez coreference¶

In [5]:
X_nocoref = data['without_coreference']
X_coref = data['with_coreference']
y = data['label']

### Vytvorenie nového dataframe-u pre export výsledkov z klasifikácie

In [6]:
global data_new
data_new = pd.DataFrame(columns=['Is Coref', 'Method', 'Accuracy', 'Precision Real', 'Precision Fake','Recall Real', 'Recall Fake', 'F1 Score Real', 'F1 Score Fake'])

In [7]:
def write_to_df(coref, method, acc, prec_r, prec_f, rec_r, rec_f, f1_r, f1_f):
    # dict = {'Is Coref': coref, 'Method': method, 'Accuracy' : acc, 'Precision Real' : prec_r, 'Precision Fake' : prec_f, 'Recall Real' : rec_r, 'Recall Fake' : rec_f, 'F1 Score Real' : f1_r, 'F1 Score Fake' : f1_f}
    global data_new
    # data_new = data_new.append(dict, ignore_index = True)
    # data_new = pd.concat(dict)
    data_new.loc[len(data_new.index)] = [coref, method, acc, prec_r, prec_f, rec_r, rec_f, f1_r, f1_f]

## Zadefinovanie modelov

### TFIDF

In [8]:
def tfidf_vectorizer(vmin, vmax):
    return TfidfVectorizer(ngram_range=(vmin,vmax))

### Doc2Vec

In [9]:
def doc2vec(x_data):
    tokenized_doc = []
    
    for d in x_data:
        tokenized_doc.append(word_tokenize(d.lower()))
    tokenized_doc
    
    tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
    tagged_data
    
    d2v_model = Doc2Vec(tagged_data, vector_size=300, window=4, min_count=1, workers=4, epochs = 100)

    d2v_model.save("test_doc2vec.model")

    d2v_model= Doc2Vec.load("test_doc2vec.model")
    
    return d2v_model

#### Príprava dát pre Doc2Vec

In [10]:
d2v_nocoref = doc2vec(X_nocoref)

In [11]:
d2v_coref = doc2vec(X_coref)

In [12]:
nocoref2vec = [d2v_nocoref.infer_vector((X_nocoref[i].split(' '))) for i in range(0, len(X_nocoref))]
nocoref2vec

[array([-1.2057829e-01,  3.4658191e-01, -3.7620220e+00, -1.0475286e+00,
        -2.8636187e-01, -9.0059084e-01, -2.6024508e+00,  1.3747258e+00,
        -6.8700463e-01,  3.4910605e+00,  5.9760410e-01,  6.6549009e-01,
        -2.8540115e+00,  6.9456190e-01, -2.8954213e+00, -6.5141952e-01,
         5.5052052e+00,  9.9264741e-01, -9.5667683e-02,  4.0162235e-01,
         1.5921623e+00, -3.9524364e-01,  5.2175760e-01,  1.5134126e+00,
         1.0244004e+00,  1.7810050e+00,  4.5501813e-01, -1.8555104e+00,
         2.0759149e+00, -4.2196431e+00,  2.2928231e+00, -2.9992580e-01,
         3.4078836e-02, -7.3909181e-01,  4.1968886e-02,  6.6540730e-01,
        -1.2384837e+00, -9.5332676e-01, -5.7383299e-01, -1.3945738e+00,
        -9.0758955e-01, -1.4410243e+00,  3.6925256e-02, -1.7279992e+00,
        -1.8484913e+00,  5.7484165e-02, -8.5212260e-01,  4.9597996e-01,
         1.5226219e+00, -2.9738200e+00,  1.2990508e+00,  5.0974816e-01,
        -4.0127730e-01, -5.6361008e-01,  7.5996846e-02,  1.68483

In [13]:
coref2vec = [d2v_coref.infer_vector((X_coref[i].split(' '))) for i in range(0, len(X_coref))]
coref2vec

[array([ 2.6846778 ,  1.7817116 , -2.3205886 ,  1.4064324 ,  2.0366633 ,
        -2.54251   , -1.8707505 ,  0.17300837, -2.6358125 ,  1.6872504 ,
         1.2673721 , -0.60917574,  0.6767339 ,  0.18638057,  0.45296946,
        -2.431122  ,  0.57737774, -0.6451759 , -0.15313825,  0.5832353 ,
         0.5458275 , -1.1773745 , -1.3972294 ,  0.361341  , -0.39142323,
        -1.1700065 , -1.6226546 ,  0.52419645,  2.1732454 , -1.2920394 ,
         1.3479246 , -2.2357507 ,  1.4906784 ,  0.33835018,  2.3634193 ,
        -1.1720873 , -0.99857616, -0.00527789,  3.2407317 , -1.9034619 ,
        -1.6763253 , -2.1187074 ,  3.559141  ,  1.2890054 , -0.00954694,
         0.70510876,  1.1133093 ,  2.0438192 , -2.5508215 ,  0.58960676,
        -1.7489533 ,  0.9500085 ,  0.7140332 ,  0.98848563,  0.7721362 ,
         0.4005294 , -2.0334733 , -0.90190935, -2.7855146 ,  2.492508  ,
        -1.4371022 ,  1.1551961 ,  0.21750756, -0.36540586, -1.6947687 ,
         0.00890225, -0.6461254 ,  0.27097243,  1.1

In [14]:
nocoref_data_frame = pd.DataFrame(nocoref2vec)
coref_data_frame = pd.DataFrame(coref2vec)

In [15]:
X_nocoref_np = np.array(nocoref_data_frame)
X_coref_np = np.array(coref_data_frame)
y = np.array(data.label)

## Metódy pre klasifikáciu

#### TFIDF  + Multiple classifiers

In [16]:
def tfidf_classifiers(X,y,vmin,vmax, is_coref):
    vectorizerTfidf = tfidf_vectorizer(vmin,vmax)
    
    kf = KFold(n_splits=10, shuffle = True)
    
    classifiers = [DecisionTreeClassifier(),
                   RandomForestClassifier(n_estimators=100),
                   LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000),
                   SGDClassifier(),
                   LinearSVC(),
                   KNeighborsClassifier(n_neighbors=5),
                   GradientBoostingClassifier(),
                   BernoulliNB(),
                   MultinomialNB()]
    
    for classifier in classifiers:
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            X_train_tfidf = vectorizerTfidf.fit_transform(X_train)
        
            clf = classifier
            clf = clf.fit(X_train_tfidf,y_train)
            y_pred = clf.predict(vectorizerTfidf.transform(X_test))
            
            write_to_df(is_coref, 'TFIDF + '  + str(classifier), accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, pos_label="real"), precision_score(y_test, y_pred, pos_label="fake"), recall_score(y_test, y_pred, pos_label="real"), recall_score(y_test, y_pred, pos_label="fake"), f1_score(y_test, y_pred, pos_label="real"), f1_score(y_test, y_pred, pos_label="fake"))
       
    print('Success!')

#### Doc2Vec + Multiple Classifiers

In [17]:
kf = KFold(n_splits=10, random_state=5, shuffle = True)

def doc2vec_classifiers(X,y,iscoref):
    classifiers = [DecisionTreeClassifier(), RandomForestClassifier(n_estimators=100), LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000), SGDClassifier(), LinearSVC(), KNeighborsClassifier(n_neighbors=5), GradientBoostingClassifier(), BernoulliNB()]

    for classifier in classifiers:
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
        
            clf = classifier
            clf = clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
    
            write_to_df(iscoref, 'Doc2Vec + ' + str(classifier), accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, pos_label="real"), precision_score(y_test, y_pred, pos_label="fake"), recall_score(y_test, y_pred, pos_label="real"), recall_score(y_test, y_pred, pos_label="fake"), f1_score(y_test, y_pred, pos_label="real"), f1_score(y_test, y_pred, pos_label="fake"))
    print('Success!')

Multinomial osobitne

In [18]:
kf = KFold(n_splits=10, random_state=5, shuffle = True)

def multinom(X,y, iscoref):
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
        clf = MultinomialNB()
        clf = clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        write_to_df(iscoref, 'Doc2Vec + ' + str(clf), accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, pos_label="real"), precision_score(y_test, y_pred, pos_label="fake"), recall_score(y_test, y_pred, pos_label="real"), recall_score(y_test, y_pred, pos_label="fake"), f1_score(y_test, y_pred, pos_label="real"), f1_score(y_test, y_pred, pos_label="fake"))
    print("success")

## Klasifikácia

### TFIDF

In [19]:
# povodny text bez coreference so zamenami
tfidf_classifiers(X_nocoref, y, 1,1,'No')

Success!


In [20]:
# coreference text bez zamen
tfidf_classifiers(X_coref, y, 1,1,'Yes')

Success!


### Doc2Vec

In [21]:
# povodny text bez coreference so zamenami
doc2vec_classifiers(X_nocoref_np,y,"No")



Success!


In [22]:
multinom(X_nocoref_np, y, "No")

success


In [23]:
# coreference text bez zamen
doc2vec_classifiers(X_coref_np,y,"Yes")



Success!


In [24]:
multinom(X_coref_np, y, "Yes")

success


In [25]:
data_new

Unnamed: 0,Is Coref,Method,Accuracy,Precision Real,Precision Fake,Recall Real,Recall Fake,F1 Score Real,F1 Score Fake
0,No,TFIDF + DecisionTreeClassifier(),0.658537,0.666667,0.652174,0.600000,0.714286,0.631579,0.681818
1,No,TFIDF + DecisionTreeClassifier(),0.731707,0.666667,0.823529,0.842105,0.636364,0.744186,0.717949
2,No,TFIDF + DecisionTreeClassifier(),0.707317,0.700000,0.714286,0.700000,0.714286,0.700000,0.714286
3,No,TFIDF + DecisionTreeClassifier(),0.829268,0.826087,0.833333,0.863636,0.789474,0.844444,0.810811
4,No,TFIDF + DecisionTreeClassifier(),0.829268,0.812500,0.840000,0.764706,0.875000,0.787879,0.857143
...,...,...,...,...,...,...,...,...,...
355,Yes,Doc2Vec + MultinomialNB(),0.900000,0.944444,0.863636,0.850000,0.950000,0.894737,0.904762
356,Yes,Doc2Vec + MultinomialNB(),0.775000,0.791667,0.750000,0.826087,0.705882,0.808511,0.727273
357,Yes,Doc2Vec + MultinomialNB(),0.750000,0.777778,0.741935,0.466667,0.920000,0.583333,0.821429
358,Yes,Doc2Vec + MultinomialNB(),0.675000,0.714286,0.653846,0.526316,0.809524,0.606061,0.723404


In [26]:
data_new.to_csv('data_new.csv', sep=';')