In [1]:
cd ..

C:\Users\hunter\dev\sysfake


In [2]:
import functools
import glob
import itertools
import multiprocessing
import os
import pickle
import sys
#from collections import deque

from tqdm import tqdm

import scipy
import pandas as pd
import numpy as np

from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, precision_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer

from waybackmachine import WaybackMachine

import feature_extraction as fe

In [3]:
# Number of URLs drawn per label when the working subset is sampled.
sz = 100

# Integer code substituted for each label name before modelling.
CLASS_DICT = {
    'real': 1,
    'fake': 2,
    'opinion': 3,
    'polarized': 5,
    'satire': 7,
    'promotional': 9,
    'correction': 11,
}

In [4]:
# Load the {label: [url, ...]} mapping from the pickle cache when it exists;
# otherwise rebuild it from the per-label "*urls.txt" files and write the cache.
# NOTE(security): pickle.load can execute arbitrary code -- only load cache
# files that were produced locally by this notebook.
try:
    with open('./data/all_urls.pickle', mode='rb') as filein:
        urls = pickle.load(filein)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Only a missing or unreadable cache triggers a rebuild; anything else
    # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
    urls = {}
    for fname in glob.glob('./data/*urls.txt'):
        print(fname)
        # The label is the file-name prefix before the first underscore.
        # basename() strips the directory on any platform, where the previous
        # split on a backslash only worked for Windows-style paths.
        label = os.path.basename(fname).split('_')[0]
        # sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True.
        # squeeze('columns') always yields a Series for a one-column file,
        # even when the file holds a single row.
        urls_list = pd.read_csv(fname,
                                sep=r'\s+',
                                header=None,
                                encoding='utf-8').squeeze('columns').to_list()
        urls[label] = urls_list
    with open('./data/all_urls.pickle', mode='wb') as fileout:
        pickle.dump(urls, fileout)

In [5]:
# Make sure every URL starts with an explicit scheme, defaulting to https.
# The check is a case-insensitive *prefix* test: a substring test would treat
# "example.com/r?u=http://other" as already having a scheme.
# Iterating over a snapshot keeps the in-place update of `urls` safe.
for label, url_list in list(urls.items()):
    urls[label] = [url if url.lower().startswith(('http://', 'https://'))
                   else 'https://' + url
                   for url in url_list]

In [6]:
# Load the sampled article vectors from the pickle cache, or build them by
# sampling `sz` URLs per label and constructing one fe.ArticleVector per URL.
# NOTE(security): pickle.load can execute arbitrary code -- only load cache
# files that were produced locally by this notebook.
SAMPLE_SEED = 0  # fixed seed so a rebuild draws the same URL subset

# The cache name follows `sz` (same convention as the texts_labelled pickle),
# so changing the sample size cannot silently reuse a stale cache.
vectors_cache_path = f"./data/sub{sz!s}_vectors.pickle"
try:
    with open(vectors_cache_path, mode='rb') as filein:
        vectors = pickle.load(filein)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Only a missing or unreadable cache triggers a rebuild; other errors
    # (including KeyboardInterrupt) propagate.
    rng = np.random.default_rng(SAMPLE_SEED)
    # Sample without replacement whenever a label has enough URLs, so the same
    # article cannot appear in both the train and test side of a CV split.
    # Fall back to sampling with replacement only when fewer than `sz` exist.
    urls_choice = {k: rng.choice(v, size=sz, replace=len(v) < sz)
                   for k, v in urls.items()}
    vectors = {}
    for label, url_list in urls_choice.items():
        # NOTE(review): ArticleVector presumably downloads and featurizes the
        # article at construction time -- confirm in feature_extraction.
        vector_list = [fe.ArticleVector(url=url)
                       for url in tqdm(url_list, desc=label)]
        vectors[label] = vector_list
    with open(vectors_cache_path, mode='wb') as fileout:
        pickle.dump(obj=vectors, file=fileout)

In [7]:
# Map each label to the list of `.text` values of its article vectors,
# preserving the order of `vectors`.
texts = {
    label: [article.text for article in articles]
    for label, articles in vectors.items()
}

In [8]:
# One row per article: the `.vector` attribute of every ArticleVector,
# flattened label by label in the iteration order of `vectors`.
canonical = pd.DataFrame([
    article.vector
    for articles in vectors.values()
    for article in articles
])

In [9]:
# Reshape the wide {label: texts} table into long form: one row per article
# with a 'label' column (the former column name) and a 'text' column.
texts_labelled = (
    pd.DataFrame.from_dict(texts)
    .melt(var_name='label', value_name='text')
)

In [10]:
# Swap label names in the 'label' column for their integer codes from
# CLASS_DICT; values without an entry are left as they are.
texts_labelled = texts_labelled.assign(
    label=texts_labelled['label'].replace(CLASS_DICT)
)

In [11]:
# Persist the labelled texts; the file name carries the sample size `sz`.
labelled_cache_path = f"./data/texts_labelled_sub{sz!s}.pickle"
with open(labelled_cache_path, mode='wb') as fileout:
    pickle.dump(texts_labelled, fileout)

In [12]:
# Target vector: the integer-coded labels.
y = texts_labelled['label']

# TF-IDF representation of the article texts.
# NOTE(review): the vectorizer is fitted on every text before any
# cross-validation split is made, so its vocabulary/IDF statistics also see
# the held-out folds.
tfidf_vectorizer = TfidfVectorizer()
tfidf_rep = tfidf_vectorizer.fit_transform(texts_labelled['text'])

# TF-IDF columns followed by the per-article `canonical` feature columns.
# Row alignment relies on both being built in the iteration order of `vectors`.
tfidf_can = scipy.sparse.hstack([tfidf_rep, canonical])

In [13]:
def precision_report_confusion(y_true, y_pred):
    """Print a classification report and return the micro-averaged precision.

    The report is printed with ``zero_division=0``. Despite the function's
    name, no confusion matrix is printed.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    The value of ``precision_score(y_true, y_pred, average='micro')``.
    """
    report = classification_report(y_true, y_pred, zero_division=0)
    print(report)
    micro_precision = precision_score(y_true, y_pred, average='micro')
    return micro_precision

def train(estimator, x, y, n_splits=10, random_state=None):
    """Cross-validate ``estimator`` and print per-fold and mean micro-precision.

    Uses shuffled stratified K-fold splitting and the built-in
    ``'precision_micro'`` scorer, with folds evaluated in parallel
    (``n_jobs=-1``).

    Parameters
    ----------
    estimator : estimator object
        Passed unchanged to ``cross_val_score``.
    x : feature matrix
    y : labels
    n_splits : int, default 10
        Number of stratified folds.
    random_state : int or None, default None
        Seed for the fold shuffle. ``None`` keeps the previous behaviour of a
        different shuffle on every call; pass an int for reproducible folds.

    Returns
    -------
    None. The scores are printed only.
    """
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=random_state)
    scores = cross_val_score(estimator,
                             x, y,
                             scoring='precision_micro',
                             cv=folds,
                             verbose=3, n_jobs=-1)
    print(f"{scores!s}" + '\n' + f"Mean: {np.mean(scores)!s}")
    
def train2(estimator, x, y, n_splits=10, random_state=None):
    """Cross-validate ``estimator``, printing a classification report per fold.

    Scoring goes through ``make_scorer(precision_report_confusion)``, so each
    fold prints its classification report and contributes its micro-averaged
    precision. ``n_jobs`` is not set here (unlike ``train``).

    Parameters
    ----------
    estimator : estimator object
        Passed unchanged to ``cross_val_score``.
    x : feature matrix
    y : labels
    n_splits : int, default 10
        Number of stratified folds.
    random_state : int or None, default None
        Seed for the fold shuffle. ``None`` keeps the previous behaviour of a
        different shuffle on every call; pass an int for reproducible folds.

    Returns
    -------
    None. The scores are printed only.
    """
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=random_state)
    scores = cross_val_score(estimator,
                             x, y,
                             scoring=make_scorer(precision_report_confusion),
                             cv=folds,
                             verbose=3)
    print(f"{scores!s}" + '\n' + f"Mean: {np.mean(scores)!s}")

In [14]:
# Experiment: SVC with default hyperparameters on the TF-IDF features alone.
print("Vanilla SVC", "TFIDF Only", sep='\n')
train2(estimator=SVC(), x=tfidf_rep, y=y)

Vanilla SVC
TFIDF Only
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           1       0.67      0.80      0.73        10
           2       0.50      0.30      0.37        10
           3       0.33      0.10      0.15        10
           5       0.33      0.40      0.36        10
           7       1.00      0.30      0.46        10
           9       1.00      0.50      0.67        10
          11       0.34      1.00      0.51        10

    accuracy                           0.49        70
   macro avg       0.60      0.49      0.47        70
weighted avg       0.60      0.49      0.47        70

[CV] .................................... , score=0.486, total=   0.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.50      0.50      0.50        10
           2       0.40      0.40      0.40        10
           3       0.25      0.10      0.14        10
           5       0.50      0.60      0.55        10
           7       1.00      0.20      0.33        10
           9       0.86      0.60      0.71        10
          11       0.32      0.80      0.46        10

    accuracy                           0.46        70
   macro avg       0.55      0.46      0.44        70
weighted avg       0.55      0.46      0.44        70

[CV] .................................... , score=0.457, total=   0.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.44      0.40      0.42        10
           2       0.78      0.70      0.74        10
           3       0.29      0.20      0.24        10
           5       0.43      0.30      0.35        10
           7       1.00      0.20      0.33        10
           9       1.00      0.40      0.57        10
          11       0.31      1.00      0.48        10

    accuracy                           0.46        70
   macro avg       0.61      0.46      0.45        70
weighted avg       0.61      0.46      0.45        70

[CV] .................................... , score=0.457, total=   0.8s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.33      0.40      0.36        10
           2       0.80      0.40      0.53        10
           3       0.40      0.40      0.40        10
           5       0.30      0.30      0.30 

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.3s finished


In [15]:
# Experiment: NuSVC with default hyperparameters on the TF-IDF features alone.
print("Vanilla NuSVC", "TFIDF Only", sep='\n')
train2(estimator=NuSVC(), x=tfidf_rep, y=y)

Vanilla NuSVC
TFIDF Only
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           1       0.40      0.40      0.40        10
           2       0.83      0.50      0.62        10
           3       0.38      0.30      0.33        10
           5       0.47      0.70      0.56        10
           7       1.00      0.40      0.57        10
           9       0.50      0.30      0.37        10
          11       0.43      0.90      0.58        10

    accuracy                           0.50        70
   macro avg       0.57      0.50      0.49        70
weighted avg       0.57      0.50      0.49        70

[CV] .................................... , score=0.500, total=   0.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.29      0.20      0.24        10
           2       0.67      0.60      0.63        10
           3       0.20      0.20      0.20        10
           5       0.33      0.40      0.36        10
           7       1.00      0.50      0.67        10
           9       0.80      0.80      0.80        10
          11       0.47      0.80      0.59        10

    accuracy                           0.50        70
   macro avg       0.54      0.50      0.50        70
weighted avg       0.54      0.50      0.50        70

[CV] .................................... , score=0.500, total=   0.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.33      0.70      0.45        10
           2       0.60      0.30      0.40        10
           3       0.20      0.20      0.20        10
           5       0.67      0.40      0.50        10
           7       1.00      0.40      0.57        10
           9       1.00      1.00      1.00        10
          11       0.50      0.70      0.58        10

    accuracy                           0.53        70
   macro avg       0.61      0.53      0.53        70
weighted avg       0.61      0.53      0.53        70

[CV] .................................... , score=0.529, total=   0.8s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.56      0.50      0.53        10
           2       0.33      0.20      0.25        10
           3       0.57      0.40      0.47        10
           5       0.36      0.50      0.42 

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.8s finished


In [16]:
# Experiment: SGD-trained linear classifier (log loss, L2 penalty) on the
# TF-IDF features alone.
# NOTE(review): loss='log' is scikit-learn's logistic-regression loss, so the
# "SVM" in the printed label looks like a misnomer; newer scikit-learn
# releases spell this loss 'log_loss' -- confirm against the installed version.
print("Vanilla Logistic L2-Penalized SVM", "TFIDF Only", sep='\n')
train2(estimator=SGDClassifier(penalty='l2', loss='log'),
       x=tfidf_rep,
       y=y)

Vanilla Logistic L2-Penalized SVM
TFIDF Only
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.30      0.30      0.30        10
           2       0.50      0.80      0.62        10
           3       0.50      0.30      0.37        10
           5       0.83      0.50      0.62        10
           7       0.67      0.40      0.50        10
           9       0.88      0.70      0.78        10
          11       0.39      0.70      0.50        10

    accuracy                           0.53        70
   macro avg       0.58      0.53      0.53        70
weighted avg       0.58      0.53      0.53        70

[CV] .................................... , score=0.529, total=   0.1s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.33      0.30      0.32        10
           2       0.78      0.70   

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.83      0.50      0.62        10
           2       0.56      0.50      0.53        10
           3       0.20      0.10      0.13        10
           5       0.38      0.50      0.43        10
           7       0.70      0.70      0.70        10
           9       0.67      0.80      0.73        10
          11       0.53      0.80      0.64        10

    accuracy                           0.56        70
   macro avg       0.55      0.56      0.54        70
weighted avg       0.55      0.56      0.54        70

[CV] .................................... , score=0.557, total=   0.1s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.25      0.30      0.27        10
           2       0.42      0.50      0.45        10
           3       0.33   

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [17]:
# Experiment: Nystroem cosine-kernel approximation (100 components) feeding an
# SGD classifier (log loss, L2 penalty), on the TF-IDF features alone.
print("Cosine Similarity Logistic L2-SVM", "TFIDF Only", sep='\n')
pipeline_steps = [
    ('kernel', Nystroem(kernel='cosine', n_components=100)),
    ('clf', SGDClassifier(loss='log', penalty='l2')),
]
pl = Pipeline(steps=pipeline_steps)
train2(estimator=pl, x=tfidf_rep, y=y)

Cosine Similarity Logistic L2-SVM
TFIDF Only
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        10
           2       1.00      0.10      0.18        10
           3       0.00      0.00      0.00        10
           5       0.24      0.70      0.36        10
           7       0.67      0.60      0.63        10
           9       0.60      0.60      0.60        10
          11       0.38      0.80      0.52        10

    accuracy                           0.40        70
   macro avg       0.41      0.40      0.33        70
weighted avg       0.41      0.40      0.33        70

[CV] .................................... , score=0.400, total=   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.50      0.50      0.50        10
           2       0.29      0.70      0.41        10
           3       1.00      0.10      0.18        10
           5       0.50      0.30      0.37        10
           7       0.83      0.50      0.62        10
           9       0.60      0.30      0.40        10
          11       0.44      0.80      0.57        10

    accuracy                           0.46        70
   macro avg       0.60      0.46      0.44        70
weighted avg       0.60      0.46      0.44        70

[CV] .................................... , score=0.457, total=   0.1s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.29      0.20      0.24        10
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00        10
           5       0.44      0.40      0.42 

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [18]:
# Experiment: SVC with default hyperparameters on TF-IDF stacked with the
# `canonical` feature columns.
print("Vanilla SVC", "TFIDF+Explication Features", sep='\n')
train2(estimator=SVC(), x=tfidf_can, y=y)

Vanilla SVC
TFIDF+Explication Features
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           1       0.31      0.80      0.44        10
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00        10
           5       0.21      0.70      0.33        10
           7       0.50      0.20      0.29        10
           9       0.00      0.00      0.00        10
          11       0.67      0.40      0.50        10

    accuracy                           0.30        70
   macro avg       0.24      0.30      0.22        70
weighted avg       0.24      0.30      0.22        70

[CV] .................................... , score=0.300, total=   0.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.39      0.90      0.55        10
           2       0.60      0.60      0.60        10
           3       0.50      0.10      0.17        10
           5       0.32      0.90      0.47        10
           7       0.50      0.10      0.17        10
           9       0.00      0.00      0.00        10
          11       0.50      0.20      0.29        10

    accuracy                           0.40        70
   macro avg       0.40      0.40      0.32        70
weighted avg       0.40      0.40      0.32        70

[CV] .................................... , score=0.400, total=   0.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.32      1.00      0.49        10
           2       0.50      0.10      0.17        10
           3       0.00      0.00      0.00        10
           5       0.27      0.70      0.39        10
           7       0.80      0.40      0.53        10
           9       0.40      0.20      0.27        10
          11       1.00      0.10      0.18        10

    accuracy                           0.36        70
   macro avg       0.47      0.36      0.29        70
weighted avg       0.47      0.36      0.29        70

[CV] .................................... , score=0.357, total=   0.8s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.23      0.70      0.34        10
           2       0.75      0.30      0.43        10
           3       0.00      0.00      0.00        10
           5       0.33      0.70      0.45 

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.9s finished


In [19]:
# Experiment: SGD-trained linear classifier (log loss, L2 penalty) on TF-IDF
# stacked with the `canonical` feature columns.
# NOTE(review): loss='log' is scikit-learn's logistic-regression loss, so the
# "SVM" in the printed label looks like a misnomer -- confirm.
print("Vanilla Logistic L2-Penalized SVM", "TFIDF+Explication Features", sep='\n')
train2(estimator=SGDClassifier(penalty='l2', loss='log'),
       x=tfidf_can,
       y=y)

Vanilla Logistic L2-Penalized SVM
TFIDF+Explication Features
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.57      0.40      0.47        10
           2       0.50      0.50      0.50        10
           3       0.35      0.60      0.44        10
           5       0.42      0.50      0.45        10
           7       0.58      0.70      0.64        10
           9       0.75      0.30      0.43        10
          11       0.75      0.60      0.67        10

    accuracy                           0.51        70
   macro avg       0.56      0.51      0.51        70
weighted avg       0.56      0.51      0.51        70

[CV] .................................... , score=0.514, total=   0.1s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.50      0.60      0.55        10
           2       1

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s



[CV] .................................... , score=0.586, total=   0.1s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.26      0.50      0.34        10
           2       0.54      0.70      0.61        10
           3       0.40      0.20      0.27        10
           5       0.45      0.50      0.48        10
           7       0.83      0.50      0.62        10
           9       0.64      0.70      0.67        10
          11       0.80      0.40      0.53        10

    accuracy                           0.50        70
   macro avg       0.56      0.50      0.50        70
weighted avg       0.56      0.50      0.50        70

[CV] .................................... , score=0.500, total=   0.1s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.53      0.80      0.64        10
         

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.8s finished


In [20]:
# Experiment: Nystroem cosine-kernel approximation (600 components) feeding an
# SGD classifier (log loss, L2 penalty), on TF-IDF stacked with the
# `canonical` feature columns.
print("Cosine Similarity Logistic L2-Penalized SVM", "TFIDF+Explication Features", sep='\n')
pipeline_steps = [
    ('kernel', Nystroem(kernel='cosine', n_components=600)),
    ('clf', SGDClassifier(loss='log', penalty='l2')),
]
pl = Pipeline(steps=pipeline_steps)
train2(estimator=pl, x=tfidf_can, y=y)

Cosine Similarity Logistic L2-Penalized SVM
TFIDF+Explication Features
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           1       0.31      0.80      0.44        10
           2       0.54      0.70      0.61        10
           3       0.67      0.20      0.31        10
           5       0.67      0.40      0.50        10
           7       0.42      0.50      0.45        10
           9       0.83      0.50      0.62        10
          11       0.50      0.20      0.29        10

    accuracy                           0.47        70
   macro avg       0.56      0.47      0.46        70
weighted avg       0.56      0.47      0.46        70

[CV] .................................... , score=0.471, total=   0.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.75      0.60      0.67        10
           2       0.56      0.50      0.53        10
           3       0.50      0.70      0.58        10
           5       0.86      0.60      0.71        10
           7       0.57      0.80      0.67        10
           9       1.00      0.60      0.75        10
          11       0.67      0.80      0.73        10

    accuracy                           0.66        70
   macro avg       0.70      0.66      0.66        70
weighted avg       0.70      0.66      0.66        70

[CV] .................................... , score=0.657, total=   0.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


              precision    recall  f1-score   support

           1       0.67      0.40      0.50        10
           2       0.55      0.60      0.57        10
           3       0.62      0.50      0.56        10
           5       0.38      0.50      0.43        10
           7       0.71      0.50      0.59        10
           9       0.78      0.70      0.74        10
          11       0.56      0.90      0.69        10

    accuracy                           0.59        70
   macro avg       0.61      0.59      0.58        70
weighted avg       0.61      0.59      0.58        70

[CV] .................................... , score=0.586, total=   0.4s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.33      0.40      0.36        10
           2       0.50      0.40      0.44        10
           3       0.50      0.10      0.17        10
           5       0.62      0.80      0.70 

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.3s finished


In [21]:
# Experiment: multinomial naive Bayes with default hyperparameters on TF-IDF
# stacked with the `canonical` feature columns.
print("Naive Bayes", "TFIDF+Explication Features", sep='\n')
train2(estimator=MultinomialNB(), x=tfidf_can, y=y)

Naive Bayes
TFIDF+Explication Features
[CV]  ................................................................
              precision    recall  f1-score   support

           1       1.00      0.40      0.57        10
           2       0.67      0.40      0.50        10
           3       0.40      0.40      0.40        10
           5       0.00      0.00      0.00        10
           7       1.00      0.40      0.57        10
           9       0.29      0.60      0.39        10
          11       0.36      0.90      0.51        10

    accuracy                           0.44        70
   macro avg       0.53      0.44      0.42        70
weighted avg       0.53      0.44      0.42        70

[CV] .................................... , score=0.443, total=   0.0s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.30      0.30      0.30        10
           2       0.43      0.30      0.3

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] .................................... , score=0.486, total=   0.0s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.62      0.50      0.56        10
           2       0.50      0.60      0.55        10
           3       0.33      0.30      0.32        10
           5       1.00      0.20      0.33        10
           7       1.00      0.30      0.46        10
           9       0.40      0.40      0.40        10
          11       0.38      1.00      0.56        10

    accuracy                           0.47        70
   macro avg       0.61      0.47      0.45        70
weighted avg       0.61      0.47      0.45        70

[CV] .................................... , score=0.471, total=   0.0s
[CV]  ................................................................
              precision    recall  f1-score   support

           1       0.18      0.20      0.19        10
         

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
