In [1]:
cd ..

C:\Users\hunter\dev\sysfake


In [2]:
import multiprocessing
import functools
import itertools
import pickle
import glob
import sys
#from collections import deque

from tqdm import tqdm

import scipy
import pandas as pd
import numpy as np

from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, precision_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer

from waybackmachine import WaybackMachine

import feature_extraction as fe

In [3]:
sz = 100
CLASS_DICT = dict(zip(('real', 'fake', 'opinion', 'polarized', 'satire', 'promotional', 'correction'),
                      (1, 2, 3, 5, 7, 9, 11)))
features = ["url_ending_index", "from_reputable_source_index",
            "today_index", "grammar_index", "quotation_index",
            "past_tense_index", "present_tense_index", "should_index",
            "opinion_index", "all_caps_index", "from_satire_source_index",
            "exclamation_index", "apa_index", "name_source_index",
            "interjection_index", "you_index", "dot_gov_ending_index",
            "from_unreputable_source_index"]

In [4]:
try:
    with open('./data/all_urls.pickle', mode='rb') as filein:
        urls = pickle.load(filein)
except:
    urls = {}
    for fname in glob.glob('./data/*urls.txt'):
        print(fname)
        label = fname.split('_')[0].split('\\')[-1]
        urls_list = pd.read_csv(fname,
                                delim_whitespace=True,
                                header=None, encoding='utf-8').squeeze().to_list()
        urls.update({label: urls_list})
    with open('./data/all_urls.pickle', mode='wb') as fileout:
        pickle.dump(urls, fileout)

In [5]:
for label, url_list in urls.items():
    urls.update({label: [url if 'https://' in url or 'http://' in url else 'https://'+url for url in url_list]})

In [6]:
try:
    with open(f'./data/sub{sz!s}_vectors.pickle', mode='rb') as filein:
        vectors = pickle.load(filein)
except:
    urls_choice = {k:np.random.choice(v, size=sz) for k, v in urls.items()}
    vectors = {}
    for label, url_list in urls_choice.items():
        vector_list = [fe.ArticleVector(url=url)
                       for url in tqdm(url_list, desc=label)]
        vectors.update({label: vector_list})
    with open(f'./data/sub{sz!s}_vectors.pickle', mode='wb') as fileout:
        pickle.dump(obj=vectors, file=fileout)

In [7]:
texts = {}
for label, vector_list in vectors.items():
    text_list = [vector.text for vector in vector_list]
    texts.update({label: text_list})

In [8]:
canonical = pd.DataFrame([v.vector for v in list(itertools.chain.from_iterable(vectors.values()))])

In [9]:
texts_labelled = pd.melt(pd.DataFrame.from_dict(texts),
                         var_name='label', value_name='text')
texts_labelled = texts_labelled[texts_labelled['text']!='']
texts_labelled = texts_labelled.replace({'label': CLASS_DICT})

In [10]:
# TODO: huge hack
c1 = canonical[canonical.index.isin(texts_labelled[texts_labelled['text']!=''].index)].reset_index(drop=True)
c1.columns = features
c1['label'] = texts_labelled[texts_labelled['text']!='']['label'].reset_index(drop=True)
c1.to_csv('data/sub100_vectorrep_labeled.csv', index=False)

In [11]:
with open(f"./data/texts_labelled_sub{sz!s}.pickle", mode='wb') as fileout:
    pickle.dump(obj=texts_labelled, file=fileout)

In [12]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_rep = tfidf_vectorizer.fit_transform(texts_labelled['text'])
tfidf_can = scipy.sparse.hstack((tfidf_rep, c1))
y = texts_labelled['label']
with open('./data/tfidf_rep.pickle', mode='wb') as fileout:
    pickle.dump(tfidf_rep, fileout)

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 700, expected 654.

In [None]:
def precision_report_confusion(y_true, y_pred):
    print(classification_report(y_true, y_pred, zero_division=0))
    #sys.stdout.write(confusion_matrix(y_true, y_pred))
    return precision_score(y_true, y_pred, average='micro')

def train(estimator, x, y):
    scores = cross_val_score(estimator,
                             x, y,
                             scoring='precision_micro',
                             cv=StratifiedKFold(n_splits=10, shuffle=True),
                             verbose=3, n_jobs=-1)
    print(f"{scores!s}" + '\n' + f"Mean: {np.mean(scores)!s}")
    
def train2(estimator, x, y):
    scores = cross_val_score(estimator,
                             x, y,
                             scoring=make_scorer(precision_report_confusion),
                             cv=StratifiedKFold(n_splits=10, shuffle=True),
                             verbose=3)
    print(f"{scores!s}" + '\n' + f"Mean: {np.mean(scores)!s}")

In [None]:
print("Vanilla SVC")
print("TFIDF Only")
train2(SVC(), tfidf_rep, y)

In [None]:
print("Vanilla NuSVC")
print("TFIDF Only")
train2(NuSVC(), tfidf_rep, y)

In [None]:
print("Vanilla Logistic L2-Penalized SVM")
print("TFIDF Only")
train2(SGDClassifier(loss='log',
                     penalty='l2'),
       tfidf_rep, y)

In [None]:
print("Cosine Similarity Logistic L2-SVM")
print("TFIDF Only")
pipeline_steps = [('kernel', Nystroem(kernel='cosine', n_components=100)),
                  ('clf', SGDClassifier(loss='log', penalty='l2'))]
pl = Pipeline(pipeline_steps)
train2(pl, tfidf_rep, y)

In [None]:
print("Vanilla SVC")
print("TFIDF+Explication Features")
train2(SVC(), tfidf_can, y)

In [None]:
print("Vanilla Logistic L2-Penalized SVM")
print("TFIDF+Explication Features")
train2(SGDClassifier(loss='log', penalty='l2'), tfidf_can, y)

In [None]:
print("Cosine Similarity Logistic L2-Penalized SVM")
print("TFIDF+Explication Features")
pipeline_steps = [('kernel', Nystroem(kernel='cosine', n_components=600)),
                  ('clf', SGDClassifier(loss='log', penalty='l2'))]
pl = Pipeline(pipeline_steps)
train2(pl, tfidf_can, y)

In [None]:
print("Naive Bayes")
print("TFIDF+Explication Features")
train2(MultinomialNB(), tfidf_can, y)