In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import os
from tqdm.notebook import tqdm
import numpy as np
import classifier_unit_test
import nltk
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import pickle

In [3]:
def load_encodings(path, train_or_test="train"):
    """Load the manually engineered encodings and their labels for one split.

    Reads ``<path>/<train_or_test>X.npy`` (features) and
    ``<path>/<train_or_test>Y.npy`` (labels). Both are loaded with
    ``allow_pickle=True`` and unwrapped with ``[()]``, so a pickled object
    stored in a 0-d array comes back as the object itself.

    Parameters
    ----------
    path : str
        Directory containing the two ``.npy`` files.
    train_or_test : str, default "train"
        File-name prefix selecting the split.

    Returns
    -------
    tuple
        ``(encodings, target)``. NaN and +/-Inf values in the encodings are
        replaced using the ``np.nan_to_num`` defaults.

    Raises
    ------
    FileNotFoundError
        If either expected file is missing from ``path``.
    """
    features_file = os.path.join(path, f"{train_or_test}X.npy")
    labels_file = os.path.join(path, f"{train_or_test}Y.npy")

    # Fail early with a clear message instead of an UnboundLocalError later on.
    for required_file in (features_file, labels_file):
        if not os.path.isfile(required_file):
            raise FileNotFoundError(f"Expected encoding file not found: {required_file}")

    # NOTE(review): allow_pickle=True can execute arbitrary code while loading;
    # only point this at files you created yourself.
    encodings = np.load(features_file, allow_pickle=True)[()]
    target = np.load(labels_file, allow_pickle=True)[()]

    if isinstance(encodings, np.ndarray):
        # Dense input: assigning to ndarray.data is deprecated, so clean a copy.
        encodings = np.nan_to_num(encodings)
    else:
        # Presumably a SciPy sparse matrix (callers use .toarray()): only the
        # explicitly stored values live in .data.
        encodings.data = np.nan_to_num(encodings.data)  # Remove NaN and Inf

    return encodings, target

In [3]:
def load_autoencodings(path, train_or_test="train"):
    """Load the automatically generated encodings and their labels for one split.

    Reads ``<path>/<train_or_test>_X.npy`` (features) and
    ``<path>/<train_or_test>_y.npy`` (labels). Both are loaded with
    ``allow_pickle=True`` and unwrapped with ``[()]``.

    Parameters
    ----------
    path : str
        Directory containing the two ``.npy`` files.
    train_or_test : str, default "train"
        File-name prefix selecting the split.

    Returns
    -------
    tuple
        ``(encodings, target)``. NaN and +/-Inf values in the encodings are
        replaced using the ``np.nan_to_num`` defaults.

    Raises
    ------
    FileNotFoundError
        If either expected file is missing from ``path``.
    """
    features_file = os.path.join(path, f"{train_or_test}_X.npy")
    labels_file = os.path.join(path, f"{train_or_test}_y.npy")

    # Fail early with a clear message instead of an UnboundLocalError later on.
    for required_file in (features_file, labels_file):
        if not os.path.isfile(required_file):
            raise FileNotFoundError(f"Expected encoding file not found: {required_file}")

    # NOTE(review): allow_pickle=True can execute arbitrary code while loading;
    # only point this at files you created yourself.
    encodings = np.load(features_file, allow_pickle=True)[()]
    target = np.load(labels_file, allow_pickle=True)[()]

    if isinstance(encodings, np.ndarray):
        # Assigning to ndarray.data is deprecated and triggers a warning, so
        # return a cleaned copy instead of swapping the buffer in place.
        encodings = np.nan_to_num(encodings)
    else:
        # Objects exposing their stored values through .data (e.g. sparse
        # matrices) are still cleaned in place.
        encodings.data = np.nan_to_num(encodings.data)  # Remove NaN and Inf

    return encodings, target

In [4]:
def classify(train_X, train_y, test_X, test_y, clf="Random Forest", model_name="Random Forest, Authorship Attribution, manual encodings", 
            give_roc=True, give_importances=False):
    """Build the requested classifier and evaluate it with ``classifier_unit_test``.

    Parameters
    ----------
    train_X, train_y : training features and labels.
    test_X, test_y : held-out features and labels.
    clf : str, default "Random Forest"
        One of "Random Forest", "K Nearest Neighbor" or "Logistic Regression".
    model_name : str
        Label forwarded unchanged to ``test_classifier``.
    give_roc : bool, default True
        Forwarded to ``test_classifier`` as ``give_roc``.
    give_importances : bool, default False
        Forwarded to ``test_classifier`` as ``give_importance``.

    Returns
    -------
    Whatever ``classifier_unit_test.test_classifier`` returns.

    Raises
    ------
    ValueError
        If ``clf`` is not one of the supported classifier names.
    """
    # Lambdas defer construction so only the requested estimator is built.
    classifier_factories = {
        "Random Forest": lambda: RandomForestClassifier(random_state=42, n_jobs=-1, verbose=1),
        "K Nearest Neighbor": lambda: KNeighborsClassifier(),
        "Logistic Regression": lambda: LogisticRegression(random_state=42, n_jobs=-1, verbose=1),
    }

    if clf not in classifier_factories:
        # Previously an unknown name fell through to an UnboundLocalError.
        raise ValueError(
            f"Unknown clf {clf!r}; expected one of {sorted(classifier_factories)}"
        )

    estimator = classifier_factories[clf]()

    # test_classifier takes both feature sets first, then both label sets.
    return classifier_unit_test.test_classifier(estimator, train_X, test_X, train_y, test_y,
                                                give_roc=give_roc, give_importance=give_importances, model_name=model_name)


# Authorship attribution

This section performs authorship attribution based on our manual encodings, the automatic encodings produced by Simple Transformers, and finally a combination of the manual and automatic encodings.

## Manual encodings

In [73]:
# Manual authorship encodings: one loader call per split.
authors_train_X, authors_train_y = load_encodings(
    "Data/additional/xtra_features_author", "train"
)
authors_test_X, authors_test_y = load_encodings(
    "Data/additional/xtra_features_author", "test"
)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [74]:
# Display the shape of the manual authorship training encodings.
authors_train_X.shape

(89956, 10180)

## Automatic encodings - Simple Transformers

In [75]:
# Automatic authorship encodings: one loader call per split.
authors_train_X_auto, authors_train_y_auto = load_autoencodings(
    "auto_encodings/xtra_author_encodings", "train"
)
authors_test_X_auto, authors_test_y_auto = load_autoencodings(
    "auto_encodings/xtra_author_encodings", "test"
)

  0%|          | 0/4 [00:00<?, ?it/s]

  encodings.data = np.nan_to_num(encodings.data) #Remove NaN and Inf


  0%|          | 0/4 [00:00<?, ?it/s]

## Combined encodings

In [76]:
# Put the densified manual features and the automatic features side by side
# (column-wise), so every row keeps both feature sets.
combined_authors_train = np.concatenate(
    (authors_train_X.toarray(), authors_train_X_auto), axis=1
)
combined_authors_test = np.concatenate(
    (authors_test_X.toarray(), authors_test_X_auto), axis=1
)

In [77]:
# Display the shapes of the combined authorship matrices (train, test).
combined_authors_train.shape, combined_authors_test.shape 

((89956, 10948), (22489, 10948))

## Authorship Attribution overview

In [78]:
%%time
    
combined_RF_output = classify(combined_authors_train, authors_train_y, combined_authors_test, authors_test_y, clf="Random Forest", model_name="Random Forest, authorship attribution, combined encodings")


Training model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 63 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 out of 100 | elapsed: 46.8min remaining: 14.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 52.5min finished
[Parallel(n_jobs=63)]: Using backend ThreadingBackend with 63 concurrent workers.
[Parallel(n_jobs=63)]: Done  76 out of 100 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=63)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=63)]: Using backend ThreadingBackend with 63 concurrent workers.
[Parallel(n_jobs=63)]: Done  76 out of 100 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=63)]: Done 100 out of 100 | elapsed:    0.6s finished


Training dummy...
----------
Correctly predicted 6031 of 22489	Dummy: 291 of 22489	:-)
Accuracy: 0.27			Dummy: 0.01	:-)
Balanced accuracy: 0.26		Dummy: 0.01	:-)
TOP 5 accuracy: 0.48		Dummy: 0.02	:-)
F1 score: 0.23			Dummy: 0.00	:-)
Cohen's Kappa: 0.26		Dummy: 0.00	:-)
ROC AUC: 0.83			Dummy: 0.50	:-)
----------
ROC AUC control for RandomForestClassifier(n_jobs=-1, random_state=42, verbose=1): 0.85
ROC AUC control for Dummy: 0.50
Process time: 0:52:36.011785
CPU times: user 1d 19h 21min 40s, sys: 49.5 s, total: 1d 19h 22min 30s
Wall time: 52min 36s


In [79]:
# Persist the authorship-attribution result object to disk.
with open("xtra_AA_combined_RF_classsification.dat", "wb") as results_handle:
    pickle.dump(combined_RF_output, results_handle)
    


# Newspaper Attribution

This section performs newspaper attribution based on our manual encodings, the automatic encodings produced by Simple Transformers, and finally a combination of the manual and automatic encodings.

## Manual encodings

In [64]:
# Manual newspaper (domain) encodings: one loader call per split.
domains_train_X, domains_train_y = load_encodings(
    "Data/additional/xtra_features_domain", "train"
)
domains_test_X, domains_test_y = load_encodings(
    "Data/additional/xtra_features_domain", "test"
)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [65]:
# Display the shape of the manual newspaper training encodings.
domains_train_X.shape

(94572, 10368)

## Automatic encodings - Simple Transformers

In [68]:
# Automatic newspaper (domain) encodings: one loader call per split.
domains_train_X_auto, domains_train_y_auto = load_autoencodings(
    "auto_encodings/xtra_domain_encodings", "train"
)
domains_test_X_auto, domains_test_y_auto = load_autoencodings(
    "auto_encodings/xtra_domain_encodings", "test"
)

  0%|          | 0/4 [00:00<?, ?it/s]

  encodings.data = np.nan_to_num(encodings.data) #Remove NaN and Inf


  0%|          | 0/4 [00:00<?, ?it/s]

## Combined encodings

In [69]:
# Put the densified manual features and the automatic features side by side
# (column-wise), so every row keeps both feature sets.
combined_domains_train = np.concatenate(
    (domains_train_X.toarray(), domains_train_X_auto), axis=1
)
combined_domains_test = np.concatenate(
    (domains_test_X.toarray(), domains_test_X_auto), axis=1
)

In [70]:
# Display the shapes of the combined newspaper matrices (train, test).
combined_domains_train.shape, combined_domains_test.shape

((94572, 11136), (23644, 11136))

## Newspaper Attribution overview

In [71]:
%%time
   
combined_RF_output = classify(combined_domains_train, domains_train_y, combined_domains_test, domains_test_y, clf="Random Forest", model_name="Random Forest, newspaper attribution, combined encodings")

Training model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 63 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 out of 100 | elapsed: 29.9min remaining:  9.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 34.9min finished
[Parallel(n_jobs=63)]: Using backend ThreadingBackend with 63 concurrent workers.
[Parallel(n_jobs=63)]: Done  76 out of 100 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=63)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=63)]: Using backend ThreadingBackend with 63 concurrent workers.
[Parallel(n_jobs=63)]: Done  76 out of 100 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=63)]: Done 100 out of 100 | elapsed:    0.2s finished


Training dummy...
----------
Correctly predicted 17825 of 23644	Dummy: 1211 of 23644	:-)
Accuracy: 0.75			Dummy: 0.05	:-)
Balanced accuracy: 0.73		Dummy: 0.03	:-)
TOP 5 accuracy: 0.95		Dummy: 0.08	:-)
F1 score: 0.75			Dummy: 0.00	:-)
Cohen's Kappa: 0.74		Dummy: 0.00	:-)
ROC AUC: 0.98			Dummy: 0.50	:-)
----------
ROC AUC control for RandomForestClassifier(n_jobs=-1, random_state=42, verbose=1): 0.98
ROC AUC control for Dummy: 0.51
Process time: 0:35:00.587501
CPU times: user 1d 5h 14min 43s, sys: 35.1 s, total: 1d 5h 15min 18s
Wall time: 35min


In [72]:
# Persist the newspaper-attribution result object to disk.
with open("xtra_NA_combined_RF_classsification.dat", "wb") as results_handle:
    pickle.dump(combined_RF_output, results_handle)


# Headline Generation

This section evaluates the performance of headline generation based on manual encodings. The metric used for evaluation is ROUGE (Recall-Oriented Understudy for Gisting Evaluation).

In [4]:
# from nltk.translate.bleu_score import SmoothingFunction
from rouge import Rouge

In [5]:
# Each non-empty line is expected to look like
# "Target: <reference headline>\tPredicted: <generated headline>".
with open('Data/headline_predictions.txt' ,'r', encoding='utf-8') as f:
    lines = f.readlines()

targets = []
predictions = []

for line_number, line in enumerate(lines, start=1):
    line = line.strip('\n')

    # Blank lines (e.g. a trailing newline at the end of the file) carry no pair.
    if not line.strip():
        continue

    if '\t' not in line:
        raise ValueError(
            f"Line {line_number} of Data/headline_predictions.txt has no tab separator: {line!r}"
        )

    # Split on the first tab only, so a tab inside the predicted headline
    # cannot break the two-way unpacking.
    target, prediction = line.split('\t', 1)

    # Remove the label once; later occurrences belong to the headline text.
    target = target.replace('Target: ', '', 1)
    targets.append(target)

    prediction = prediction.replace('Predicted: ', '', 1)
    predictions.append(prediction)

In [6]:
# Scorer instance used for the ROUGE evaluation.
rouge = Rouge()

In [9]:
# Score the generated headlines against the targets; with avg=True the result
# is a single aggregated dict (displayed in a later cell).
scores = rouge.get_scores(predictions, targets, avg=True)

In [10]:
# Display the aggregated ROUGE scores.
scores

{'rouge-1': {'r': 0.14035417931455124,
  'p': 0.14328138452105196,
  'f': 0.1378241596288976},
 'rouge-2': {'r': 0.016728557553557545,
  'p': 0.016601934176934167,
  'f': 0.016217014411291013},
 'rouge-l': {'r': 0.1336364476610998,
  'p': 0.1361387374020891,
  'f': 0.13109143901232878}}

In [28]:
# Hand-picked, whitespace-tokenised headline pairs for spot-checking
# sentence-level scores.
target1 = "ung mand lagt i kunstigt koma efter trafikulykke".split()
prediction1 = "ung mand i retten for at voldtage sin egen far".split()
target2 = ["michelle", "bellaichs", "kæreste", "taler", "ud", "om", "kæmpe", "sorg"]
prediction2 = ["dansk", "mand", "er", "død"]
# A hypothesis may be scored against several references.

# Sentence-level BLEU example (disabled):
# BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, smoothing_function=SmoothingFunction().method2)
# print(BLEUscore)

In [7]:
%%time
corpus_BLEU = nltk.translate.bleu_score.corpus_bleu(targets, predictions, smoothing_function=SmoothingFunction().method2)


CPU times: user 5.4 s, sys: 0 ns, total: 5.4 s
Wall time: 14.6 s


In [69]:
# for i, target in enumerate(targets):
#     print("Target: ",target, "\nPrediction: ",predictions[i])
#     print("\n")