# Notebook para o PAN - Atribuição Autoral - 2018

In [1]:
%matplotlib inline
#python basic libs
from __future__ import print_function

from tempfile import mkdtemp
from shutil import rmtree
import os;
from os.path import join as pathjoin;

import re;
import glob;
import json;
import codecs;
from collections import defaultdict;
import pprint;


from pprint import pprint
from time import time
import logging


#data analysis libs
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
import random;

#machine learning libs
#feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

#preprocessing and transformation
from sklearn.preprocessing import normalize, Normalizer, MaxAbsScaler, MinMaxScaler, LabelBinarizer;
from sklearn.decomposition import PCA;
from sklearn.metrics.pairwise import cosine_similarity;


from sklearn.base import BaseEstimator, ClassifierMixin

#classifiers
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import RFE,SelectFpr,SelectPercentile, chi2;

#
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier


from sklearn.pipeline import Pipeline

#model valuation
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score;

In [2]:
# Record the platform and the versions of the core numeric libraries used for this run.
import platform
import scipy
import sklearn

print(platform.platform())
for lib_name, lib in (("NumPy", np), ("SciPy", scipy), ("Scikit-Learn", sklearn)):
    print(lib_name, lib.__version__)

Darwin-17.5.0-x86_64-i386-64bit
NumPy 1.14.2
SciPy 1.0.1
Scikit-Learn 0.19.1


### paths configuration

In [3]:
# Root folder that contains the PAN18 corpus ('pan18aa') and receives the answers.
# The default is the location originally used by the author; set the
# PAN18_BASE_DIR environment variable to run the notebook on another machine
# without editing this cell.
baseDir = os.environ.get(
    'PAN18_BASE_DIR',
    '/Users/joseeleandrocustodio/Dropbox/mestrado/02 - Pesquisa/code')

inputDir = pathjoin(baseDir, 'pan18aa')
outputDir = pathjoin(baseDir, 'out', "oficial")
if not os.path.exists(outputDir):
    # os.mkdir creates only the last path component and raises when the
    # intermediate 'out' folder is missing; makedirs creates the whole chain.
    os.makedirs(outputDir)

## loading the dataset

In [4]:
def readCollectionsOfProblems(path):
    """Read `collection-info.json` from `path` and summarise every problem.

    Parameters
    ----------
    path : str
        Folder that contains `collection-info.json`.

    Returns
    -------
    list of dict
        One dict per entry of the JSON file, in file order, with the keys
        'problem' (taken from 'problem-name'), 'language' and 'encoding'.
    """
    index_file = os.sep.join([path, 'collection-info.json'])
    summaries = []
    with open(index_file, 'r') as handle:
        for entry in json.load(handle):
            summaries.append({
                'problem': entry['problem-name'],
                'language': entry['language'],
                'encoding': entry['encoding'],
            })
    return summaries

In [5]:
# Load the list of attribution problems described by the collection index in inputDir.
problems = readCollectionsOfProblems(inputDir);

In [6]:
# Show the first problem record as a quick sanity check of the loaded metadata.
problems[0]

{'encoding': u'UTF-8', 'language': u'en', 'problem': u'problem00001'}

In [7]:
def readProblem(path, problem):
    """Read `problem-info.json` of a single problem.

    Parameters
    ----------
    path : str
        Folder of the whole collection.
    problem : str
        Name of the problem sub-folder inside `path`.

    Returns
    -------
    (unk_folder, candidates) : tuple
        The value of 'unknown-folder' and the list of 'author-name' values
        of the 'candidate-authors' entries, in file order.
    """
    info_file = os.sep.join([path, problem, 'problem-info.json'])
    with open(info_file, 'r') as handle:
        info = json.load(handle)
    unk_folder = info['unknown-folder']
    candidates = [author['author-name'] for author in info['candidate-authors']]
    return unk_folder, candidates

In [8]:
def read_files(path, label, encoding='utf-8'):
    """Read every `*.txt` file of the folder `path/label`.

    Parameters
    ----------
    path : str
        Folder of the problem.
    label : str
        Sub-folder to read; it is also stored as the class label of each text.
    encoding : str, optional
        Text encoding of the files (default 'utf-8').

    Returns
    -------
    list of tuple
        `(text, label, file_name)` triples, ordered by file path so the
        result does not depend on the directory listing order of the OS.
    """
    texts = []
    # glob returns files in arbitrary order; sort for reproducible runs.
    for filename in sorted(glob.glob(pathjoin(path, label, '*.txt'))):
        # The context manager closes the file even if decoding fails.
        with codecs.open(filename, 'r', encoding=encoding) as f:
            texts.append((f.read(), label, os.path.basename(filename)))
    return texts

In [9]:
# Attach the candidate (training) texts and the unknown (test) texts to each problem record.
for problem in problems:
    problem_dir = pathjoin(inputDir, problem['problem'])
    unk_folder, candidates_folder = readProblem(inputDir, problem['problem'])

    problem['candidates_folder_count'] = len(candidates_folder)
    # One flat list with the texts of all candidate authors, author by author.
    problem['candidates'] = [
        doc
        for candidate in candidates_folder
        for doc in read_files(problem_dir, candidate)
    ]
    problem['unknown'] = read_files(problem_dir, unk_folder)

In [10]:
# Overview table with one row per problem (metadata plus the loaded text triples).
pd.DataFrame(problems)

Unnamed: 0,candidates,candidates_folder_count,encoding,language,problem,unknown
0,"[(graceful ones.\n\n""One more,"" Marvelous said...",20,UTF-8,en,problem00001,"[(after all, his best friends. And what in the..."
1,"[(a mission.""\n\nJensen just raises an eyebrow...",5,UTF-8,en,problem00002,"[(“Potter was attractive,” Draco thought, sigh..."
2,[(qui l'avait tué mais tout était de la faute ...,20,UTF-8,fr,problem00003,[(son réveil. Sa main pulse et Draco frotte l'...
3,[(. Le canapé est vide et lorsqu'il passe deva...,5,UTF-8,fr,problem00004,"[(abasourdie.\n\nTout d'abord, elle crut que s..."
4,"[(Eppure lui la mappa l’aveva stampata, dannaz...",20,UTF-8,it,problem00005,[(– Oh. Cazzo.\nSirius era così sconvolto che ...
5,[(Yato ha trovato una lettera sul suo comodino...,5,UTF-8,it,problem00006,"[(così la tua vista, Moony?\n– Cercavo di esse..."
6,[(zmienił zdanie. Niech się stworzonko pobawi....,20,UTF-8,pl,problem00007,"[(dawniej pełna radości i ciepła, a teraz wiec..."
7,"[(Słowem, które Sherlock najczęściej słyszał w...",5,UTF-8,pl,problem00008,"[(, uderzającego o żebra niczym dzwon- niemal ..."
8,[(pero no lo ama como ama a Guignol –explicó e...,20,UTF-8,sp,problem00009,[(–La nariz puntiaguda del elfo casi rozaba el...
9,"[(incapaz de señalar un momento exacto, un pun...",5,UTF-8,sp,problem00010,[(tan parecidas hizo que su trasero latiese de...


In [11]:
#*******************************************************************************************************
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


def eval_measures(gt, pred):
    """Score predicted attributions against the ground truth.

    Texts that are absent from `pred`, and predictions naming an author that
    does not occur in `gt`, are both counted as the placeholder '<UNK>'.
    The macro-averaged measures are computed only over the author labels
    that occur in the ground truth.

    Parameters
    ----------
    gt : dict
        Ground truth, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    pred : dict
        Predicted attribution, with the same key/value layout as `gt`

    Returns
    -------
    f1 : float
        Macro-averaged F1-score
    precision : float
        Macro-averaged precision
    recall : float
        Macro-averaged recall
    accuracy : float
        Fraction of texts whose predicted author equals the true author
    """
    # '<UNK>' is part of the label set so that unusable predictions can be encoded.
    encoder = LabelEncoder().fit(['<UNK>'] + list(gt.values()))
    known_labels = encoder.classes_

    text_ids = sorted(gt)
    gold_authors = [gt[text_id] for text_id in text_ids]
    silver_authors = []
    for text_id in text_ids:
        guess = pred.get(text_id, '<UNK>')
        silver_authors.append(guess if guess in known_labels else '<UNK>')

    gold_author_ints = encoder.transform(gold_authors)
    silver_author_ints = encoder.transform(silver_authors)
    scored_labels = list(set(gold_author_ints))

    # Some classes may never be predicted; silence the resulting warnings.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1, precision, recall = [
            scorer(gold_author_ints,
                   silver_author_ints,
                   labels=scored_labels,
                   average='macro')
            for scorer in (f1_score, precision_score, recall_score)
        ]
        accuracy = accuracy_score(gold_author_ints, silver_author_ints)

    return f1, precision, recall, accuracy


In [12]:
def evaluate(ground_truth_file,predictions_file):
    """Compute the evaluation measures of a single attribution problem.

    Parameters
    ----------
    ground_truth_file : str
        JSON file whose 'ground_truth' list holds 'unknown-text' /
        'true-author' records.
    predictions_file : str
        JSON file holding a list of 'unknown-text' / 'predicted-author'
        records; when a text appears more than once only its first
        record is used.

    Returns
    -------
    tuple
        `(f1, precision, recall, accuracy)` as returned by `eval_measures`.
    """
    with open(ground_truth_file, 'r') as f:
        gt = {entry['unknown-text']: entry['true-author']
              for entry in json.load(f)['ground_truth']}

    pred = {}
    with open(predictions_file, 'r') as f:
        for entry in json.load(f):
            text_id = entry['unknown-text']
            if text_id in pred:
                continue  # keep the first answer given for this text
            pred[text_id] = entry['predicted-author']

    return eval_measures(gt, pred)

In [13]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse


class DenseTransformer(BaseEstimator):
    """Pipeline step that converts a sparse matrix into a dense array.

    Parameters
    ----------
    return_copy : bool, default True
        When the input is already dense, return `X.copy()` instead of `X`
        itself.

    Attributes
    ----------
    is_fitted : bool
        False after construction, True once `fit` or `fit_transform` ran.
    """

    def __init__(self, return_copy=True):
        self.return_copy = return_copy
        self.is_fitted = False

    def fit(self, X, y=None):
        """Nothing is learned; only marks the transformer as fitted."""
        self.is_fitted = True
        return self

    def transform(self, X, y=None):
        """Return `X` as a dense array (`toarray()` for sparse input)."""
        if issparse(X):
            return X.toarray()
        if self.return_copy:
            return X.copy()
        return X

    def fit_transform(self, X, y=None):
        """Fit, then transform `X`.

        Goes through `fit` so that `is_fitted` is also updated when the
        transformer is fitted via this method rather than `fit`.
        """
        return self.fit(X, y).transform(X, y)

In [14]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse


class ObfuscationTransformer(BaseEstimator):
    """Pipeline step that rewrites every document with a regex substitution.

    Parameters
    ----------
    re_from : str
        Regular expression to search for. The default matches whole words
        and, together with the default `re_to`, keeps a few leading and
        trailing characters of each word while replacing the middle by 'XX'.
    re_to : str
        Replacement template passed to `re.sub`.
    return_copy : bool, default True
        Accepted for interface compatibility; `transform` always returns a
        new array and never modifies its input.
    """

    def __init__(self,re_from=r'(\b)(\w{0,2})\w+(\w{1,3})(\b)', re_to=r'\1\2XX\3\4', return_copy=True):
        self.re_from = re_from
        self.re_to = re_to
        # Every constructor argument must be stored under its own name,
        # otherwise get_params()/clone()/repr of the estimator can fail.
        self.return_copy = return_copy

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an array holding the substituted version of each document.

        The array is built from the finished strings. Assigning results into
        a copy of the input array would silently cut any result that is
        longer than the longest input document, because NumPy string arrays
        have a fixed item width.
        """
        pattern = re.compile(self.re_from)
        return np.array([pattern.sub(self.re_to, doc) for doc in X])

    def fit_transform(self, X, y=None):
        """Equivalent to `transform`, since there is nothing to fit."""
        return self.transform(X=X, y=y)

In [19]:
def _build_stage_pipeline(vectorizer, classifier, obfuscator=None):
    """Assemble one first-level pipeline.

    Steps: optional text obfuscation ('obs') -> TF-IDF ('vect') -> dense
    conversion ('dense') -> max-abs scaling ('scaler') -> PCA keeping 99% of
    the variance ('transf') -> classifier ('clf').
    """
    steps = []
    if obfuscator is not None:
        steps.append(('obs', obfuscator))
    steps.extend([
        ('vect', vectorizer),
        ('dense', DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.99)),
        ('clf', classifier),
    ])
    return Pipeline(steps)


def runML(problem):
    """Train the stacked ensemble on one problem, write its answers and score them.

    Three first-level pipelines are fitted on the candidate texts (character
    n-grams, character n-grams over obfuscated text, word n-grams). Their
    class probabilities are concatenated and fed to a second-level
    PCA + logistic regression model, which predicts the unknown texts.

    Parameters
    ----------
    problem : dict
        Problem record with the keys 'problem', 'language', 'candidates' and
        'unknown'; the last two hold `(text, label, file_name)` triples.

    Returns
    -------
    dict
        Problem name, language, corpus statistics and the rounded macro-F1,
        macro-precision, macro-recall and accuracy on the unknown texts.

    Side effects
    ------------
    Prints progress, sets `problem['training_docs_size']` and writes
    `answers-<problem>.json` into `outputDir`.
    """
    print ("Problem: %s,  language: %s, " %(problem['problem'],problem['language']))

    train_docs, train_labels, _ = zip(*problem['candidates'])
    problem['training_docs_size'] = len(train_docs)
    test_docs, _, test_filename = zip(*problem['unknown'])

    # Settings shared by the two character-level vectorizers and by all
    # logistic regression models.
    char_tfidf = dict(analyzer='char',
                      ngram_range=(3, 5),
                      min_df=0.01,
                      max_df=1.0,
                      norm='l1',
                      lowercase=False,
                      sublinear_tf=True)
    logreg = dict(random_state=0, multi_class='multinomial', solver='newton-cg')

    stages = [
        # 1) character n-grams on the raw text
        _build_stage_pipeline(TfidfVectorizer(**char_tfidf),
                              LogisticRegression(**logreg)),
        # 2) character n-grams after replacing every word character by 'x'
        _build_stage_pipeline(TfidfVectorizer(**char_tfidf),
                              LogisticRegressionCV(**logreg),
                              obfuscator=ObfuscationTransformer(re_from=r'\w', re_to='x')),
        # 3) lower-cased word n-grams
        _build_stage_pipeline(TfidfVectorizer(analyzer='word',
                                              ngram_range=(1, 3),
                                              norm='l1',
                                              min_df=2,
                                              max_df=1.0,
                                              lowercase=True,
                                              sublinear_tf=True),
                              LogisticRegression(**logreg)),
    ]

    t0 = time()

    for stage in stages:
        stage.fit(train_docs, train_labels)

    # Second-level features: the class probabilities of the three stages side by side.
    # NOTE(review): the training features come from the same documents the
    # stages were fitted on, so they are in-sample probabilities.
    xtrain_mix = np.hstack([stage.predict_proba(train_docs) for stage in stages])
    xtest_mix = np.hstack([stage.predict_proba(test_docs) for stage in stages])

    clfFinal = Pipeline([
        ('pca', PCA(0.9999)),
        ('clf', LogisticRegression(**logreg)),
    ])
    clfFinal.fit(xtrain_mix, train_labels)
    test_pred = clfFinal.predict(xtest_mix)

    print("done in %0.3fs \n" % (time() - t0))

    # Writing output file
    out_data = [
        {'unknown-text': name, 'predicted-author': author}
        for name, author in zip(test_filename, test_pred)
    ]
    answerFile = pathjoin(outputDir, 'answers-' + problem['problem'] + '.json')
    with open(answerFile, 'w') as f:
        json.dump(out_data, f, indent=4)

    # Evaluation of the answers for the unknown texts against the ground truth.
    f1, precision, recall, accuracy = evaluate(
        pathjoin(inputDir, problem['problem'], 'ground-truth.json'),
        answerFile)

    # The per-document averages use '/': floor division of ints under
    # Python 2, true division under Python 3.
    return {
        'problem-name': problem['problem'],
        "language": problem['language'],
        'AuthorCount': len(set(train_labels)),
        "train_doc_size": len(train_docs),
        "train_caract_per_doc": sum(len(doc) for doc in train_docs) / len(train_docs),
        "test_doc_size": len(test_docs),
        "test_caract_per_doc": sum(len(doc) for doc in test_docs) / len(test_docs),

        'macro-f1': round(f1, 3),
        'macro-precision': round(precision, 3),
        'macro-recall': round(recall, 3),
        'micro-accuracy': round(accuracy, 3),
    }

In [18]:
# Solve every problem in turn and tabulate the per-problem scores.
result = [runML(problem) for problem in problems]
pd.DataFrame(result)

Problem: problem00001,  language: en, 
done in 13.199s 

Problem: problem00002,  language: en, 
done in 3.076s 

Problem: problem00003,  language: fr, 
done in 11.767s 

Problem: problem00004,  language: fr, 
done in 3.144s 

Problem: problem00005,  language: it, 
done in 13.364s 

Problem: problem00006,  language: it, 
done in 3.913s 

Problem: problem00007,  language: pl, 
done in 16.309s 

Problem: problem00008,  language: pl, 
done in 3.583s 

Problem: problem00009,  language: sp, 
done in 14.450s 

Problem: problem00010,  language: sp, 
done in 4.675s 



Unnamed: 0,AuthorCount,language,macro-f1,macro-precision,macro-recall,micro-accuracy,problem-name,test_caract_per_doc,test_doc_size,train_caract_per_doc,train_doc_size
0,20,en,0.511,0.471,0.704,0.61,problem00001,4370,105,4327,140
1,5,en,0.575,0.617,0.65,0.524,problem00002,4296,21,4342,35
2,20,fr,0.65,0.66,0.709,0.653,problem00003,4508,49,4492,140
3,5,fr,0.867,0.864,0.92,0.81,problem00004,4532,21,4522,35
4,20,it,0.559,0.529,0.687,0.75,problem00005,4787,80,4720,140
5,5,it,0.621,0.608,0.704,0.826,problem00006,4765,46,4847,35
6,20,pl,0.52,0.548,0.59,0.534,problem00007,5200,103,5145,140
7,5,pl,0.822,0.867,0.878,0.867,problem00008,5214,15,5049,35
8,20,sp,0.8,0.807,0.874,0.812,problem00009,4788,117,4794,140
9,5,sp,0.797,0.8,0.809,0.859,problem00010,4827,64,4955,35


In [None]:
# Results table with the columns in a fixed order: identification first, then corpus sizes, then scores.
report_columns = [
    'problem-name', "language", 'AuthorCount',
    "train_doc_size", "train_caract_per_doc",
    "test_doc_size", "test_caract_per_doc",
    'macro-f1', 'macro-precision', 'macro-recall', 'micro-accuracy',
]
df = pd.DataFrame(result)[report_columns]

In [None]:
# Display the reordered results table.
df

In [None]:
# Summary statistics of the macro-F1 column across all problems.
pd.DataFrame(result)[['macro-f1']].describe()

In [None]:
# Select a single problem (list index 6) for the step-by-step exploration in the cells below.
problem = problems[6]

In [None]:
# Split the selected problem into parallel tuples of training and test data.
print("Problem: %s,  language: %s, " % (problem['problem'], problem['language']))

# Each stored item is a (text, label, file_name) triple; zip(*...) transposes the list.
train_docs, train_labels, _ = zip(*problem['candidates'])
problem['training_docs_size'] = len(train_docs)
test_docs, _, test_filename = zip(*problem['unknown'])

In [None]:
# Ground-truth lookup (logic taken from the baseline code): text file name -> true author.
truth_file = pathjoin(inputDir, problem['problem'], 'ground-truth.json')
with open(truth_file, 'r') as f:
    gt = {entry['unknown-text']: entry['true-author']
          for entry in json.load(f)['ground_truth']}

# True labels of the unknown texts, aligned with test_docs / test_filename.
test_docs, _, test_filename = zip(*problem['unknown'])
test_labels = [gt[name] for name in test_filename]

In [None]:
# First-level model 1: TF-IDF over character 2- to 5-grams of the raw text.
char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    min_df=0.05,
    max_df=1.0,
    lowercase=False,
    norm='l2',
    sublinear_tf=True,
)
pipeline1 = Pipeline(steps=[
    ('vect', char_tfidf),
    ('dense', DenseTransformer()),     # PCA needs a dense matrix
    ('scaler', MaxAbsScaler()),
    ('transf', PCA(0.99)),             # keep 99% of the variance
    ('clf', LogisticRegression(random_state=0,
                               multi_class='multinomial',
                               solver='newton-cg')),
])

In [None]:
# First-level model 2: the same character n-gram model, but fitted on text in
# which every character matched by the regex is replaced by 'x'.
masked_char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    min_df=0.05,
    max_df=1.0,
    lowercase=False,
    norm='l1',
    sublinear_tf=True,
)
pipeline2 = Pipeline(steps=[
    ('obs', ObfuscationTransformer(re_from=r'\w|\d', re_to='x')),
    ('vect', masked_char_tfidf),
    ('dense', DenseTransformer()),     # PCA needs a dense matrix
    ('scaler', MaxAbsScaler()),
    ('transf', PCA(0.99)),             # keep 99% of the variance
    ('clf', LogisticRegression(random_state=0,
                               multi_class='multinomial',
                               solver='newton-cg')),
])

In [None]:
# First-level model 3: TF-IDF over case-sensitive word 1- to 3-grams.
word_tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=0.05,
    max_df=1.0,
    lowercase=False,
    norm='l2',
    sublinear_tf=True,
)
pipeline3 = Pipeline(steps=[
    ('vect', word_tfidf),
    ('dense', DenseTransformer()),     # PCA needs a dense matrix
    ('scaler', MaxAbsScaler()),
    ('transf', PCA(0.99)),             # keep 99% of the variance
    ('clf', LogisticRegression(random_state=0,
                               multi_class='multinomial',
                               solver='newton-cg')),
])

In [None]:
# Fit the three first-level pipelines on the training texts and report the elapsed time.
t0 = time()
for stage in (pipeline1, pipeline2, pipeline3):
    stage.fit(train_docs, train_labels)
print("done in %0.3fs \n" % (time() - t0))

In [None]:
# Cumulative explained variance of the PCA step of each first-level pipeline.
for tag, stage in (("1", pipeline1), ("2", pipeline2), ('3', pipeline3)):
    cumulative_variance = stage.named_steps['transf'].explained_variance_ratio_.cumsum()
    plt.plot(cumulative_variance, label=tag)
plt.legend();

In [None]:
# Second-level features: class probabilities of the three pipelines placed side by side.
stages = [pipeline1, pipeline2, pipeline3]
xtrain_mix = np.hstack([stage.predict_proba(train_docs) for stage in stages])
xtest_mix = np.hstack([stage.predict_proba(test_docs) for stage in stages])

In [None]:
# Second-level model: an MLP with identity activation and one hidden layer
# as wide as the stacked feature matrix.
# Alternatives tried and left disabled: a plain multinomial LogisticRegression
# (C=0.1), a bare MLPClassifier without the Pipeline wrapper, and optional
# Normalizer(norm='l2') / PCA(0.99) preprocessing steps.
n_hidden = xtrain_mix.shape[1]  # passed as a bare int, not a tuple
clfFinal = Pipeline([
    ('clf', MLPClassifier(activation='identity',
                          hidden_layer_sizes=n_hidden,
                          random_state=0)),
])
clfFinal.fit(xtrain_mix, train_labels)

train_pred = clfFinal.predict(xtrain_mix)
test_pred = clfFinal.predict(xtest_mix)

In [None]:
# Training loss recorded by the final MLP classifier, one value per iteration.
plt.plot(clfFinal.named_steps['clf'].loss_curve_);

In [None]:
# Fraction of correct predictions: first on the training texts, then on the test texts.
for predicted, expected in ((train_pred, train_labels), (test_pred, test_labels)):
    hits = np.array(predicted) == np.array(expected)
    print(hits.sum() * 1.0 / len(expected))

In [None]:
# Score the ensemble ('total') and each first-level pipeline on its own
# (labelled by the analyzer of its vectorizer) with the official measures.
test_pred = clfFinal.predict(xtest_mix)
scored_runs = [('total', test_pred)] + [
    (clf.named_steps['vect'].analyzer, clf.predict(test_docs))
    for clf in [pipeline1, pipeline2, pipeline3]
]

results = []
for caso, predicted in scored_runs:
    f1, precision, recall, accuracy = eval_measures(gt, dict(zip(test_filename, predicted)))
    results.append({
        'caso': caso,
        'macro-f1': round(f1, 3),
        'macro-precision': round(precision, 3),
        'macro-recall': round(recall, 3),
        'micro-accuracy': round(accuracy, 3),
    })

pd.DataFrame(results)

In [None]:
# Side-by-side predictions of the three pipelines and the ensemble, plus the true author.
# Built from a column dict rather than from zip(...): under Python 3 zip is a
# one-shot iterator, which older pandas versions do not accept as DataFrame data.
# NOTE: this rebinds `df`, which earlier held the results table.
df = pd.DataFrame(
    {
        'p1': pipeline1.predict(test_docs),
        'p2': pipeline2.predict(test_docs),
        'p3': pipeline3.predict(test_docs),
        'ens': test_pred,
        'truth': test_labels,
    },
    columns=['p1', 'p2', 'p3', 'ens', 'truth'])

In [None]:
# Replace each prediction column by a 0/1 indicator: 1 where it matches the true author.
# NOTE: the predicted labels are overwritten in place, so rebuild `df` before
# running this cell a second time.
for column in ['p1', 'p2', 'p3', 'ens']:
    df[column] = np.where(df[column] == df['truth'], 1, 0)

In [None]:
# Sum the columns of `df` within each true author ('truth').
df.groupby(by='truth').agg(np.sum).reset_index()