# Notebook para o PAN - Atribuição Autoral - 2018

In [1]:
%matplotlib inline
#python basic libs
from __future__ import print_function

from tempfile import mkdtemp
from shutil import rmtree
import os;
from os.path import join as pathjoin;

import re;
import glob;
import json;
import codecs;
from collections import defaultdict;
import pprint;


from pprint import pprint
from time import time
import logging


#data analysis libs
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
import random;

#machine learning libs
#feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

#preprocessing and transformation
from sklearn.preprocessing import normalize, Normalizer, MaxAbsScaler, MinMaxScaler, LabelBinarizer;
from sklearn.decomposition import PCA;
from sklearn.metrics.pairwise import cosine_similarity;


from sklearn.base import BaseEstimator, ClassifierMixin

#classifiers
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import RFE,SelectFpr,SelectPercentile, chi2;

from sklearn.metrics import pairwise_distances;

from sklearn.pipeline import Pipeline

#model valuation
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score;

In [2]:
import platform; print(platform.platform())
print("NumPy", np.__version__)
import scipy; print("SciPy", scipy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)

Darwin-17.5.0-x86_64-i386-64bit
NumPy 1.14.2
SciPy 1.0.1
Scikit-Learn 0.19.1


### paths configuration

In [3]:
baseDir = '/Users/joseeleandrocustodio/Dropbox/mestrado/02 - Pesquisa/code';

inputDir= pathjoin(baseDir,'pan18aa');
outputDir= pathjoin(baseDir,'out',"oficial");
if not os.path.exists(outputDir):
    os.mkdir(outputDir);

## loading the dataset

In [4]:
def readCollectionsOfProblems(path):
    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    with open(infocollection, 'r') as f:
        problems  = [
            {
                'problem': attrib['problem-name'],
                'language': attrib['language'],
                'encoding': attrib['encoding'],
            }
            for attrib in json.load(f)
            
        ]
    return problems;

In [5]:
problems = readCollectionsOfProblems(inputDir);

In [6]:
problems[0]

{'encoding': u'UTF-8', 'language': u'en', 'problem': u'problem00001'}

In [7]:
def readProblem(path, problem):
    # Reading information about the problem
    infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
    candidates = []
    with open(infoproblem, 'r') as f:
        fj = json.load(f)
        unk_folder = fj['unknown-folder']
        for attrib in fj['candidate-authors']:
            candidates.append(attrib['author-name'])
    return unk_folder, candidates;

In [8]:
def read_files(path,label):
    # Reads all text files located in the 'path' and assigns them to 'label' class
    files = glob.glob(pathjoin(path,label,'*.txt'))
    texts=[]
    for i,v in enumerate(files):
        f=codecs.open(v,'r',encoding='utf-8')
        texts.append((f.read(),label, os.path.basename(v)))
        f.close()
    return texts

In [9]:
for index,problem in enumerate(problems):
    unk_folder, candidates_folder = readProblem(inputDir, problem['problem']); 
    problem['candidates_folder_count'] = len(candidates_folder);
    problem['candidates'] = [];
    for candidate in candidates_folder:
        problem['candidates'].extend(read_files(pathjoin(inputDir, problem['problem']),candidate));
    
    problem['unknown'] = read_files(pathjoin(inputDir, problem['problem']),unk_folder);    

In [10]:
pd.DataFrame(problems)

Unnamed: 0,candidates,candidates_folder_count,encoding,language,problem,unknown
0,"[(graceful ones.\n\n""One more,"" Marvelous said...",20,UTF-8,en,problem00001,"[(after all, his best friends. And what in the..."
1,"[(a mission.""\n\nJensen just raises an eyebrow...",5,UTF-8,en,problem00002,"[(“Potter was attractive,” Draco thought, sigh..."
2,[(qui l'avait tué mais tout était de la faute ...,20,UTF-8,fr,problem00003,[(son réveil. Sa main pulse et Draco frotte l'...
3,[(. Le canapé est vide et lorsqu'il passe deva...,5,UTF-8,fr,problem00004,"[(abasourdie.\n\nTout d'abord, elle crut que s..."
4,"[(Eppure lui la mappa l’aveva stampata, dannaz...",20,UTF-8,it,problem00005,[(– Oh. Cazzo.\nSirius era così sconvolto che ...
5,[(Yato ha trovato una lettera sul suo comodino...,5,UTF-8,it,problem00006,"[(così la tua vista, Moony?\n– Cercavo di esse..."
6,[(zmienił zdanie. Niech się stworzonko pobawi....,20,UTF-8,pl,problem00007,"[(dawniej pełna radości i ciepła, a teraz wiec..."
7,"[(Słowem, które Sherlock najczęściej słyszał w...",5,UTF-8,pl,problem00008,"[(, uderzającego o żebra niczym dzwon- niemal ..."
8,[(pero no lo ama como ama a Guignol –explicó e...,20,UTF-8,sp,problem00009,[(–La nariz puntiaguda del elfo casi rozaba el...
9,"[(incapaz de señalar un momento exacto, un pun...",5,UTF-8,sp,problem00010,[(tan parecidas hizo que su trasero latiese de...


In [11]:
#*******************************************************************************************************
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


def eval_measures(gt, pred):
    """Compute macro-averaged F1-scores, macro-averaged precision, 
    macro-averaged recall, and micro-averaged accuracy according the ad hoc
    rules discussed at the top of this file.
    Parameters
    ----------
    gt : dict
        Ground truth, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    pred : dict
        Predicted attribution, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    Returns
    -------
    f1 : float
        Macro-averaged F1-score
    precision : float
        Macro-averaged precision
    recall : float
        Macro-averaged recall
    accuracy : float
        Micro-averaged F1-score
    """

    actual_authors = list(gt.values())
    encoder = LabelEncoder().fit(['<UNK>'] + actual_authors)

    text_ids, gold_authors, silver_authors = [], [], []
    for text_id in sorted(gt):
        text_ids.append(text_id)
        gold_authors.append(gt[text_id])
        try:
            silver_authors.append(pred[text_id])
        except KeyError:
            # missing attributions get <UNK>:
            silver_authors.append('<UNK>')

    assert len(text_ids) == len(gold_authors)
    assert len(text_ids) == len(silver_authors)

    # replace non-existent silver authors with '<UNK>':
    silver_authors = [a if a in encoder.classes_ else '<UNK>' 
                      for a in silver_authors]

    gold_author_ints   = encoder.transform(gold_authors)
    silver_author_ints = encoder.transform(silver_authors)

    # get F1 for individual classes (and suppress warnings):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1 = f1_score(gold_author_ints,
                  silver_author_ints,
                  labels=list(set(gold_author_ints)),
                  average='macro')
        precision = precision_score(gold_author_ints,
                  silver_author_ints,
                  labels=list(set(gold_author_ints)),
                  average='macro')
        recall = recall_score(gold_author_ints,
                  silver_author_ints,
                  labels=list(set(gold_author_ints)),
                  average='macro')
        accuracy = accuracy_score(gold_author_ints,
                  silver_author_ints)

    return f1,precision,recall,accuracy


In [12]:
def evaluate(ground_truth_file,predictions_file):
    # Calculates evaluation measures for a single attribution problem
    gt = {}
    with open(ground_truth_file, 'r') as f:
        for attrib in json.load(f)['ground_truth']:
            gt[attrib['unknown-text']] = attrib['true-author']

    pred = {}
    with open(predictions_file, 'r') as f:
        for attrib in json.load(f):
            if attrib['unknown-text'] not in pred:
                pred[attrib['unknown-text']] = attrib['predicted-author']
    f1,precision,recall,accuracy =  eval_measures(gt,pred)
    return f1, precision, recall, accuracy

In [13]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse


class DenseTransformer(BaseEstimator):
    def __init__(self, return_copy=True):
        self.return_copy = return_copy
        self.is_fitted = False

    def transform(self, X, y=None):
        if issparse(X):
            return X.toarray()
        elif self.return_copy:
            return X.copy()
        else:
            return X

    def fit(self, X, y=None):
        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

In [14]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse


class ObfuscationTransformer(BaseEstimator):
    def __init__(self,re_from=r'(\b)(\w{0,2})\w+(\w{1,3})(\b)', re_to=r'\1\2XX\3\4', return_copy=True):
        self.re_from = re_from
        self.re_to = re_to

    def transform(self, X, y=None):
        X = np.array(X).copy();
        for i in range(len(X)):
            X[i] = re.sub(self.re_from,self.re_to, X[i])
        
        return X;

    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

In [86]:
class DTransformer(BaseEstimator):
    """Convert a sparse array into a dense array."""

    def __init__(self,
                analyzer='char',
                min_df=0.05,
                max_df=1.0,
                ngram_range=(2,5),
                lowercase=False,
                norm='l1',
                sublinear_tf=True,
                distances=['cosine']):

            self.analyzer=analyzer;
            self.min_df=min_df;
            self.max_df=max_df;
            self.ngram_range=ngram_range;
            self.lowercase=lowercase;
            self.norm=norm;
            self.sublinear_tf=sublinear_tf
            self.distances = distances;
            

    def fit(self, X, y):
        self.vectorizer_ = TfidfVectorizer(
                analyzer=self.analyzer,
                min_df=self.min_df,
                max_df=self.max_df,
                ngram_range=self.ngram_range,
                lowercase=self.lowercase,
                norm=self.norm,
                sublinear_tf=self.sublinear_tf);
        
        #building the internal vocabulary
        self.vectorizer_.fit(X);
        
        #creating author profile
        profile = defaultdict(unicode);
        for text, label in zip(X,y):
            profile[label]+=text;
        
        #make sure the labels are going to be sorted
        self.profileLabels_ = set(profile.keys());
        x = [ profile[label] for label in self.profileLabels_]
            
        self.profileVector_ = self.vectorizer_.transform(x);
        
        return self;

    def transform(self, X, y=None):
        X = self.vectorizer_.transform(X);
        XD = [
            pairwise_distances(X.todense(), self.profileVector_.todense(), metric = d)
             for d in self.distances];
        XD = np.hstack(XD);
        return XD;

    def fit_transform(self, X, y):
        self.fit(X,y);
        return self.transform(X=X, y=y)

In [96]:
def runML(problem):
    print ("Problem: %s,  language: %s " %(problem['problem'],problem['language']))
    
    train_docs, train_labels, _   = zip(*problem['candidates'])
    problem['training_docs_size'] = len(train_docs);
    test_docs, _, test_filename   = zip(*problem['unknown'])
    
    t0 = time()
    
    pipeline = Pipeline([
        ('vect', DTransformer(distances=distancesList, sublinear_tf=True)),
        ('clf',  LogisticRegression(random_state=0))
    ])
    
    pipeline.fit(train_docs,train_labels);
    test_pred =pipeline.predict(test_docs);
    
    print("done in %0.3fs" % (time() - t0))
        
    
    # Writing output file
    out_data=[]
    for i,v in enumerate(test_pred):
        out_data.append({'unknown-text': test_filename[i],'predicted-author': v})
    answerFile = pathjoin(outputDir,'answers-'+problem['problem']+'.json');
    with open(answerFile, 'w') as f:
        json.dump(out_data, f, indent=4)
        #allProblems.extend(out_data)
    
    
    #evaluation train
    f1,precision,recall,accuracy=evaluate(
                pathjoin(inputDir, problem['problem'], 'ground-truth.json'),
                answerFile)
    return {
                'problem-name'   : problem['problem'],
                "train_doc_size":len(train_docs),
                "language":problem['language'],
                'macro-f1'       : round(f1,3),
                'macro-precision': round(precision,3),
                'macro-recall'   : round(recall,3),
                'micro-accuracy' : round(accuracy,3),
                'AuthorCount':len(set(train_labels))
        };

In [97]:
distancesList =['yule']
result = [];
for problem in problems:
    result.append(runML(problem));

Problem: problem00001,  language: en 
done in 9.406s
Problem: problem00002,  language: en 
done in 2.195s
Problem: problem00003,  language: fr 
done in 7.894s
Problem: problem00004,  language: fr 
done in 2.285s
Problem: problem00005,  language: it 
done in 8.890s
Problem: problem00006,  language: it 
done in 2.854s
Problem: problem00007,  language: pl 
done in 10.823s
Problem: problem00008,  language: pl 
done in 2.606s
Problem: problem00009,  language: sp 
done in 9.635s
Problem: problem00010,  language: sp 
done in 3.293s


In [93]:
pd.DataFrame(result)

Unnamed: 0,AuthorCount,language,macro-f1,macro-precision,macro-recall,micro-accuracy,problem-name,train_doc_size
0,20,en,0.353,0.357,0.513,0.429,problem00001,140
1,5,en,0.529,0.567,0.617,0.429,problem00002,35
2,20,fr,0.428,0.515,0.485,0.449,problem00003,140
3,5,fr,0.685,0.76,0.76,0.571,problem00004,35
4,20,it,0.53,0.518,0.634,0.6,problem00005,140
5,5,it,0.602,0.616,0.702,0.826,problem00006,35
6,20,pl,0.527,0.538,0.607,0.563,problem00007,140
7,5,pl,0.867,0.9,0.9,0.933,problem00008,35
8,20,sp,0.638,0.622,0.765,0.667,problem00009,140
9,5,sp,0.735,0.735,0.746,0.813,problem00010,35


In [98]:
pd.DataFrame(result)

Unnamed: 0,AuthorCount,language,macro-f1,macro-precision,macro-recall,micro-accuracy,problem-name,train_doc_size
0,20,en,0.353,0.357,0.513,0.429,problem00001,140
1,5,en,0.529,0.567,0.617,0.429,problem00002,35
2,20,fr,0.428,0.515,0.485,0.449,problem00003,140
3,5,fr,0.685,0.76,0.76,0.571,problem00004,35
4,20,it,0.53,0.518,0.634,0.6,problem00005,140
5,5,it,0.602,0.616,0.702,0.826,problem00006,35
6,20,pl,0.527,0.538,0.607,0.563,problem00007,140
7,5,pl,0.867,0.9,0.9,0.933,problem00008,35
8,20,sp,0.638,0.622,0.765,0.667,problem00009,140
9,5,sp,0.735,0.735,0.746,0.813,problem00010,35


In [99]:
result2 = [];
for d in ['euclidean', 'l2','manhattan',  'braycurtis',
                'canberra', 'chebyshev', 
                'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
                 'matching', 'minkowski',
                'rogerstanimoto', 'russellrao', 
                'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']:
    distancesList = [d];
    print (d)
    for problem in problems:
        r = runML(problem);
        r['distance'] = d;
        result2.append(r);

euclidean
Problem: problem00001,  language: en 
done in 8.496s
Problem: problem00002,  language: en 
done in 2.179s
Problem: problem00003,  language: fr 
done in 7.749s
Problem: problem00004,  language: fr 
done in 2.362s
Problem: problem00005,  language: it 
done in 9.183s
Problem: problem00006,  language: it 
done in 2.957s
Problem: problem00007,  language: pl 
done in 10.848s
Problem: problem00008,  language: pl 
done in 2.699s
Problem: problem00009,  language: sp 
done in 9.984s
Problem: problem00010,  language: sp 
done in 3.279s
l2
Problem: problem00001,  language: en 
done in 8.593s
Problem: problem00002,  language: en 
done in 2.184s
Problem: problem00003,  language: fr 
done in 8.067s
Problem: problem00004,  language: fr 
done in 2.292s
Problem: problem00005,  language: it 
done in 9.090s
Problem: problem00006,  language: it 
done in 2.916s
Problem: problem00007,  language: pl 


KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(result2)

In [89]:
import seaborn as sns
df = df.pivot(index='problem-name', columns='distance',values='macro-f1')\
.reset_index().drop(columns=['manhattan','correlation','problem-name','kulsinski','hamming','l2','sqeuclidean'])

cm = sns.light_palette("green", as_cmap=True)
df=df.style.background_gradient(cmap=cm,axis=1)
df

AttributeError: 'Styler' object has no attribute 'pivot'

In [24]:
pd.DataFrame(result)[['problem-name','language','macro-f1']]

Unnamed: 0,problem-name,language,macro-f1
0,problem00001,en,0.323
1,problem00002,en,0.444
2,problem00003,fr,0.563
3,problem00004,fr,0.661
4,problem00005,it,0.525
5,problem00006,it,0.612
6,problem00007,pl,0.475
7,problem00008,pl,0.867
8,problem00009,sp,0.537
9,problem00010,sp,0.724


In [None]:
distancesList =['yule']
result = [];
for problem in problems:
    result.append(runML(problem));

In [None]:
distancesList =['cosine',  'l1', 'l2',  'braycurtis', 'canberra',  'hamming', 'jaccard','rogerstanimoto','yule']

In [None]:
def dist(x,y):
    x = x/x.sum();
    y = y/y.sum();
    n = np.sum((x-y)**2);
    d1 = np.sum((1-np.abs(x))**2);
    d2 = np.sum((1-np.abs(y))**2)
    d = np.arccosh(1 + 2*n/(d1*d2));
    return d;
tt= pairwise_distances(x2.todense(), pv.todense(), metric=dist)
tt =normalize(tt)
plt.matshow(tt, cmap='Blues');
plt.yticks(range(len(test_labels)), test_labels)
plt.xticks(range(len(plabels)), plabels, rotation=90)
plt.show()

In [None]:
plt.hist(tt.flatten());

In [None]:
import seaborn as sns

df2= df[['macro-f1','caso','problem-name']].pivot(index='problem-name', columns='caso', values='macro-f1').reset_index()
cm = sns.light_palette("green", as_cmap=True)
df2=df2.style.background_gradient(cmap=cm,axis=1)
df2