In [1]:
import gensim
from  gensim.models import KeyedVectors
import gzip
import zipfile
import os
import re
import random
import glob
from time import time
import pandas as pd
from os.path import join as pathjoin

In [2]:
# Root folder that holds the PAN 2018 corpora and receives the output folder.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
baseDir = '/Users/joseeleandrocustodio/dataScienceNotebook/PANAA2018'

# Corpus folder names, resolved relative to baseDir.
corpusTraining    = 'pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02'
corpusEvaluation  = 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20'
corpusEach1 = 'AvaliacaoPT'

# Corpus selected for this run.
currentCorpus = corpusEach1

inputDir = pathjoin(baseDir, currentCorpus)
outputDir = pathjoin(baseDir, 'out')

# exist_ok=True removes the check-then-create race of `exists` + `mkdir`
# and makes re-running this cell a no-op; missing parents are created too.
os.makedirs(outputDir, exist_ok=True)

In [3]:
import pan

In [4]:
def problemDoc(inputDir,problem):
    """Split one problem into training and test material.

    Parameters
    ----------
    inputDir : str
        Folder that contains one sub-folder per problem.
    problem : dict
        Needs the keys 'problem' (sub-folder name), 'candidates' and
        'unknown'; the last two are unpacked as sequences of
        3-item entries (text, label, file name).

    Returns
    -------
    tuple
        (train_docs, train_labels, test_docs, test_labels, test_filename).
        The test labels are looked up in the problem's 'ground-truth.json'
        through pan.readGroundTruh, in the order of the unknown files.
    """
    # Transpose the (text, label, file name) triples into parallel tuples.
    candidate_columns = zip(*problem['candidates'])
    train_docs, train_labels, _ = candidate_columns

    unknown_columns = zip(*problem['unknown'])
    test_docs, _, test_filename = unknown_columns

    ground_truth_path = pathjoin(inputDir, problem['problem'], 'ground-truth.json')
    test_labels = pan.readGroundTruh(ground_truth_path, test_filename)

    return train_docs, train_labels, test_docs, test_labels, test_filename

In [5]:
# Compiled patterns whose matches are removed from a document before it is
# tokenized (see the `clean` helper that iterates over this list).
regex_cleaning = list(map(re.compile, (
    r'[\u4E00-\u9FA5]+',  # Chinese characters
    r'[\u3040-\u309F]+',  # hiragana
)))

In [6]:
def problemsToTokens(problems, inputDir):
    """Collect the vocabulary of every language found in ``problems``.

    Parameters
    ----------
    problems : iterable of dict
        Problem descriptions; each needs a 'language' key plus whatever
        ``problemDoc`` reads from it.
    inputDir : str
        Corpus folder, forwarded to ``problemDoc``.

    Returns
    -------
    dict
        Maps a language code to the sorted list of distinct tokens found in
        the training and test documents of all problems in that language.
        A token is a run of two or more word characters.
    """
    word_pattern = re.compile(r"(?u)\b\w\w+\b")

    def clean(text):
        # Strip every span matched by the module-level cleaning patterns.
        for cleaner in regex_cleaning:
            text = cleaner.sub('', text)
        return text

    # Accumulate into sets and sort once at the end; the previous version
    # rebuilt and re-sorted the whole language vocabulary after every problem.
    vocab_by_lang = {}
    for problem in problems:
        lang = problem['language']
        # The corpus code 'sp' is folded into 'es'.
        if lang == 'sp':
            lang = 'es'
        train_docs, _, test_docs, _, _ = problemDoc(inputDir, problem)

        words = vocab_by_lang.setdefault(lang, set())
        for docs in (train_docs, test_docs):
            for doc in docs:
                words.update(word_pattern.findall(clean(doc)))

    return {lang: sorted(words) for lang, words in vocab_by_lang.items()}

In [7]:
# One list of problems per corpus, in the order: training, evaluation, corpusEach1.
collectionsCorpus = [
    pan.readCollectionsOfProblems(pathjoin(baseDir, corpus_name))
    for corpus_name in (corpusTraining, corpusEvaluation, corpusEach1)
]


In [8]:
# Per-corpus vocabularies ({language: sorted tokens}), in the same order as
# collectionsCorpus.
vocabCorpuses = [
    problemsToTokens(corpus_problems, pathjoin(baseDir, corpus_name))
    for corpus_name, corpus_problems in zip(
        (corpusTraining, corpusEvaluation, corpusEach1),
        collectionsCorpus,
    )
]

In [9]:
# Show which languages each corpus vocabulary covers.
for corpus_vocab in vocabCorpuses:
    print(corpus_vocab.keys())

dict_keys(['en', 'fr', 'it', 'pl', 'es'])
dict_keys(['en', 'fr', 'it', 'pl', 'es'])
dict_keys(['pt', 'en'])


In [10]:
# Merge the per-corpus vocabularies into one sorted, duplicate-free token
# list per language.
vocabTotal = {}

for corpus_vocab in vocabCorpuses:
    for lang, words in corpus_vocab.items():
        merged = set(vocabTotal.get(lang, ()))
        merged.update(words)
        vocabTotal[lang] = sorted(merged)

In [11]:
# Sanity check: for every corpus and language print the vocabulary size and
# five consecutive tokens starting at a random position.
for corpus_vocab in vocabCorpuses:
    for lang, words in corpus_vocab.items():
        # random.randint includes BOTH endpoints, so the upper bound is pulled
        # back by the sample length; otherwise the slice can come out empty
        # or shorter than five tokens.
        start = random.randint(0, max(0, len(words) - 5))
        print(lang, len(words), words[start:start + 5], '\n')

en 13556 ['Jersey', 'Jerusalem', 'Jess', 'Jessica', 'Jesus'] 

fr 15656 ['ronge', 'rongeait', 'ronger', 'ronronna', 'ronronnant'] 

it 20284 ['osservando', 'osservandola', 'osservandolo', 'osservare', 'osservargli'] 

pl 34367 ['quidditcha', 'quidditchu', 'quo', 'ra', 'rabarbarowym'] 

es 21188 ['suspirar', 'suspiraron', 'suspiras', 'suspiro', 'suspiros'] 

en 13899 ['hilt', 'him', 'himself', 'hindered', 'hinges'] 

fr 17698 ['glaça', 'glissa', 'glissaient', 'glissait', 'glissant'] 

it 20690 ['mancanze', 'mancare', 'mancargli', 'mancasse', 'mancata'] 

pl 35761 ['graczy', 'grafficiarzy', 'graj', 'grajka', 'grają'] 

es 20625 ['autónoma', 'avalancha', 'avance', 'avancemos', 'avances'] 

pt 24161 ['Carro', 'Carta', 'Cartazes', 'Carteira', 'Cartilha'] 

en 19406 ['climate', 'climb', 'climbed', 'climbing', 'cling'] 



In [12]:
# Sanity check on the merged vocabulary: size and five consecutive tokens
# starting at a random position, per language.
for lang, words in vocabTotal.items():
    # random.randint includes BOTH endpoints; cap the start so the slice
    # always holds five tokens when the vocabulary has at least five.
    start = random.randint(0, max(0, len(words) - 5))
    print(lang, len(words), words[start:start + 5], '\n')

en 30149 ['blasted', 'blasters', 'blastin', 'blasting', 'blatant'] 

fr 24698 ['mortelle', 'mortellement', 'mortels', 'mortes', 'mortifié'] 

it 30291 ['accarezzandole', 'accarezzandoli', 'accarezzandolo', 'accarezzandone', 'accarezzandosi'] 

pl 54181 ['niech', 'niechcenia', 'niechciana', 'niechciane', 'niechcianego'] 

es 30807 ['pequeñín', 'pequeñísima', 'pequeñísimo', 'per', 'percata'] 

pt 24161 ['abandoná', 'abano', 'abastecem', 'abate', 'abater'] 



In [13]:
def readNLPLFile(filename):
    """Load word vectors from a zip archive stored in ``w2v_repository``.

    Parameters
    ----------
    filename : str
        Archive name inside ``w2v_repository``; the archive must contain a
        member called ``model.txt`` in word2vec text format.

    Returns
    -------
    KeyedVectors
        The loaded vectors; undecodable bytes are replaced
        (``unicode_errors='replace'``).
    """
    with zipfile.ZipFile(os.path.join(w2v_repository, filename), "r") as archive:
        # Close the member stream deterministically instead of leaving it to
        # the garbage collector.
        with archive.open("model.txt") as stream:
            model = KeyedVectors.load_word2vec_format(
                stream, binary=False, unicode_errors='replace')
    return model

In [14]:
def readGoogleFile():
    """Load 'GoogleNews-vectors-negative300.bin.gz' (binary word2vec format)
    from ``w2v_repository`` and return it as KeyedVectors."""
    google_news_path = os.path.join(
        w2v_repository, 'GoogleNews-vectors-negative300.bin.gz')
    return KeyedVectors.load_word2vec_format(google_news_path, binary=True)

In [17]:
# Folder holding the downloaded pre-trained embedding archives.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
w2v_repository = '/Users/joseeleandrocustodio/Downloads/w2v_repository/'

# Language code -> embedding archive inside w2v_repository.
# Each key appears once: the dict literal used to list 'it' twice, and a
# repeated key silently overrides the earlier entry.
embeddingsFiles = {
    'it': 'w2v Italian CoNLL17 corpus.zip',
    'fr': 'w2v French CoNLL17 corpus.zip',
    # NOTE(review): 'de' is the ISO 639-1 code for German; Dutch is 'nl'.
    #'de':'w2v Dutch CoNLL17 corpus.zip',
    'pl': 'w2v Polish CoNLL17 corpus.zip',
    'es': 'w2v Spanish CoNLL17 corpus.zip',
    'pt': 'w2v Portuguese CoNLL17 corpus.zip',
    'en': 'GoogleNews-vectors-negative300.bin.gz',
}

In [20]:
import gc

# For every language, cut the pre-trained embedding down to the words that
# occur in the corpora and store the result as a gzip-compressed word2vec
# text file (header "<count> <dimension>", then one "<word> <values...>" line
# per word) under embedding_cache/.
cacheDir = 'embedding_cache'
# gzip.open does not create missing folders; without this a fresh run fails
# only after the (slow) model load has already happened.
os.makedirs(cacheDir, exist_ok=True)

for lang, file in embeddingsFiles.items():
    print("\n\n\nLanguage %s" % lang)

    cachePath = pathjoin(cacheDir, 'w2v_' + lang + '.txt.gz')
    if os.path.exists(cachePath):
        print("cached")
        continue

    # Check before loading: a language without vocabulary used to raise
    # KeyError after the whole model had been read.
    if lang not in vocabTotal:
        print("no vocabulary collected for this language, skipping")
        continue

    print("Loading zip", end=' ')
    t0 = time()
    if '.zip' in file:
        model = readNLPLFile(file)
    else:
        model = readGoogleFile()
    print("Done in %0.3fs" % (time() - t0))

    print("Filtering model", end=' ')
    t0 = time()
    # Keep only the corpus words the embedding actually knows.
    vocabLang = {w: model[w] for w in vocabTotal[lang] if w in model}
    print("Done in %0.3fs" % (time() - t0))
    embeddingSize = model.vector_size

    # Drop the full model before writing so its memory can be reclaimed.
    del model
    gc.collect()

    print("Language %s - Vocab Total %s - Embedding found %s" % (lang, len(vocabTotal[lang]), len(vocabLang)))

    print("Writing model", end=' ')
    t0 = time()
    # Write to a temporary name and rename at the end, so an interrupted run
    # cannot leave a truncated file that the next run reports as "cached".
    tmpPath = cachePath + '.tmp'
    with gzip.open(tmpPath, 'w') as out:
        out.write(("%s %s\n" % (len(vocabLang), embeddingSize)).encode('utf-8'))
        for w in sorted(vocabLang):
            values = " ".join(str(x) for x in vocabLang[w])
            out.write(("%s %s\n" % (w, values)).encode('utf-8'))
    os.replace(tmpPath, cachePath)
    print("Done in %0.3fs" % (time() - t0))

# Audible "finished" signal via the `say` command; where that command is not
# available the call just returns a non-zero status.
os.system( "say finished" )




Language it
cached



Language fr
cached



Language pl
cached



Language es
cached



Language pt
cached



Language en
Loading zip Done in 118.733s
Filtering model Done in 2.226s
Language en - Vocab Total 30149 - Embedding found 27720
Writing model Done in 20.645s


0

In [None]:
# Embedding coverage noted from runs of the caching cell above:
# Language pt - Vocab Total 24161 - Embedding found 16971
# Language en - Vocab Total 30149 - Embedding found 27720

In [7]:
import os;
import json;
import glob;
import codecs;
import zipfile;
from pprint import pprint

from os.path import join as pathjoin;

In [4]:
# Relative path of the zipped PAN 2018 test dataset (one folder above the
# working directory).
file = "../pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20.zip"

In [26]:
import pandas as pd

In [37]:
# Tabular view of the loaded problems.
# NOTE(review): `problems` is not assigned by any cell visible in this
# notebook, so this cell relies on leftover kernel state and will fail after
# Restart & Run All - confirm where it is meant to come from.
pd.DataFrame(problems)

Unnamed: 0,candidates,encoding,language,n_authors,problem-name,unknown
0,[(hairdryer currently turned on high.\n ...,UTF-8,en,20,problem00001,[(still wearing a cool smile. He turned it on ...
1,"[(hey, perhaps he'd like to test that authorit...",UTF-8,en,15,problem00002,[(It had seemed like a good idea at the time.\...
2,[(MORE BLISSED OUT AT THE VERY FACT THAT THEY ...,UTF-8,en,10,problem00003,"[(or even worse, accept them) – and you’d foll..."
3,[(to feel Link inside of him anyways. Ever sin...,UTF-8,en,5,problem00004,[(Every night they brought them. New sacrific...
4,[(Du coin de l'œil il pouvait voir le sourire ...,UTF-8,fr,20,problem00005,"[(mêler de leurs affaires.\n\nMais voilà, elle..."
5,"[(ai pas vu s'agrandir et s'affirmer, jusqu'à ...",UTF-8,fr,15,problem00006,[(est quoi ? » Il désigna de sa canne les bou...
6,"[(qui allait tout revomir ensuite, Varus serra...",UTF-8,fr,10,problem00007,"[(était une sorte de monstre. « Sirius, ne re..."
7,[(même expérience que toi par rapport aux vamp...,UTF-8,fr,5,problem00008,"[(une entrée brusque et terrifiante, tel un va..."
8,[(aveva usato anche uno squallidissimo doppio ...,UTF-8,it,20,problem00009,[( \n\n \n\n \n\nAlbus è così intelligente che...
9,[(\n\n\n\n\n\n\n\n'Sarai mio.'\n\n\n\n\n\n\n\n...,UTF-8,it,15,problem00010,"[(giorno della gita ad Hogsmeade, sarebbe venu..."


In [None]:




def readCollectionsOfProblems(path):
    """Load every attribution problem listed in ``collection-info.json``.

    Parameters
    ----------
    path : str
        Collection folder; it must contain ``collection-info.json`` and one
        sub-folder per problem.

    Returns
    -------
    list of dict
        One dict per problem with the keys 'problem', 'language',
        'encoding', 'candidates_folder_count', 'candidates' and 'unknown'.
        'candidates' and 'unknown' hold the entries produced by
        ``read_files``. The 'encoding' value is only recorded here; this
        function does not use it when reading.
    """
    # Open the JSON file as UTF-8 explicitly rather than relying on the
    # platform default encoding.
    with open(pathjoin(path, 'collection-info.json'), 'r', encoding='utf-8') as f:
        problems = [
            {
                'problem': attrib['problem-name'],
                'language': attrib['language'],
                'encoding': attrib['encoding'],
            }
            for attrib in json.load(f)
        ]

    for problem in problems:
        problem_dir = pathjoin(path, problem['problem'])
        unk_folder, candidates_folder = readProblem(path, problem['problem'])

        problem['candidates_folder_count'] = len(candidates_folder)
        # Flatten the documents of all candidate authors into one list.
        problem['candidates'] = [
            doc
            for candidate in candidates_folder
            for doc in read_files(problem_dir, candidate)
        ]
        problem['unknown'] = read_files(problem_dir, unk_folder)

    return problems


def readProblem(path, problem):
    """Read ``problem-info.json`` of a single problem.

    Parameters
    ----------
    path : str
        Collection folder.
    problem : str
        Name of the problem sub-folder inside ``path``.

    Returns
    -------
    tuple
        (unk_folder, candidates): the value of 'unknown-folder' and the list
        of 'author-name' values, in file order.
    """
    # Open the JSON file as UTF-8 explicitly rather than relying on the
    # platform default encoding.
    with open(pathjoin(path, problem, 'problem-info.json'), 'r', encoding='utf-8') as f:
        info = json.load(f)

    unk_folder = info['unknown-folder']
    candidates = [attrib['author-name'] for attrib in info['candidate-authors']]
    return unk_folder, candidates


def read_files(path,label):
    """Read all ``*.txt`` files of ``path/label`` and tag them with ``label``.

    Parameters
    ----------
    path : str
        Problem folder.
    label : str
        Sub-folder to read; also stored as the class of every document.

    Returns
    -------
    list of list
        One ``[text, label, file name]`` entry per file, ordered by file
        path. Files are decoded as UTF-8.
    """
    texts = []
    # glob yields files in arbitrary, filesystem-dependent order; sorting
    # makes the document order reproducible between runs and machines.
    for file_path in sorted(glob.glob(pathjoin(path, label, '*.txt'))):
        # codecs.open is kept so the text (including line endings) reaches
        # the caller unchanged; the with-block also closes the file when
        # reading fails.
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            texts.append([f.read(), label, os.path.basename(file_path)])
    return texts


#*******************************************************************************************************
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


def eval_measures(gt, pred):
    """Score predicted attributions against the ground truth.

    Texts without a prediction, and predictions naming an author that does
    not occur in the ground truth, are counted as the label '<UNK>'. The
    macro-averaged scores are computed over the authors present in the
    ground truth only.

    Parameters
    ----------
    gt : dict
        Ground truth, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    pred : dict
        Predicted attribution, with the same key/value layout as `gt`

    Returns
    -------
    f1 : float
        Macro-averaged F1-score
    precision : float
        Macro-averaged precision
    recall : float
        Macro-averaged recall
    accuracy : float
        Accuracy over all texts of the ground truth
    """
    unknown_label = '<UNK>'
    encoder = LabelEncoder().fit([unknown_label] + list(gt.values()))

    # Walk the texts in sorted order so gold and silver labels stay aligned.
    text_ids = sorted(gt)
    gold_authors = [gt[text_id] for text_id in text_ids]
    # missing attributions get <UNK>:
    silver_authors = [pred.get(text_id, unknown_label) for text_id in text_ids]

    assert len(text_ids) == len(gold_authors) == len(silver_authors)

    # replace non-existent silver authors with '<UNK>':
    silver_authors = [author if author in encoder.classes_ else unknown_label
                      for author in silver_authors]

    gold_author_ints = encoder.transform(gold_authors)
    silver_author_ints = encoder.transform(silver_authors)

    # Restrict the macro averages to labels that occur in the gold standard.
    macro_options = {
        'labels': list(set(gold_author_ints)),
        'average': 'macro',
    }

    # sklearn warnings are suppressed while scoring.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1 = f1_score(gold_author_ints, silver_author_ints, **macro_options)
        precision = precision_score(gold_author_ints, silver_author_ints, **macro_options)
        recall = recall_score(gold_author_ints, silver_author_ints, **macro_options)
        accuracy = accuracy_score(gold_author_ints, silver_author_ints)

    return f1, precision, recall, accuracy


def readGroundTruh(ground_truth_file, unkowndocs):
    """Return the true author of each requested unknown document.

    Parameters
    ----------
    ground_truth_file : str
        Path of a JSON file with a 'ground_truth' list whose items carry
        the keys 'unknown-text' and 'true-author'.
    unkowndocs : iterable of str
        File names of the unknown documents.

    Returns
    -------
    list
        True authors, in the order of ``unkowndocs``.

    Raises
    ------
    KeyError
        If a requested document is not listed in the ground truth.
    """
    # Open the JSON file as UTF-8 explicitly rather than relying on the
    # platform default encoding.
    with open(ground_truth_file, 'r', encoding='utf-8') as f:
        gt = {
            attrib['unknown-text']: attrib['true-author']
            for attrib in json.load(f)['ground_truth']
        }

    return [gt[d] for d in unkowndocs]



def evaluate(ground_truth_file,predictions_file):
    """Calculate the evaluation measures for a single attribution problem.

    Parameters
    ----------
    ground_truth_file : str
        JSON file with a 'ground_truth' list whose items carry the keys
        'unknown-text' and 'true-author'.
    predictions_file : str
        JSON file with a list whose items carry the keys 'unknown-text' and
        'predicted-author'.

    Returns
    -------
    tuple
        (f1, precision, recall, accuracy) as returned by ``eval_measures``.
    """
    # Both JSON files are opened as UTF-8 explicitly rather than with the
    # platform default encoding.
    with open(ground_truth_file, 'r', encoding='utf-8') as f:
        gt = {
            attrib['unknown-text']: attrib['true-author']
            for attrib in json.load(f)['ground_truth']
        }

    pred = {}
    with open(predictions_file, 'r', encoding='utf-8') as f:
        for attrib in json.load(f):
            # When a text is predicted more than once, the first entry wins.
            if attrib['unknown-text'] not in pred:
                pred[attrib['unknown-text']] = attrib['predicted-author']

    f1, precision, recall, accuracy = eval_measures(gt, pred)
    return f1, precision, recall, accuracy