In [1]:
import gensim
from  gensim.models import KeyedVectors
import gzip
import zipfile
import os
import re
import random
import glob
from time import time
import pandas as pd
from os.path import join as pathjoin

In [2]:
# Root folder that contains the corpus directories named below and the output folder.
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
baseDir = '/Users/joseeleandrocustodio/dataScienceNotebook/PANAA2018'

# Corpus directory names; each one is joined onto baseDir when it is read.
corpusTraining = 'pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02'
corpusEvaluation = 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20'
corpusEach1 = 'AvaliacaoPT'

# Corpus used as the input directory for this run.
currentCorpus = corpusEach1

inputDir = pathjoin(baseDir, currentCorpus)
outputDir = pathjoin(baseDir, 'out')

# Create the output folder on first use; os.mkdir requires baseDir to exist already.
if not os.path.exists(outputDir):
    os.mkdir(outputDir)

In [3]:
import pan

In [4]:
def problemDoc(inputDir,problem):
    """Split one problem into its candidate and unknown document sets.

    Parameters
    ----------
    inputDir : str
        Directory that holds one sub-directory per problem.
    problem : mapping
        Must provide the keys ``'candidates'``, ``'unknown'`` and ``'problem'``.
        ``'candidates'`` and ``'unknown'`` are iterables of 3-item records; an
        empty iterable makes the three-way unpacking below fail with ValueError.

    Returns
    -------
    tuple
        ``(train_docs, train_labels, test_docs, test_labels, test_filename)``.
        The first, second, third and fifth entries are tuples produced by
        ``zip``; ``test_labels`` is whatever ``pan.readGroundTruh`` returns.
    """
    # Candidate records: item 0 is used as the document, item 1 as its label.
    candidate_docs, candidate_labels, _ = zip(*problem['candidates'])
    # Unknown records: item 0 is used as the document, item 2 as the file name.
    unknown_docs, _, unknown_files = zip(*problem['unknown'])

    # The ground truth is read from <inputDir>/<problem name>/ground-truth.json.
    truth_path = pathjoin(inputDir, problem['problem'], 'ground-truth.json')
    unknown_labels = pan.readGroundTruh(truth_path, unknown_files)

    return candidate_docs, candidate_labels, unknown_docs, unknown_labels, unknown_files

In [5]:
# Compiled patterns whose matches are stripped from a document before tokenising.
regex_cleaning = [
    re.compile(r'[\u4E00-\u9FA5]+'),  # chinese
    re.compile(r'[\u3040-\u309F]+'),  # hiragana
]

In [6]:
def problemsToTokens(problems, inputDir):
    """Build the sorted, de-duplicated word vocabulary of every language.

    Parameters
    ----------
    problems : iterable of mapping
        Each problem must provide a ``'language'`` key plus whatever
        ``problemDoc`` reads from it.
    inputDir : str
        Directory passed through to ``problemDoc``.

    Returns
    -------
    dict
        Maps a language code to the sorted list of distinct tokens found in the
        candidate and unknown documents of all problems in that language.
        Languages appear in the order in which they are first encountered.
    """
    # A token is a run of at least two word characters delimited by word boundaries.
    word_pattern = re.compile(r"(?u)\b\w\w+\b")

    def clean(text):
        # Remove every span matched by the module-level cleaning patterns.
        for cleaner in regex_cleaning:
            text = cleaner.sub('', text)
        return text

    # Accumulate into sets and sort once at the end, instead of rebuilding and
    # re-sorting the whole per-language list after every single problem.
    vocab_by_lang = {}
    for problem in problems:
        lang = problem['language']
        # Problems tagged 'sp' are stored under the key 'es'.
        if lang == 'sp':
            lang = 'es'
        train_docs, _, test_docs, _, _ = problemDoc(inputDir, problem)

        words = vocab_by_lang.setdefault(lang, set())
        for docs in (train_docs, test_docs):
            for doc in docs:
                words.update(word_pattern.findall(clean(doc)))

    return {lang: sorted(words) for lang, words in vocab_by_lang.items()}

In [7]:
# One collection of problems per corpus, in the order training, evaluation, each1.
collectionsCorpus = [
    pan.readCollectionsOfProblems(pathjoin(baseDir, corpusName))
    for corpusName in (corpusTraining, corpusEvaluation, corpusEach1)
]


In [8]:
# Per-corpus vocabularies: each entry is the {language: tokens} dict returned by
# problemsToTokens for the corpus at the same position in collectionsCorpus.
vocabCorpuses = [
    problemsToTokens(corpusProblems, pathjoin(baseDir, corpusName))
    for corpusProblems, corpusName in zip(
        collectionsCorpus,
        (corpusTraining, corpusEvaluation, corpusEach1),
    )
]

In [9]:
# Show which language keys each corpus vocabulary contains.
for corpusVocab in vocabCorpuses:
    print(corpusVocab.keys())

dict_keys(['en', 'fr', 'it', 'pl', 'es'])
dict_keys(['en', 'fr', 'it', 'pl', 'es'])
dict_keys(['pt', 'en'])


In [10]:
# Merge the per-corpus vocabularies into one sorted, de-duplicated word list
# per language.
vocabTotal = {}

for corpusVocab in vocabCorpuses:
    for lang, words in corpusVocab.items():
        alreadyKnown = vocabTotal.get(lang, [])
        vocabTotal[lang] = sorted(set(alreadyKnown).union(words))

In [11]:
# Spot check: for every corpus and language print the vocabulary size and a
# random window of five consecutive words.
for corpusVocab in vocabCorpuses:
    for lang, words in corpusVocab.items():
        # random.randint(0, len(words)) is inclusive of len(words), so the window
        # could start at or near the end and come back empty or short. Restrict
        # the start so a full five-word window fits whenever the list has at
        # least five words (shorter lists are printed from index 0).
        start = random.randrange(max(1, len(words) - 4))
        print(lang, len(words), words[start:(start+5)], '\n')

en 13556 ['Jersey', 'Jerusalem', 'Jess', 'Jessica', 'Jesus'] 

fr 15656 ['ronge', 'rongeait', 'ronger', 'ronronna', 'ronronnant'] 

it 20284 ['osservando', 'osservandola', 'osservandolo', 'osservare', 'osservargli'] 

pl 34367 ['quidditcha', 'quidditchu', 'quo', 'ra', 'rabarbarowym'] 

es 21188 ['suspirar', 'suspiraron', 'suspiras', 'suspiro', 'suspiros'] 

en 13899 ['hilt', 'him', 'himself', 'hindered', 'hinges'] 

fr 17698 ['glaça', 'glissa', 'glissaient', 'glissait', 'glissant'] 

it 20690 ['mancanze', 'mancare', 'mancargli', 'mancasse', 'mancata'] 

pl 35761 ['graczy', 'grafficiarzy', 'graj', 'grajka', 'grają'] 

es 20625 ['autónoma', 'avalancha', 'avance', 'avancemos', 'avances'] 

pt 24161 ['Carro', 'Carta', 'Cartazes', 'Carteira', 'Cartilha'] 

en 19406 ['climate', 'climb', 'climbed', 'climbing', 'cling'] 



In [12]:
# Spot check of the merged vocabulary: size and a random window of five
# consecutive words per language.
for lang, words in vocabTotal.items():
    # random.randint(0, len(words)) is inclusive of len(words), so the window
    # could come back empty or short. Restrict the start so a full five-word
    # window fits whenever the list has at least five words.
    start = random.randrange(max(1, len(words) - 4))
    print(lang, len(words), words[start:(start+5)], '\n')

en 30149 ['blasted', 'blasters', 'blastin', 'blasting', 'blatant'] 

fr 24698 ['mortelle', 'mortellement', 'mortels', 'mortes', 'mortifié'] 

it 30291 ['accarezzandole', 'accarezzandoli', 'accarezzandolo', 'accarezzandone', 'accarezzandosi'] 

pl 54181 ['niech', 'niechcenia', 'niechciana', 'niechciane', 'niechcianego'] 

es 30807 ['pequeñín', 'pequeñísima', 'pequeñísimo', 'per', 'percata'] 

pt 24161 ['abandoná', 'abano', 'abastecem', 'abate', 'abater'] 



In [13]:
def readNLPLFile(filename):
    """Load word vectors from the ``model.txt`` member of a zip archive.

    Parameters
    ----------
    filename : str
        Name of the zip archive, resolved relative to the module-level
        ``w2v_repository`` directory.

    Returns
    -------
    The object returned by ``KeyedVectors.load_word2vec_format`` for the
    text-format (``binary=False``) member, with undecodable bytes replaced
    (``unicode_errors='replace'``).
    """
    archive_path = os.path.join(w2v_repository, filename)
    with zipfile.ZipFile(archive_path, "r") as archive:
        # Open the member in its own `with` so the stream is closed as soon as
        # loading finishes (or fails), rather than whenever the stream object
        # happens to be garbage-collected.
        with archive.open("model.txt") as stream:
            model = KeyedVectors.load_word2vec_format(stream, binary=False, unicode_errors='replace')
    return model

In [14]:
def readGoogleFile():
    """Load the binary GoogleNews vectors from the ``w2v_repository`` directory.

    Returns the object produced by ``KeyedVectors.load_word2vec_format`` with
    ``binary=True`` for ``GoogleNews-vectors-negative300.bin.gz``.
    """
    google_path = os.path.join(w2v_repository, 'GoogleNews-vectors-negative300.bin.gz')
    return KeyedVectors.load_word2vec_format(google_path, binary=True)

In [17]:
# Directory that holds the downloaded pre-trained embedding archives.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
w2v_repository = '/Users/joseeleandrocustodio/Downloads/w2v_repository/'

# Language code -> embedding file name inside w2v_repository.
# The 'it' entry used to be listed twice with the same value; a dict literal
# silently keeps only one entry per key, so the duplicate has been removed.
embeddingsFiles = {
    'it':'w2v Italian CoNLL17 corpus.zip',
    'fr':'w2v French CoNLL17 corpus.zip',
    # NOTE(review): 'de' is the ISO 639-1 code for German; Dutch would be 'nl'.
    #'de':'w2v Dutch CoNLL17 corpus.zip',
    'pl':'w2v Polish CoNLL17 corpus.zip',
    'es': 'w2v Spanish CoNLL17 corpus.zip',
    'pt': 'w2v Portuguese CoNLL17 corpus.zip',
    'en':'GoogleNews-vectors-negative300.bin.gz'
}

In [20]:
import gc

# For every language, keep only the pre-trained vectors of words that occur in
# vocabTotal and store them as a gzip-compressed text file: a header line
# "<word count> <vector size>" followed by one "<word> <values...>" line per word.
# Languages whose cache file already exists are skipped.
cacheDir = 'embedding_cache'
# gzip.open does not create missing directories, so make sure the folder exists.
os.makedirs(cacheDir, exist_ok=True)

for lang, file in embeddingsFiles.items():
    print("\n\n\nLanguage %s" % lang)

    cachePath = pathjoin(cacheDir, 'w2v_'+lang+'.txt.gz')
    if os.path.exists(cachePath):
        print("cached")
        continue

    print("Loading zip", end=' '); t0 = time()
    if '.zip' in file:
        model = readNLPLFile(file)
    else:
        # readGoogleFile takes no argument and always loads the GoogleNews file.
        model = readGoogleFile()
    print("Done in %0.3fs" % (time() - t0))

    print("Filtering model", end=' '); t0 = time()
    # Keep only the vocabulary words for which the model has a vector.
    vocabLang = {w: model[w] for w in vocabTotal[lang] if w in model}
    print("Done in %0.3fs" % (time() - t0))
    embeddingSize = model.vector_size

    # Release the full model before writing; only the filtered vectors are needed.
    del model
    gc.collect()

    print("Language %s - Vocab Total %s - Embedding found %s" %(lang, len(vocabTotal[lang]), len(vocabLang)))

    print("Writing model", end=' '); t0 = time()
    # Write to a temporary name and rename when complete, so an interrupted run
    # cannot leave a truncated file that the next run would report as "cached".
    partialPath = cachePath + '.part'
    with gzip.open(partialPath, 'w') as out:
        out.write(("%s %s\n"%(len(vocabLang),embeddingSize)).encode('utf-8'))
        for w in sorted(vocabLang):
            values = " ".join([str(x) for x in vocabLang[w]])
            line = "%s %s\n" % (w, values)
            out.write(line.encode('utf-8'))
    os.replace(partialPath, cachePath)
    print("Done in %0.3fs" % (time() - t0))
# Audible notification when the loop is done; `say` is a macOS command, so on
# other systems this only returns a non-zero exit status.
os.system( "say finished" )




Language it
cached



Language fr
cached



Language pl
cached



Language es
cached



Language pt
cached



Language en
Loading zip Done in 118.733s
Filtering model Done in 2.226s
Language en - Vocab Total 30149 - Embedding found 27720
Writing model Done in 20.645s


0

In [None]:
# Embedding coverage recorded from earlier runs of the caching cell:
# Language pt - Vocab Total 24161 - Embedding found 16971
# Language en - Vocab Total 30149 - Embedding found 27720