In [1]:
import json
import os
import re
from tqdm import tqdm

In [2]:
# Book series to process; this key becomes the first component of coref_params,
# which names the input folder and every generated file.
series_key = 'asoif'
#series_key = 'hp'  # alternative series key

In [4]:
# Identify the coreference run to load. In this notebook the algorithm name and
# the distance value are only used to build names (input folder, output files).
coref_algo = 'neural'
coref_dist = 3
# Run identifier shared by the input folder, the .txt corpora and the model files.
coref_params = f'{series_key}_{coref_algo}_{coref_dist}'
# Folder the JSON annotation files are read from (relative to the working directory).
output_path = f'output_{coref_params}/'

In [5]:
def read_json(key, path):
    """Load every JSON file found in ``path`` into a list, in numeric file order.

    File names are expected to end in ``_<a>_<b>.json``. Files are ordered by
    the integer pair ``(a, b)`` instead of alphabetically, so that for example
    ``..._2_10.json`` comes after ``..._2_9.json``.

    Parameters
    ----------
    key : str
        Series key. Not used by the function; kept so existing calls keep working.
    path : str
        Directory containing the JSON files. A trailing separator is optional.

    Returns
    -------
    list
        One parsed JSON document per file, in sorted order.

    Raises
    ------
    ValueError, IndexError
        If a file name in ``path`` does not end in two '_'-separated integers.
    """
    def numeric_order(filename):
        # Split once and compare the last two '_'-separated fields as integers.
        fields = filename.split('_')
        return int(fields[-2]), int(fields[-1].replace('.json', ''))

    arrayjson = []
    for file in tqdm(sorted(os.listdir(path), key=numeric_order)):
        # os.path.join keeps working when `path` has no trailing separator,
        # unlike plain string concatenation.
        with open(os.path.join(path, file), encoding='utf-8') as f:
            arrayjson.append(json.load(f))

    return arrayjson

In [6]:
# Load all annotation files of the selected run, one list entry per file
# (series_key is passed along, but read_json does not use it).
arrayjson = read_json(series_key, output_path)

100%|████████████████████████████████████████████████████████████████████████████████| 349/349 [01:39<00:00,  2.47it/s]


In [7]:
def _mark_coref_replacements(chapter, chapsent):
    """Tag the first token of every replaceable mention with its substitute.

    The token at a mention's start index is overwritten with a marker list
    ``[n_tokens_to_replace, *representative_tokens]``. The actual splice is
    done later by ``_expand_and_clean``, so the token indices of mentions that
    have not been processed yet stay valid while markers are being placed.
    """
    for corefs in chapter['corefs'].values():
        # Index of the first mention flagged as representative for this chain.
        ind0 = next(
            (i for i, mention in enumerate(corefs)
             if mention['isRepresentativeMention']),
            None,
        )
        if ind0 is None:
            # No representative mention: there is nothing to substitute with.
            continue
        rep = corefs[ind0]

        # Skip chains whose representative mention is itself a pronoun.
        if rep['type'] == 'PRONOMINAL':
            continue

        # sentNum / startIndex / endIndex are shifted by -1 to become list
        # indices and slice bounds into chapsent.
        rep_sentence = chapsent[rep['sentNum'] - 1]
        rep_tokens = rep_sentence[rep['startIndex'] - 1:rep['endIndex'] - 1]

        for i, cor in enumerate(corefs):
            if i == ind0:
                continue

            # Skip direct speech: 'I' / 'you' / 'we' are left untouched.
            if cor['type'] == 'PRONOMINAL' and cor['text'].lower() in ('i', 'you', 'we'):
                continue
            # Skip likely resolution mistakes: gender or number disagrees
            # with the representative mention.
            if rep['gender'] != cor['gender'] or rep['number'] != cor['number']:
                continue

            sentence = chapsent[cor['sentNum'] - 1]
            start = cor['startIndex'] - 1
            end = cor['endIndex'] - 1
            span_length = len(sentence[start:end])
            if span_length == 0:
                # A zero-length marker would never be consumed by the
                # expansion pass (it would loop forever), so ignore it.
                continue

            sentence[start] = [span_length, *rep_tokens]


def _expand_and_clean(sentence):
    """Splice pending markers into ``sentence`` and clean its tokens, in place.

    Punctuation-only tokens are removed and the typographic apostrophe ’ is
    replaced by a plain ' in the remaining tokens.
    """
    i = 0
    # The length is re-read on every pass because both splicing and popping
    # change it; a cached length can go stale after a splice.
    while i < len(sentence):
        elem = sentence[i]

        # Substitute a coref marker: replace `length` tokens starting here by
        # the representative tokens.
        if isinstance(elem, list):
            length = elem[0]
            sentence[i:i + length] = elem[1:]
            # Do not advance: the spliced-in tokens still have to be cleaned
            # (and may themselves contain markers copied from another chain).
            continue

        # Delete tokens that consist only of non-word characters.
        if re.sub(r'\W', '', elem) == '':
            sentence.pop(i)
        else:
            sentence[i] = elem.replace('’', "'")
            i += 1


def replace_coref(chapter, chapsent, doCoref=True):
    """Replace coreferent mentions by their representative text and clean tokens.

    Parameters
    ----------
    chapter : dict
        Annotation of one chapter. ``chapter['corefs']`` maps a chain id to a
        list of mention dicts with the keys ``isRepresentativeMention``,
        ``sentNum``, ``startIndex``, ``endIndex``, ``gender``, ``number``,
        ``type`` and ``text``. It is only read when ``doCoref`` is true.
    chapsent : list of list of str
        Tokenised sentences of the chapter. Modified in place.
    doCoref : bool, optional
        If false, no mention is replaced and only the token clean-up runs.

    Returns
    -------
    list of list
        A shallow copy of ``chapsent``; the sentence lists themselves are the
        same objects that were modified in place.

    Notes
    -----
    A mention is replaced only if the chain's representative mention is not
    pronominal, the mention is not the pronoun 'I', 'you' or 'we', and its
    gender and number equal those of the representative mention.
    """
    if doCoref:
        _mark_coref_replacements(chapter, chapsent)

    for sentence in chapsent:
        _expand_and_clean(sentence)

    return chapsent[:]

def preprocess_data(arrayjson, doCoref=True):
    """Turn annotated chapters into one flat list of cleaned token lists.

    For every chapter the ``originalText`` of each token is collected sentence
    by sentence, the sentences are passed through ``replace_coref`` and the
    result is appended to a single list covering all chapters.

    Parameters
    ----------
    arrayjson : iterable of dict
        Chapters, each with a ``sentences`` list whose items hold ``tokens``.
    doCoref : bool, optional
        Forwarded to ``replace_coref``.

    Returns
    -------
    list of list of str
        All processed sentences, in chapter order.
    """
    processed = []

    for chapter in tqdm(arrayjson):
        tokenized = [
            [token['originalText'] for token in sent['tokens']]
            for sent in chapter['sentences']
        ]
        processed.extend(replace_coref(chapter, tokenized, doCoref))

    return processed

In [8]:
# Build both corpora from the same chapters:
#   dataNo    - token clean-up only (doCoref=False)
#   dataCoref - mentions additionally replaced by their representative text
dataNo = preprocess_data(arrayjson, False)
dataCoref = preprocess_data(arrayjson)

100%|████████████████████████████████████████████████████████████████████████████████| 349/349 [01:20<00:00,  4.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 349/349 [00:07<00:00, 46.96it/s]


## Save to txt

In [9]:
# Write each corpus as plain text: one sentence per line, tokens joined by a
# single space, sentences that ended up empty are left out.
# Text is stored as UTF-8 without touching non-ASCII characters. Alternatives
# if ASCII-only files are ever needed:
#   - unidecode(text) replaces non-ASCII characters
#   - text.encode('ascii', 'ignore').decode() deletes them
for out_name, sentences in (
    (f'{coref_params}_dataNo.txt', dataNo),
    (f'{coref_params}_dataCoref.txt', dataCoref),
):
    with open(out_name, 'w', encoding='utf-8') as f:
        lines = [' '.join(sent) for sent in sentences if sent != []]
        f.write('\n'.join(lines))

## Train models

In [10]:
# Common prefix (folder + run identifier) for every model file written below.
# NOTE(review): hard-coded relative path; the folder presumably has to exist
# before the save cells run - confirm.
pathtosave = f'./githubs/digitalhumanities_social_network_extraction-master/models/my_{coref_params}_'

### Word2Vec

In [11]:
from gensim.models import Word2Vec

In [12]:
# Baseline models: library defaults except for 300-dimensional vectors.
# NOTE(review): `size` (and `iter` in the later cells) are the gensim < 4.0
# keyword names; newer gensim calls them `vector_size` / `epochs` - confirm
# the installed version before re-running.
# NOTE(review): no seed is passed, so repeated runs will presumably not give
# identical vectors.
%time word2vec1 = Word2Vec(dataNo, size=300)
%time word2vec2 = Word2Vec(dataCoref, size=300)

Wall time: 18.7 s
Wall time: 20.1 s


In [13]:
# Export only the word vectors of the baseline models in word2vec format.
word2vec1.wv.save_word2vec_format(pathtosave + "w2v-default-No.model")
word2vec2.wv.save_word2vec_format(pathtosave + "w2v-default-Coref.model")

In [14]:
# Skip-gram (sg=1) with negative sampling (negative=15), context window 12,
# 15 training iterations.
%time word2vec3 = Word2Vec(dataNo, size=300, window=12, iter=15, sg=1, negative=15)
%time word2vec4 = Word2Vec(dataCoref, size=300, window=12, iter=15, sg=1, negative=15)

Wall time: 8min 50s
Wall time: 9min 28s


In [15]:
# Export the word vectors of the skip-gram / negative-sampling models.
word2vec3.wv.save_word2vec_format(pathtosave + "w2v-ns-No.model")
word2vec4.wv.save_word2vec_format(pathtosave + "w2v-ns-Coref.model")

In [16]:
# Skip-gram (sg=1) with hierarchical softmax (hs=1), context window 12,
# 15 training iterations.
# NOTE(review): `negative` is left at its default here; in gensim that default
# is non-zero, so negative sampling presumably stays active alongside hs=1.
# Pass negative=0 if a pure hierarchical-softmax model is intended - confirm.
%time word2vec5 = Word2Vec(dataNo, size=300, window=12, iter=15, sg=1, hs=1)
%time word2vec6 = Word2Vec(dataCoref, size=300, window=12, iter=15, sg=1, hs=1)

Wall time: 7min 42s
Wall time: 8min 5s


In [17]:
# Export the word vectors of the skip-gram / hierarchical-softmax models.
word2vec5.wv.save_word2vec_format(pathtosave + "w2v-hs-No.model")
word2vec6.wv.save_word2vec_format(pathtosave + "w2v-hs-Coref.model")

In [18]:
# Same window / iterations / negative sampling as the "ns" models, but `sg` is
# left at its default (CBOW in gensim, matching the file names used on export).
%time word2vec7 = Word2Vec(dataNo, size=300, window=12, iter=15, negative=15)
%time word2vec8 = Word2Vec(dataCoref, size=300, window=12, iter=15, negative=15)

Wall time: 1min 36s
Wall time: 1min 44s


In [19]:
# Export the word vectors of the CBOW models.
word2vec7.wv.save_word2vec_format(pathtosave + "w2v-CBOW-No.model")
word2vec8.wv.save_word2vec_format(pathtosave + "w2v-CBOW-Coref.model")

### FastText

In [21]:
from fastText import FastText

In [22]:
%%time
# fastText skip-gram model trained on the dataNo text file, with the same
# dimension / window / epochs / negatives as the Word2Vec "ns" models.
# (%%time has to stay the first line of the cell.)
ft1 = FastText.train_unsupervised(f'{coref_params}_dataNo.txt', model='skipgram', dim=300, ws=12, epoch=15, neg=15)

Wall time: 27min 35s


In [23]:
%%time
# fastText skip-gram model trained on the dataCoref text file, with the same
# dimension / window / epochs / negatives as the Word2Vec "ns" models.
# (%%time has to stay the first line of the cell.)
ft2 = FastText.train_unsupervised(f'{coref_params}_dataCoref.txt', model='skipgram', dim=300, ws=12, epoch=15, neg=15)

Wall time: 30min 18s


In [24]:
def write_ft_model(ft, mname):
    """Dump the word vectors of a fastText model to a UTF-8 text file.

    The file is written to ``pathtosave + mname``. Its first line holds the
    number of words and the vector dimension; every following line holds one
    word followed by the components of its vector, all separated by spaces.

    Parameters
    ----------
    ft : object
        Model exposing ``get_words()``, ``get_dimension()`` and
        ``get_word_vector(word)``.
    mname : str
        File name appended to the ``pathtosave`` prefix.
    """
    with open(pathtosave + mname, 'w', encoding='utf-8') as out:
        header = (len(ft.get_words()), ft.get_dimension())
        out.write(' '.join(map(str, header)) + '\n')

        for word in tqdm(ft.get_words()):
            fields = (word, *ft.get_word_vector(word))
            out.write(' '.join(map(str, fields)) + '\n')

In [25]:
# Export both fastText models as text files under the same pathtosave prefix
# as the Word2Vec exports.
write_ft_model(ft1, 'ft-No.model')
write_ft_model(ft2, 'ft-Coref.model')

100%|████████████████████████████████████████████████████████████████████████████| 12790/12790 [08:25<00:00, 25.31it/s]
100%|██████████████████████████████████████████████████████████████████████████| 13071/13071 [00:07<00:00, 1847.44it/s]
