In [None]:
import pandas as pd
from IPython.core.display import display
from deep_translator import GoogleTranslator
from spacy.lang.en import English
from spacy.lang.sv import Swedish
import os

# spaCy language objects shared by the cells below.
# nlp_sv tokenizes the Swedish translations in the translation cell;
# nlp_en is only referenced from commented-out code in this notebook.
nlp_en = English()
nlp_sv = Swedish()

In [None]:
def set_pandas_display_options() -> None:
    """Configure how pandas renders DataFrames in this notebook.

    Limits the output to 100 columns and 10 rows, allows up to 400
    characters per cell and a total display width of 2000 characters.
    """
    # Ref: https://stackoverflow.com/a/52432757/
    settings = {
        "display.max_columns": 100,
        "display.max_rows": 10,
        "display.max_colwidth": 400,
        "display.width": 2000,
        # "display.precision": 2,  # set as needed
    }
    for option_name, option_value in settings.items():
        pd.set_option(option_name, option_value)


set_pandas_display_options()

In [None]:
# Print the full saved IPython input history (all sessions); the output may be very long
%history -g

In [None]:
# In SUC core the data is already split into sentences (all conll?), so no sentence splitting is needed here.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle('../data/coref_ontonotes.pkl')
# Each entry of 'Text' is iterated below as a list of sentences, each sentence being a list of tokens.
texts = df['Text']
# Accumulators that the translation cell and the parallel-file cell append to.
all_tokens_org = []
all_tokens_trans = []
# Despite the name this is the 'Sentence_lengths' column of df; later cells iterate it as per-text lists of lengths.
sentence_lengths_dict = df['Sentence_lengths']
display(df)

In [None]:
# Translate every English sentence to Swedish, tokenize the translation with
# spaCy and append "source ||| target" lines to the parallel-corpus file.
# Deep translator takes a long time to run as it sleeps 2 seconds between every request
# NOTE(review): unlike the other data paths in this notebook, this one has no
# '../' prefix and no file extension -- confirm it is the intended output file.
translator = GoogleTranslator(source='en', target='sv')
with open('data/alignment/p', 'a') as writer:
    for sentences in texts:
        for tokens_org in sentences:
            sentence = ' '.join(tokens_org)
            try:
                translated_sent = translator.translate(text=sentence)
                # A non-string result (e.g. None) raises here and takes the fallback below.
                spacy_doc_sv = nlp_sv(translated_sent.replace('\n', ' '))
                # Materialise the tokens: the list is both stored and joined below,
                # which a one-shot iterator cannot support.
                tokens_trans = [token.text for token in spacy_doc_sv]
            except Exception:
                # Best effort: keep going, but do not swallow KeyboardInterrupt.
                print('not valid sentence: ', sentence)
                # Fall back to the untranslated sentence, as a token list so that
                # joining it reproduces the sentence rather than its characters.
                translated_sent = sentence
                tokens_trans = list(tokens_org)

            all_tokens_org.append(tokens_org)
            all_tokens_trans.append(tokens_trans)

            # NOTE(review): sentences that are empty on either side are not written,
            # so the file can hold fewer lines than there are sentences.
            if translated_sent.strip() and sentence.strip():
                writer.write(sentence + ' ||| ' + ' '.join(tokens_trans) + '\n')

In [None]:
# Load an existing parallel corpus (one "source ||| target" pair per line)
# into the token lists and compute the sentence lengths on both sides.
# NOTE(review): this appends to all_tokens_org / all_tokens_trans, so anything
# already collected by an earlier cell (or an earlier run of this cell) is kept.
parallel_path = '../data/alignment/parallel_small.txt'
with open(parallel_path, 'r') as reader:
    # Iterate lazily instead of loading the whole file with readlines().
    for line_number, line in enumerate(reader, start=1):
        if not line.strip():
            # Blank lines (e.g. a trailing newline at the end of the file) carry no pair.
            continue
        sents = line.split('|||')
        if len(sents) < 2:
            raise ValueError(
                '{}:{}: expected "source ||| target", got {!r}'.format(parallel_path, line_number, line)
            )
        all_tokens_org.append(sents[0].split())
        all_tokens_trans.append(sents[1].split())
all_sentence_len_org = list(map(len, all_tokens_org))
all_sentence_len_trans = list(map(len, all_tokens_trans))



In [None]:
# Add column with translated sentences and their lengths to df
# The parallel file is consumed sequentially: one line per sentence, in the
# same order as the sentences of df['Text'].
parallel_path = '../data/alignment/parallel_ontonotes.txt'
all_sentence_len_org = sentence_lengths_dict
all_sentence_len_trans = []
all_texts_org = texts
all_texts_trans = []
with open(parallel_path, 'r') as reader:
    line_number = 0
    for text in all_texts_org:
        text_trans = []
        for _sentence in text:
            line_number += 1
            # readline() returns '' at end of file, which would otherwise surface
            # as an opaque IndexError when picking the translated side.
            parts = reader.readline().split('|||')
            if len(parts) < 2:
                raise ValueError(
                    '{}:{}: expected "source ||| target" (file has fewer or malformed '
                    'lines than there are sentences)'.format(parallel_path, line_number)
                )
            text_trans.append(parts[1].split())
        all_texts_trans.append(text_trans)
        all_sentence_len_trans.append([len(sentence_trans) for sentence_trans in text_trans])
df['Text_trans'] = all_texts_trans
df['Sentence_lengths_trans'] = all_sentence_len_trans
display(df)

        

In [None]:
# Reference copies of the shell commands for the word-alignment step.
# They are plain string literals: nothing in this cell is executed.
# NOTE(review): presumably these were run from a shell outside the notebook, and the
# paths (suc.*) differ from the ontonotes.sym file read below -- confirm before reuse.
# Forward/reverse alignment with eflomal for the full parallel file ...
"align_command = '../eflomal/align.py -i data/alignment/parallel.txt --priors data/alignment/sv-en2.priors --model 3 \\ -f data/alignment/suc.fwd -r data/alignment/suc.rev'"
# ... and for the small parallel file (opening quote restored so the cell parses).
"'../eflomal/align.py -i data/alignment/parallel_small.txt --priors data/alignment/sv-en2.priors --model 3 \\ -f data/alignment/suc_small.fwd -r data/alignment/suc_small.rev'"
# Symmetrisation of the forward and reverse alignments with atools.
"sym_command = '../word-alignment/python-test/fast_align/build/atools -c grow-diag-final-and -i data/alignment/suc.fwd -j data/alignment/suc.rev >data/alignment/suc.sym'"

In [None]:
# Read the symmetrised alignment file and attach two columns to df:
#   'Alignments'     - per text, the list of [source, target] index pairs of every sentence
#   'Alignment_dict' - per text, a mapping from text-level source token index to
#                      text-level target token index
sym_file_path = '../data/alignment/ontonotes.sym'
all_sentence_len_org = df['Sentence_lengths']
all_sentence_len_trans = df['Sentence_lengths_trans']
all_formatted_alignments = []
alignment_dicts = []
with open(sym_file_path, 'r') as reader:
    for lengths_org, lengths_trans in zip(all_sentence_len_org, all_sentence_len_trans):
        text_pairs = []
        text_mapping = {}
        # Number of tokens in the sentences already processed, on each side.
        shift_org = 0
        shift_trans = 0
        for length_org, length_trans in zip(lengths_org, lengths_trans):
            # One line per sentence, made of pairs such as "0-0 1-1 2-1 ..."
            # whose indices are relative to the sentence.
            sentence_pairs = []
            for raw_pair in reader.readline().split():
                sentence_pairs.append([int(part) for part in raw_pair.split('-')])
            text_pairs.append(sentence_pairs)

            # Shift the sentence-relative indices to text-level positions.
            # A source index that occurs in several pairs keeps the last target.
            for pair in sentence_pairs:
                text_mapping[pair[0] + shift_org] = pair[1] + shift_trans

            shift_org += length_org
            shift_trans += length_trans
        all_formatted_alignments.append(text_pairs)
        alignment_dicts.append(text_mapping)
df['Alignments'] = all_formatted_alignments
df['Alignment_dict'] = alignment_dicts
display(df)

In [None]:
# Project the coreference clusters from the original language onto the
# translated text via the token alignment, then save the enriched frame.
cluster_lists_org = df['Coref_Clusters']
sent_lengths_org = df['Sentence_lengths']
sent_lengths_trans = df['Sentence_lengths_trans']
alignment_dicts = df['Alignment_dict']
cluster_lists_trans = []

for text_clusters, token_map, _lengths_org, _lengths_trans in zip(cluster_lists_org, alignment_dicts, sent_lengths_org, sent_lengths_trans):
    projected_clusters = []

    for mentions in text_clusters:
        projected_mentions = []
        # check span, mentions
        for first, last in mentions:
            # Target positions of every token of the (inclusive) span that has an alignment.
            targets = [token_map.get(position) for position in range(first, last + 1) if position in token_map]
            if not targets:
                # No token of this mention is aligned: the mention is dropped.
                continue
            projected_mentions.append([min(targets), max(targets)])
        projected_clusters.append(projected_mentions)
    cluster_lists_trans.append(projected_clusters)
df['Coref_clusters_trans'] = cluster_lists_trans
display(df)
df.to_pickle('../data/coref_with_translations.pkl')

In [None]:
def flatten(list):
    """Return a new list with the items of every sub-iterable, one level deep.

    The parameter shadows the built-in ``list``; the name is kept so the
    signature stays the same.
    """
    merged = []
    for group in list:
        merged.extend(group)
    return merged

In [None]:
# Print every translated text followed by the words of each projected cluster

texts_org = df['Text']
clusters_col_org = df['Coref_Clusters']
texts_trans = df['Text_trans']
clusters_col_trans = df['Coref_clusters_trans']

for sentences, text_clusters in zip(texts_trans, clusters_col_trans):
    words = flatten(sentences)
    print(' '.join(words))
    for mentions in text_clusters:
        # Each mention is an inclusive [start, end] span into the flattened text.
        rendered = []
        for first, last in mentions:
            rendered.append(str(words[first:last + 1]) + ', ')
        print(''.join(rendered))
        print('\n')

In [None]:
# Print words aligned with each other
# For every text, show each aligned (original token, translated token) pair.

texts_org = df['Text']
texts_trans = df['Text_trans']
alignment_dicts = df['Alignment_dict']

for text_org, text_trans, align_dict in zip(texts_org, texts_trans, alignment_dicts):
    flat_text_org = flatten(text_org)
    flat_text_trans = flatten(text_trans)
    # Keys and values are text-level token positions on the original and translated side.
    for key, value in align_dict.items():
        try:
            print(flat_text_org[key], '\t', flat_text_trans[value])
        except IndexError:
            # An alignment index beyond the end of a text means the alignment and the
            # text are out of sync; report it and continue instead of hiding every
            # possible error (including KeyboardInterrupt) behind a bare except.
            print("Error in alignment:", flat_text_org, key)
