In [35]:
import re
import treetaggerwrapper
import os
from nltk import word_tokenize

In [17]:
def read_text_file(txt_file):
    with open(txt_file, encoding='utf-8') as txt_data:
        txt = txt_data.read()
    return txt

In [18]:
txt = read_text_file("1 Goethe_Die_Leiden_des_jungen_Werthers_1774.txt")

In [21]:
def tokenize_lemmatize_text(txt, txt_file, lemmatize="treetagger", text_language="German"):
    
    exlude_from_text = [".", "!", "?", ",", ":", "-", "'", "»", "«", "’", "“", '"']
    if lemmatize == "treetagger" and text_language== "German":
        tt = treetaggerwrapper.TreeTagger(TAGLANG='de')
        txt_pos = tt.tag_text(txt)

        txt_word_pos_lemma = list()
        for word_pos_lemma in txt_pos:
            txt_word_pos_lemma_split = re.split("\t", word_pos_lemma)
            txt_word_pos_lemma += [txt_word_pos_lemma_split]

        text_lemmatized = list()
        for i in txt_word_pos_lemma:
            try:
                if (i[2] == '<unknown>' or i[2] == '@card@'):
                    text_lemmatized += [i[0]]
                if (i[2]) == "Sie|sie":
                    text_lemmatized += ["sie"]
                else:
                    if i[2] not in exlude_from_text:
                        text_lemmatized += [i[2]]
            except IndexError:
                pass

    return text_lemmatized



In [25]:
text_tokenized=tokenize_lemmatize_text(txt=txt, txt_file="1 Goethe_Die_Leiden_des_jungen_Werthers_1774.txt")

In [26]:
def read_characters(char_file):
    with open(char_file, encoding='UTF-8') as char_data:
        doc = char_data.readlines()
        character_list = list()
        for line in doc:
            splitted_line = line.split("\t")
            splitted_line[-1] = splitted_line[-1].strip()
            splitted_line[0] = splitted_line[0].strip("\ufeff")
            character_list += [splitted_line]
        return character_list

# reads the characters line by line
# (each line contains one character, with tabstop separated synonyms)

In [30]:
character_list = read_characters("2 Goethe_Werther_74_characters.txt")

In [36]:
def tokenize_characters(character_list):
    characters_list_tokenized = list()
    for single_character_list in character_list:
        new_single_char_list = list()
        for i in single_character_list:
            new_single_char_list += [word_tokenize(i)]
        characters_list_tokenized += [new_single_char_list]
    #Problem: sometimes tokenisation includes dotes with single characters: ["Graf", "R."] sometimes not: ["N.", "N", "."]
    #remove all dots:
    characters_list_tokenized_without_dots = list()
    for single_character_list in characters_list_tokenized:
        new_single_char_list_2 = list()
        for synonym_liste in single_character_list:
            # get all indices for elements that contain only "." and remove these elements:
            remove_indices = [item for item in range(len(synonym_liste)) if synonym_liste[item] == "."]
            # return only that indices that should not be removed:
            new_single_char_list_2 += [[i for j, i in enumerate(synonym_liste) if j not in remove_indices]]
        characters_list_tokenized_without_dots += [new_single_char_list_2]

    characters_list_tokenized_without_dots_stripped = list()
    for single_character_list in characters_list_tokenized_without_dots:
        new_single_char_list = list()
        for synonym_liste in single_character_list:
            new_synonym_list = list()
            for i in synonym_liste:
                new_synonym_list += [i.strip(".")]
            new_single_char_list += [new_synonym_list]
        characters_list_tokenized_without_dots_stripped += [new_single_char_list]

    return characters_list_tokenized_without_dots_stripped

# tokenizes characters

## data scheme:
## [ [ [char_a], [a_synonym_1], [a_synonym_2_token_i, a_synonym_2_token_ii], ... ], [ [char_b], ... ] ]

In [51]:
characters_tokenized = tokenize_characters(character_list)

In [61]:
def find_character_positions(text_tokenized, characters_tokenized):
    characters_indices = list()
    for character_list in characters_tokenized:
        character_index = list()
        for character_synonym in character_list:
            if len(character_synonym) == 1:
                character_index += [[item] for item in range(len(text_tokenized)) if
                                    text_tokenized[item] == character_synonym[0]]
            elif len(character_synonym) > 1:
                for word_index in range(len(text_tokenized)):
                    if text_tokenized[word_index:word_index + (len(character_synonym))] == character_synonym:
                        character_index += [[i for i in range(word_index, word_index + (len(character_synonym)))]]
        characters_indices += [character_index]
    return characters_indices
    # finds token positions (indices in txt) for each character
    ## scheme:
    ##       char_1                  char_2
    ## [ [ [3], [8, 9], ... ], [ [12], [22, 23], ... ], ... ]

In [63]:
find_character_positions(text_tokenized, characters_tokenized)

[[[7413],
  [16496],
  [17179],
  [20556],
  [20749],
  [24989],
  [24991],
  [25036],
  [26607],
  [26619],
  [27523],
  [27558],
  [28221],
  [28312],
  [28412],
  [28431],
  [28618],
  [28653],
  [29216],
  [29424],
  [29585],
  [32317],
  [32486],
  [32494],
  [32505],
  [32550],
  [32575],
  [32715],
  [33477],
  [33762],
  [7],
  [22],
  [27370],
  [27430],
  [27469],
  [32220],
  [32274],
  [33560],
  [33601],
  [33931],
  [16],
  [28],
  [105],
  [107],
  [122],
  [127],
  [134],
  [162],
  [165],
  [174],
  [192],
  [196],
  [202],
  [203],
  [227],
  [242],
  [246],
  [249],
  [251],
  [270],
  [279],
  [336],
  [348],
  [379],
  [391],
  [414],
  [429],
  [465],
  [466],
  [598],
  [616],
  [624],
  [655],
  [660],
  [666],
  [682],
  [702],
  [726],
  [752],
  [769],
  [773],
  [833],
  [850],
  [851],
  [895],
  [900],
  [912],
  [934],
  [953],
  [1021],
  [1053],
  [1064],
  [1114],
  [1120],
  [1127],
  [1131],
  [1147],
  [1153],
  [1164],
  [1184],
  [1197],
  [1214],

In [56]:
def find_character_positions_2(text_tokenized, characters_tokenized):
    characters_indices = dict()
    for index, character_list in enumerate(characters_tokenized):
        character_index = list()
        for character_synonym in character_list:
            if len(character_synonym) == 1:
                character_index += [[item] for item in range(len(text_tokenized)) if
                                    text_tokenized[item] == character_synonym[0]]
            elif len(character_synonym) > 1:
                for word_index in range(len(text_tokenized)):
                    if text_tokenized[word_index:word_index + (len(character_synonym))] == character_synonym:
                        character_index += [[i for i in range(word_index, word_index + (len(character_synonym)))]]
        #characters_indices[index] = list()
        characters_indices[index] = character_index
    return characters_indices
    # finds token positions (indices in txt) for each character
    ## scheme:
    ##       char_1                  char_2
    ## [ [ [1], [8, 9], ... ], [ [12], [22, 23], ... ], ... ]

In [62]:
find_character_positions_2(text_tokenized, characters_tokenized)

{0: [[7413],
  [16496],
  [17179],
  [20556],
  [20749],
  [24989],
  [24991],
  [25036],
  [26607],
  [26619],
  [27523],
  [27558],
  [28221],
  [28312],
  [28412],
  [28431],
  [28618],
  [28653],
  [29216],
  [29424],
  [29585],
  [32317],
  [32486],
  [32494],
  [32505],
  [32550],
  [32575],
  [32715],
  [33477],
  [33762],
  [7],
  [22],
  [27370],
  [27430],
  [27469],
  [32220],
  [32274],
  [33560],
  [33601],
  [33931],
  [16],
  [28],
  [105],
  [107],
  [122],
  [127],
  [134],
  [162],
  [165],
  [174],
  [192],
  [196],
  [202],
  [203],
  [227],
  [242],
  [246],
  [249],
  [251],
  [270],
  [279],
  [336],
  [348],
  [379],
  [391],
  [414],
  [429],
  [465],
  [466],
  [598],
  [616],
  [624],
  [655],
  [660],
  [666],
  [682],
  [702],
  [726],
  [752],
  [769],
  [773],
  [833],
  [850],
  [851],
  [895],
  [900],
  [912],
  [934],
  [953],
  [1021],
  [1053],
  [1064],
  [1114],
  [1120],
  [1127],
  [1131],
  [1147],
  [1153],
  [1164],
  [1184],
  [1197],
  [121

In [60]:
text_tokenized[20048:20050]

['Hofrat', 'R']

In [38]:
import json
from pprint import pprint
from collections import defaultdict
from operator import itemgetter
from itertools import *

with open('Werther_IH.json') as f:
    data = json.load(f)

positions = defaultdict(list)

for n,each in enumerate(data):
    if "entities" in each:
        for i in each['entities']:
            positions[i].append(n)            

updated_positions = defaultdict(list) # add list of lists

for k,v in positions.items():
    for l, g in groupby(enumerate(v), lambda x: x[0]-x[1]):
        updated_positions[k].append(list(map(itemgetter(1), g)))
        

In [40]:
print(positions)

defaultdict(<class 'list'>, {0: [2, 3, 4, 14, 15, 16, 45, 48, 55, 74, 79, 111, 114, 133, 139, 148, 153, 157, 170, 181, 186, 197, 217, 223, 231, 232, 261, 277, 283, 287, 289, 308, 313, 323, 385, 391, 404, 406, 438, 441, 451, 477, 485, 495, 535, 536, 544, 562, 686, 701, 706, 715, 742, 753, 759, 765, 766, 780, 782, 801, 806, 833, 846, 861, 879, 884, 904, 943, 951, 958, 969, 970, 1022, 1028, 1043, 1062, 1067, 1089, 1170, 1207, 1219, 1278, 1279, 1287, 1295, 1300, 1320, 1326, 1332, 1339, 1340, 1364, 1379, 1397, 1398, 1437, 1441, 1448, 1451, 1467, 1474, 1480, 1482, 1489, 1492, 1549, 1564, 1614, 1652, 1662, 1669, 1683, 1699, 1714, 1722, 1727, 1732, 1742, 1750, 1755, 1780, 1849, 1850, 1902, 1906, 1917, 1926, 1961, 1969, 1976, 1980, 1985, 1996, 2001, 2014, 2015, 2021, 2025, 2029, 2041, 2047, 2059, 2116, 2120, 2137, 2183, 2200, 2218, 2239, 2264, 2276, 2322, 2328, 2379, 2438, 2446, 2464, 2537, 2540, 2543, 2572, 2575, 2656, 2666, 2672, 2677, 2920, 2923, 2926, 2948, 2955, 3065, 3076, 3077, 3116, 312