In [None]:
# New env (NER)
# required versions in 'requirements.txt'

# 0. Read Source Data & Preprocessing

In [1]:
import pandas as pd

In [29]:
data = pd.read_excel('./data/3415_All_STORIES_Updated.xlsx', index_col = None, header = 0)

In [30]:
data.head()

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,"Wörl, Ludwig \n\nLudwig Wörl (1906-1967) spent..."
1,2.0,"Pscheidt, Johann_x000D_\n_x000D_\nJohann Psche..."
2,3.0,"Czeżowski, Tadeusz_x000D_\nCzeżowska, Antonina..."
3,4.0,"Kowalski, Władysław_x000D_\n_x000D_\nDuring th..."
4,6.0,"Choms, Władysława_x000D_\n_x000D_\nWładysława ..."


In [31]:
# remove '\n' and '_x000D_'
data['STORY TEXT'] = data['STORY TEXT'].str.replace("_x000D_", " ")
data['STORY TEXT'] = data['STORY TEXT'].replace(r'\s+|\\n', ' ', regex=True)

In [32]:
# we have
# 3414 of stories
data

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,"Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ..."
1,2.0,"Pscheidt, Johann Johann Pscheidt was born in t..."
2,3.0,"Czeżowski, Tadeusz Czeżowska, Antonina Czeżows..."
3,4.0,"Kowalski, Władysław During the occupation, Wła..."
4,6.0,"Choms, Władysława Władysława Choms, the wife o..."
...,...,...
3409,13524.0,"Kosek, Julian Kosek, Janina Mordechai Wulkan (..."
3410,13611.0,"Sajowski, Mikołaj Sajowska, Helena Dembińska (..."
3411,5257.0,"File 5257, 5257a ZIENOWICZ, HELENA KUKOLEWSKI,..."
3412,3053.0,"Files 3053; 3053a Badowski, Stefan Franciszek/..."


In [33]:
# sampling the data
data = data[:50]

In [34]:
data

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,"Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ..."
1,2.0,"Pscheidt, Johann Johann Pscheidt was born in t..."
2,3.0,"Czeżowski, Tadeusz Czeżowska, Antonina Czeżows..."
3,4.0,"Kowalski, Władysław During the occupation, Wła..."
4,6.0,"Choms, Władysława Władysława Choms, the wife o..."
5,7.0,"Hencel, Ludwik Hencel, Roman At various times ..."
6,8.0,"Hartmann, Hans Hauptmann Hans Hartmann, aged 4..."
7,11.0,Gertruda Babilinska Gertruda Babilinska was bo...
8,13.0,"Chacza, Edward Edward Chacza, who lived in Bar..."
9,14.0,"Semenyuk, Domna Loseva (Semenyuk), Nadezhda Du..."


# 1. Milestone 1: Find Names - NER & Co-reference Resolution

## 1.1 Find all Names: Simple NER: Start with SPACY Package and model “en_core_web_md” + "en_coreference_web_trf"

In [6]:
import spacy
import spacy_experimental

In [33]:
# !python -m spacy download en_core_web_md

Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [7]:
# Load basic Spacy model for splitting sentences and simple NER
nlp_base = spacy.load('en_core_web_md')
nlp_coref = spacy.load('en_coreference_web_trf')

# Combine two different models (Coref + NER)

# use replace_listeners for the coref components
nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

# we won't copy over the span cleaner
nlp_base.add_pipe("coref", source=nlp_coref)
nlp_base.add_pipe("span_resolver", source=nlp_coref)

  from .autonotebook import tqdm as notebook_tqdm


<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x7fbf0381a220>

In [46]:
doc = nlp_base(data.iloc[1][1])

  doc = nlp_base(data.iloc[1][1])


In [47]:
for cluster in doc.spans:
    print(f"{cluster}: {doc.spans[cluster]}")

coref_head_clusters_1: [Pscheidt, Pscheidt, Pscheidt, his, he, He, he, Pscheidt, him, him, Pscheidt, Pscheidt, he, Pscheidt, Pscheidt, Pscheidt, he, Pscheidt, Pscheidt, his, Pscheidt, his, He, his, his, Pscheidt, Pscheidt, Pscheidt, his, him, He, Pscheidt, his, his, Pscheidt, he, Pscheidt, his, He, himself, Pscheidt, his, his, Pscheidt, his, he, he, He, his, his, he, Pscheidt]
coref_head_clusters_2: [Czernowitz, Czernowitz, city, Czernowitz]
coref_head_clusters_3: [workers, their, workers]
coref_head_clusters_4: [Germany, Germany]
coref_head_clusters_5: [underground, underground, underground, underground]
coref_head_clusters_6: [Będzin, Będzin, Zawiercie]
coref_head_clusters_7: [factory, factory]
coref_head_clusters_8: [1943, 1943, 1943]
coref_head_clusters_9: [ghetto, ghetto, it]
coref_head_clusters_10: [Some, them]
coref_head_clusters_11: [attic, attic, attic]
coref_head_clusters_12: [hid, period]
coref_head_clusters_13: [people, them, all, fugitives, them, them]
coref_head_clusters_

## Coreference Resolution

In [8]:
# since Spacy doesn't have direct coreference resolver
# we define lightweight function for resolving references in text
def resolve_references(doc) -> str:
    """
    Function for resolving references with the coref ouput
    doc (Doc): The Doc object processed by the coref pipeline
    RETURNS (str): The Doc string with resolved references
    """
    # token.idx : token.text
    token_mention_mapper = {}
    output_string = ""
    clusters = [
    val for key, val in doc.spans.items() if key.startswith("coref_cluster")
        ]

    # Iterate through every found cluster
    for cluster in clusters:
        first_mention = cluster[0]
        # Iterate through every other span in the cluster
        for mention_span in list(cluster)[1:]:
            # Set first_mention as value for the first token in mention_span in the token_mention_mapper
            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_
            for token in mention_span[1:]:
                # Set empty string for all the other tokens in mention_span
                token_mention_mapper[token.idx] = ""
    # Iterate through every token in the Doc
    for token in doc:
        # Check if token exists in token_mention_mapper
        if token.idx in token_mention_mapper:
            output_string += token_mention_mapper[token.idx]
        # Else add original token text
        else:
            output_string += token.text + token.whitespace_
    return output_string

In [35]:
coref_data = pd.DataFrame(columns = ['STORY ID', 'STORY TEXT'])

from spacy import displacy
from tqdm import tqdm

for row in tqdm(data.itertuples(), total = data.shape[0]):
    resolved_doc = resolve_references(nlp_base(row._2))
    # output = nlp_base(resolved_doc)
    # displacy.render(output, style="ent")
    new_row = {'STORY ID' : row._1, 'STORY TEXT' : resolved_doc}
    coref_data.loc[len(coref_data)] = new_row

100%|██████████| 50/50 [03:12<00:00,  3.84s/it]


In [36]:
coref_data

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,"Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ..."
1,2.0,"Pscheidt, Johann Johann Pscheidt was born in t..."
2,3.0,"Czeżowski, Czeżowski , Antonina Czeżowska, Ter..."
3,4.0,"Kowalski, Władysław During the occupation, Kow..."
4,6.0,"Choms, Władysława Władysława Choms, the wife o..."
5,7.0,"Hencel, Hencel , Roman At various times during..."
6,8.0,"Hartmann, Hans Hauptmann Hartmann , was one of..."
7,11.0,Gertruda Babilinska Gertruda Babilinska was bo...
8,13.0,"Chacza, Edward Edward Chacza, who lived in Bar..."
9,14.0,"Semenyuk, Domna Loseva (Semenyuk), Nadezhda Du..."


### Separate into sentences, find Named Entity (PERSON) and make a dataframe

In [37]:
table_1 = pd.DataFrame(columns = ['storynumber', 'name', 'sentence'])

In [38]:
for id, story in zip(coref_data['STORY ID'], coref_data['STORY TEXT']):
    sentences = [i for i in nlp_base(story).sents]
    for sentence in sentences:
        for entity in sentence.ents:
            if entity.label_ == 'PERSON':
                stc = str(sentence)
                new_row = {'storynumber' : id, 'name' : entity.text, 'sentence' : stc}
                table_1.loc[len(table_1)] = new_row

In [41]:
table_1[table_1['storynumber'] == 1.0]

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Ludwig Wörl,"Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ..."
1,1.0,Wörl,"Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ..."
2,1.0,Ludwig Ludwig Wörl,"Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ..."
3,1.0,Ludwig Ludwig Wörl,"Wörl, Ludwig Ludwig Wörl (1906-1967) was first..."
4,1.0,Wörl,After spending some nine months in a dark dete...
5,1.0,Ludwig Ludwig Wörl,After spending some nine months in a dark dete...
6,1.0,Wörl,"In 1942, Wörl, Ludwig Ludwig Wörl (1906-1967) ..."
7,1.0,Ludwig Ludwig Wörl,"In 1942, Wörl, Ludwig Ludwig Wörl (1906-1967) ..."
8,1.0,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
9,1.0,Wörl,Appointed as the Lagerälteste (the camp elder)...


In [110]:
# Extract Output
table_1.to_csv('./output/3415_SPACY_All_Names.xlsx', index = False)

In [55]:
table_1 = pd.read_csv('./output/3415_SPACY_All_Names.xlsx')

## 1.2 Find Rescuers Names: Filtering Only Rescuers (Righteous Among the Nations) Names

### 1.2.1.1 Step 1: Preparing a table of filtered sentences: “Rescuing Phrases-Name” - Lexicon-based

In [56]:
# Table 1
table_1
# table1 = pd.read_excel('./output/3415_SPACY_All_Names.xlsx', index = False)

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Ludwig Wörl,"Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ..."
1,1.0,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
2,1.0,Wörl,Appointed as the Lagerälteste (the camp elder)...
3,1.0,Wörl,As a result of consistently countermanding the...
4,1.0,Kapos,"Here, again, he made his beneficial influence ..."
...,...,...,...
63055,4481.0,Michał,"Regina’s brother, Michał, also survived, and a..."
63056,4481.0,Yad Vashem,"On January 17,1990, Yad Vashem recognized Józe..."
63057,4481.0,Józef Matuszewski,"On January 17,1990, Yad Vashem recognized Józe..."
63058,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."


In [57]:
# Table 2 (lexicon)
table_2 = pd.read_excel('./data/Final_Lexicon_Updated.xlsx')

In [58]:
table_2

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase
0,1.0,sent,his life as a political prisoner in nazi conce...
1,1.0,sent,"to the camp’s joinery and later, as a trained ..."
2,1.0,employed,personnel. appointed as the lagerälteste (the ...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ..."
...,...,...,...
8922,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8923,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8924,5257.0,sheltered,"Initially, they were only supposed to stay wit..."
8925,5257.0,obtained,Helenapresented the fugitives as her brother’s...


In [59]:
#Table3
table_3 = pd.DataFrame(columns = ['Story ID','Rescuing Verb', 'Resuing_phrases-name'])

In [61]:
# input
# Table1
table_1['sentence'][0]
# check if rescuing verb appear in the sentence. (Lexicon reference - same story ID)
# If there is rescuing verb existed,
# (Also name)
# add this sentence to a new table Table3 

'Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 years of his life as a political prisoner in Nazi concentration camps.'

In [62]:
# sampling table_1
table_1_ = table_1[:50]

In [63]:
table_1_

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Ludwig Wörl,"Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ..."
1,1.0,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
2,1.0,Wörl,Appointed as the Lagerälteste (the camp elder)...
3,1.0,Wörl,As a result of consistently countermanding the...
4,1.0,Kapos,"Here, again, he made his beneficial influence ..."
5,1.0,Wörl,Even prisoners with tuberculosis were able to ...
6,1.0,Wörl,"After the war Wörl, who became chairman of the..."
7,1.0,Yad Vashem,"On March 19, 1963, Yad Vashem recognized Ludwi..."
8,1.0,Ludwig Wörl,"On March 19, 1963, Yad Vashem recognized Ludwi..."
9,2.0,Johann Johann Pscheidt,"Pscheidt, Johann Johann Pscheidt was born in t..."


In [65]:
for row in tqdm(table_1_.itertuples(), total = table_1_.shape[0]):
    # storyid = row.storynumber
    # 본 회차 스토리에서 찾아내야하는 verb 목록 불러오고
    verbs_to_find = list(set(table_2[table_2['Story ID'] == row.storynumber]['Rescuing Verb']))
    # row 한개를 nlp_base에 넣고 verb 추출
    doc = nlp_base(row.sentence)
    verbs_extracted = [token for token in doc if token.pos_ == 'VERB']
    for verb in verbs_extracted:
        # 존재하는 verb들 중 lexicon verbs 에 있는 verb를 발견하고
        if str(verb) in verbs_to_find:
            # 그 sentence의 사람 이름도 있다면(당연히 있지) 통과
            # 이 부분은 추후에 제거 가능 ( 나혼자 해본것.)
            for entity in doc.ents:
                if entity.label_ == 'PERSON':
                     print(entity.text)

            # new_row = {'Story ID' : row.storynumber, 'Rescuing Verb' : str(verb), 'Resuing_phrases-name' : row.sentence}
            # table_3.loc[len(table_3)] = new_row

  4%|▍         | 2/50 [00:00<00:15,  3.10it/s]

Lagerälteste
Wörl
Lagerälteste
Wörl


  6%|▌         | 3/50 [00:01<00:16,  2.92it/s]

Lagerälteste
Wörl
Lagerälteste
Wörl


 10%|█         | 5/50 [00:01<00:15,  2.87it/s]

Kapos


 12%|█▏        | 6/50 [00:02<00:14,  3.00it/s]

Wörl
Wörl


 14%|█▍        | 7/50 [00:02<00:14,  2.92it/s]

Wörl


 28%|██▊       | 14/50 [00:04<00:13,  2.68it/s]

Pscheidt


 56%|█████▌    | 28/50 [00:08<00:07,  3.08it/s]

Adolf Eichmann
Eichmann


 60%|██████    | 30/50 [00:09<00:05,  3.39it/s]

Adolf Eichmann
Eichmann


 76%|███████▌  | 38/50 [00:12<00:05,  2.31it/s]

Fessel
Fessel


 78%|███████▊  | 39/50 [00:13<00:04,  2.33it/s]

Czezowski


 80%|████████  | 40/50 [00:13<00:03,  2.51it/s]

Czezowski
Złata Kaczergińska


 82%|████████▏ | 41/50 [00:13<00:03,  2.57it/s]

Czezowski
Złata Kaczergińska


 94%|█████████▍| 47/50 [00:16<00:01,  2.64it/s]

Kowalski
Kowalski


 96%|█████████▌| 48/50 [00:16<00:00,  2.63it/s]

Kowalski
Kowalski


100%|██████████| 50/50 [00:17<00:00,  2.89it/s]

Kowalski
Kowalski





In [198]:
# Table 3
table_3

Unnamed: 0,Story ID,Rescuing Verb,Resuing_phrases-name
0,1.0,employed,Appointed as the Lagerälteste (the camp elder)...
1,1.0,saving,Appointed as the Lagerälteste (the camp elder)...
2,1.0,employed,Appointed as the Lagerälteste (the camp elder)...
3,1.0,saving,Appointed as the Lagerälteste (the camp elder)...
4,1.0,protecting,"Here, again, he made his beneficial influence ..."
...,...,...,...
8122,4481.0,arranged,They arranged for Regina to move in with their...
8123,4481.0,recognized,"On January 17,1990, Yad Vashem recognized Józe..."
8124,4481.0,recognized,"On January 17,1990, Yad Vashem recognized Józe..."
8125,4481.0,recognized,"On January 17,1990, Yad Vashem recognized Józe..."


In [199]:
# Table_4 (Merge table_2 and table_3)
# table_4: Complete Lexicon Structure Example
table_4 = pd.merge(table_2, table_3, how = 'left', on = ['Story ID','Rescuing Verb'])

In [200]:
table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Resuing_phrases-name
0,1.0,sent,his life as a political prisoner in nazi conce...,
1,1.0,sent,"to the camp’s joinery and later, as a trained ...",
2,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...
3,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...
4,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Appointed as the Lagerälteste (the camp elder)...
...,...,...,...,...
14856,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
14857,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
14858,5257.0,sheltered,"Initially, they were only supposed to stay wit...",
14859,5257.0,obtained,Helenapresented the fugitives as her brother’s...,


In [201]:
table_4.to_csv('./drafts/table_4.xlsx', index = False)

### 1.2.1.2 Step 2:“Rescuers Names filtering” - Step-by-Step Breakdown

In [49]:
table_4 = pd.read_csv('./drafts/table_4.xlsx')

In [52]:
table_4['Resuing_phrases-name']

0                                                      NaN
1                                                      NaN
2        Appointed as the Lagerälteste (the camp elder)...
3        Appointed as the Lagerälteste (the camp elder)...
4        Appointed as the Lagerälteste (the camp elder)...
                               ...                        
14856                                                  NaN
14857                                                  NaN
14858                                                  NaN
14859                                                  NaN
14860                                                  NaN
Name: Resuing_phrases-name, Length: 14861, dtype: object

In [None]:
for entity in sentence.ents:
            if entity.label_ == 'PERSON':
                stc = str(sentence)
                new_row = {'storynumber' : id, 'name' : entity.text, 'sentence' : stc}
                table_1.loc[len(table_1)] = new_row

In [None]:
3415_SPACY_All_Rescuers.XLS

In [38]:
# Combine two different models (Coref + NER)

nlp_coref = spacy.load("en_coreference_web_trf")

# use replace_listeners for the coref components
nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

# we won't copy over the span cleaner
nlp_base.add_pipe("coref", source=nlp_coref)
nlp_base.add_pipe("span_resolver", source=nlp_coref)

^C


NameError: name 'spacy' is not defined

In [12]:
text = data['STORY TEXT'][0]

In [13]:
text

'Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 years of his life as a political prisoner in Nazi concentration camps. He was first arrested by the Gestapo in 1934, and sent to Dachau for distributing a pamphlet in which the citizens of Munich were informed about the horrors of the camp. After spending some nine months in a dark detention cell, he was first transferred to the camp’s joinery and later, as a trained medical orderly, assigned to the camp’s sick-bay. In 1942, he was sent to Auschwitz together with 17 other male nurses to deal with an outbreak of typhus, which threatened not only the prisoners but also the German camp personnel. Appointed as the Lagerälteste (the camp elder) of the hospital barracks, Wörl, against the express orders of the SS, employed Jewish doctors, thus saving them from certain death. He also put himself at risk in order to obtain at least a minimum of the required medicines and medical instruments for the treatment of the sick. He would forge selection l

In [15]:
from spacy import displacy  
doc = nlp_base(text) 
# displacy.serve(doc, style="ent")

In [18]:
for e in doc :
    print(e)

Wörl
,
Ludwig
Ludwig
Wörl
(
1906
-
1967
)
spent
11
years
of
his
life
as
a
political
prisoner
in
Nazi
concentration
camps
.
He
was
first
arrested
by
the
Gestapo
in
1934
,
and
sent
to
Dachau
for
distributing
a
pamphlet
in
which
the
citizens
of
Munich
were
informed
about
the
horrors
of
the
camp
.
After
spending
some
nine
months
in
a
dark
detention
cell
,
he
was
first
transferred
to
the
camp
’s
joinery
and
later
,
as
a
trained
medical
orderly
,
assigned
to
the
camp
’s
sick
-
bay
.
In
1942
,
he
was
sent
to
Auschwitz
together
with
17
other
male
nurses
to
deal
with
an
outbreak
of
typhus
,
which
threatened
not
only
the
prisoners
but
also
the
German
camp
personnel
.
Appointed
as
the
Lagerälteste
(
the
camp
elder
)
of
the
hospital
barracks
,
Wörl
,
against
the
express
orders
of
the
SS
,
employed
Jewish
doctors
,
thus
saving
them
from
certain
death
.
He
also
put
himself
at
risk
in
order
to
obtain
at
least
a
minimum
of
the
required
medicines
and
medical
instruments
for
the
treatment
of
the
sick
.


In [37]:
for story in data['STORY TEXT']:
    

0       Wörl, Ludwig Ludwig Wörl (1906-1967) spent 11 ...
1       Pscheidt, Johann Johann Pscheidt was born in t...
2       Czeżowski, Tadeusz Czeżowska, Antonina Czeżows...
3       Kowalski, Władysław During the occupation, Wła...
4       Choms, Władysława Władysława Choms, the wife o...
                              ...                        
3410    Kosek, Julian Kosek, Janina Mordechai Wulkan (...
3411    Sajowski, Mikołaj Sajowska, Helena Dembińska (...
3412    File 5257, 5257a ZIENOWICZ, HELENA KUKOLEWSKI,...
3413    Files 3053; 3053a Badowski, Stefan Franciszek/...
3414    Files 4481; 4481a Matuszewski, Józef Matuszews...
Name: STORY TEXT, Length: 3415, dtype: object

In [None]:
output_file_name = '3415_SPACY_All_Names.xlsx'
output.to_csv(f'./output/{output_file_name}')