In [None]:
# ENV (NLP)

# 0. Read Source Data & Preprocessing

In [2]:
import pandas as pd

In [10]:
data = pd.read_excel('./data/3415_All_STORIES_Updated_ver02.xlsx', index_col = None, header = 0)

In [11]:
# Function to extract story content after the names
def extract_story(text):
    # Step 1: Split the text by line breaks
    lines = text.strip().split('\n')

    # Step 2: Detect the start of the story (the first longer segment, which should be a complete sentence)
    story_started = False
    story_lines = []
    
    for line in lines:
        # If the line contains more than a few words (let's say more than 3), assume it's the start of the story
        if len(line.split()) > 3:
            story_started = True
        # Once we detect the start of the story, append the rest of the lines
        if story_started:
            story_lines.append(line)
    
    # Step 3: Combine the story lines into a single string and return the result
    return ' '.join(story_lines).strip()

In [12]:
data['STORY TEXT'] = data['STORY TEXT'].apply(extract_story)

In [13]:
# we have
# 3414 of stories
data

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,Ludwig Wörl (1906-1967) spent 11 years of his ...
1,2.0,Johann Pscheidt was born in the city of Radaut...
2,3.0,"Even before the war, Professor Tadeusz Czeżows..."
3,4.0,"During the occupation, Władysław Kowalski, a q..."
4,6.0,"Władysława Choms, the wife of a major in the P..."
...,...,...
3409,13524.0,"Mordechai Wulkan (b. 1910) and his wife, Chaja..."
3410,13611.0,"Sala Armel-Goldhar lived with her parents, Isr..."
3411,5257.0,Following Helena Zienowicz’s graduation from t...
3412,3053.0,"Before the war, Badowski Stefan Franciszek liv..."


# 1. Milestone 1: Find Names - NER & Co-reference Resolution

## 1.1 Find all Names: Simple NER: Start with SPACY Package and model “en_core_web_md” + "en_coreference_web_trf"

In [14]:
import spacy
import neuralcoref
from spacy import displacy

In [33]:
# !python -m spacy download en_core_web_md

Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [15]:
# Load SpaCy's English model
nlp = spacy.load('en_core_web_md')

# Add neuralcoref to SpaCy's pipeline
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fc69920c150>

## Coreference Resolution

In [17]:
# import tqdm to track a progress
from tqdm import tqdm
tqdm.pandas()

In [18]:
# Sampling data
data = data[:20]

In [19]:
data

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,Ludwig Wörl (1906-1967) spent 11 years of his ...
1,2.0,Johann Pscheidt was born in the city of Radaut...
2,3.0,"Even before the war, Professor Tadeusz Czeżows..."
3,4.0,"During the occupation, Władysław Kowalski, a q..."
4,6.0,"Władysława Choms, the wife of a major in the P..."
5,7.0,"At various times during the occupation, Ludwik..."
6,8.0,"Hauptmann Hans Hartmann, aged 40, was one of t..."
7,11.0,Gertruda Babilinska was born in 1902 in Starog...
8,13.0,"Edward Chacza, who lived in Baranowicze in the..."
9,14.0,"During the war, Domna Semenyuk was a farmer li..."


In [20]:
data['STORY TEXT'] = data['STORY TEXT'].progress_apply(lambda row : nlp(row)._.coref_resolved)

100%|██████████| 20/20 [00:20<00:00,  1.03s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [54]:
data['STORY TEXT'][0]

'Ludwig Wörl (1906-1967) spent 11 years of Ludwig Wörl (1906-1967) life as a political prisoner in Nazi concentration camps.  Ludwig Wörl (1906-1967) was first arrested by the Gestapo in 1934, and sent to Dachau for distributing a pamphlet in which the citizens of Munich were informed about the horrors of the camp. After spending some nine months in a dark detention cell, Ludwig Wörl (1906-1967) was first transferred to the camp’s joinery and later, as a trained medical orderly, assigned to the camp’s sick-bay. In 1942, Ludwig Wörl (1906-1967) was sent to Auschwitz together with 17 other male nurses to deal with an outbreak of typhus, which threatened not only the prisoners but also the German camp personnel. Appointed as the Lagerälteste (the camp elder) of the hospital barracks, Wörl, against the express orders of the SS, employed Jewish doctors, thus saving Jewish doctors from certain death. Ludwig Wörl (1906-1967) also put Ludwig Wörl (1906-1967) at risk in order to obtain at least

In [None]:
data.to_csv('./drafts/3415_all_coref_data.csv', index = False)

In [None]:
data = pd.read_csv('/drafts/3415_all_coref_data.csv')

### Separate into sentences, find Named Entity (PERSON) and make a dataframe

In [22]:
table_1 = pd.DataFrame(columns = ['storynumber', 'name', 'sentence'])

In [23]:
for id, story in zip(data['STORY ID'], data['STORY TEXT']):
    sentences = [i for i in nlp(story).sents]
    for sentence in sentences:
        for entity in sentence.ents:
            if entity.label_ == 'PERSON':
                stc = str(sentence)
                new_row = {'storynumber' : id, 'name' : entity.text, 'sentence' : stc}
                table_1.loc[len(table_1)] = new_row

In [24]:
table_1

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
...,...,...,...
591,33.0,Strauss,The Jewish pharmacist Strauss was likewise ass...
592,33.0,Strauss,The Jewish pharmacist Strauss was likewise ass...
593,33.0,Werner Krumme,Werner Krumme himself was released from Auschw...
594,33.0,Vashem,"On June 16, 1964, Yad Vashem recognized Werner..."


In [110]:
# Extract Output
table_1.to_csv('./output/3415_SPACY_All_Names.xlsx', index = False)

In [68]:
table_1 = pd.read_csv('./output/3415_SPACY_All_Names.xlsx')

## 1.2 Find Rescuers Names: Filtering Only Rescuers (Righteous Among the Nations) Names

### 1.2.1.1 Step 1: Preparing a table of filtered sentences: “Rescuing Phrases-Name” - Lexicon-based

In [25]:
# Table 1
table_1
# table1 = pd.read_excel('./output/3415_SPACY_All_Names.xlsx', index = False)

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
...,...,...,...
591,33.0,Strauss,The Jewish pharmacist Strauss was likewise ass...
592,33.0,Strauss,The Jewish pharmacist Strauss was likewise ass...
593,33.0,Werner Krumme,Werner Krumme himself was released from Auschw...
594,33.0,Vashem,"On June 16, 1964, Yad Vashem recognized Werner..."


In [62]:
table_1.iloc[17]

storynumber                                                  1.0
name                                                        Wörl
sentence       After the war Wörl, who became chairman of the...
Name: 17, dtype: object

In [26]:
# Table 2 (lexicon)
table_2 = pd.read_excel('./data/Final_Lexicon_Updated.xlsx')

In [71]:
table_2

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase
0,1.0,sent,his life as a political prisoner in nazi conce...
1,1.0,sent,"to the camp’s joinery and later, as a trained ..."
2,1.0,employed,personnel. appointed as the lagerälteste (the ...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ..."
...,...,...,...
8922,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8923,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8924,5257.0,sheltered,"Initially, they were only supposed to stay wit..."
8925,5257.0,obtained,Helenapresented the fugitives as her brother’s...


In [100]:
data.columns = ['Story ID', 'Story Text']

In [101]:
data

Unnamed: 0,Story ID,Story Text
0,1.0,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,2.0,Johann Pscheidt was born in the city of Radaut...
2,3.0,"Even before the war, Professor Tadeusz Czeżows..."
3,4.0,"During the occupation, Władysław Kowalski, a q..."
4,6.0,"Władysława Choms, the wife of a major in the P..."
5,7.0,"At various times during the occupation, Ludwik..."
6,8.0,"Hauptmann Hans Hartmann, aged 40, was one of t..."
7,11.0,Gertruda Babilinska was born in 1902 in Starog...
8,13.0,"Edward Chacza, who lived in Baranowicze in the..."
9,14.0,"During the war, Domna Semenyuk was a farmer li..."


#### Trying rescuing verb 등장하는 부분을 lexicon 의 Rescuing Phrase 활용해서 Story 에서 찾기 중인데 쉽지않음

In [98]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [118]:
# Step 1: Merge the two dataframes on 'STORY ID'
merged_df = pd.merge(table_2, data, on='Story ID')

In [119]:
# Step 2: Function to find and replace 'Rescuing Phrase' with the best match in 'STORY TEXT'
def update_rescuing_phrase(row):
    story_text = row['Story Text']
    rescuing_phrase = row['Rescuing Phrase']
    
    # Split the STORY TEXT into sentences
    sentences = story_text.split('. ')
    
    # Split the Rescuing Phrase into its own sentences for matching
    rescuing_sentences = rescuing_phrase.split('. ')
    
    # Track which sentences should be updated
    updated_sentences = []
    
    # Iterate through each rescuing sentence and match it to sentences in the story
    for resc_sentence in rescuing_sentences:
        match = process.extractOne(resc_sentence, sentences, scorer=fuzz.partial_ratio)
        
        # If a good match is found, replace the matched sentence with the rescuing sentence
        if match and match[1] > 80:  # You can adjust the threshold as needed
            matched_sentence = match[0]
            updated_sentences.append(matched_sentence)
            sentences[sentences.index(matched_sentence)] = resc_sentence  # Replace in STORY TEXT
            
    # Join the updated sentences back together
    return '. '.join(sentences)

In [120]:
# Step 3: Apply the function to each row to update the 'Rescuing Phrase'
merged_df['Updated Rescuing Phrase'] = merged_df.apply(update_rescuing_phrase, axis=1)

In [121]:
merged_df

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Story Text,Updated Rescuing Phrase
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,his life as a political prisoner in nazi conce...
1,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,employed,personnel. appointed as the lagerälteste (the ...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ...",Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
...,...,...,...,...,...
84,29.0,sent,"feiner, the bund representative and adolf berm...","From September 1942, even before the establish...",the. When Zegota was established in December 1...
85,29.0,helping,"in december 1942, bartoszewski (whose undercov...","From September 1942, even before the establish...","From September 1942, even before the establish..."
86,29.0,helped,the polish public. he published many articles ...,"From September 1942, even before the establish...","From September 1942, even before the establish..."
87,30.0,sent,"the world looks on”), describing the warsaw gh...","During the occupation, Kann organized the Scou...","During the occupation, Kann organized the Scou..."


In [122]:
merged_df.iloc[-1]['Rescuing Phrase']

'his wife were held in prison in germany until january 31, 1943, and then deported to auschwitz. mrs. krumme was sent to the gas chambers. an ss-man notified werner of her death, in may 1943, with the following words: “be happy'

In [123]:
merged_df.iloc[-1]['Updated Rescuing Phrase']

'Three months after Hitler’s rise to power, in May 1933, Werner Krumme (b. 1909 in Dortmund) married a Jewish wife. Despite official discrimination and frequent harassment by the Gestapo, Krumme stood by Krumme and refused to divorce a Jewish wife. mrs. Krumme, Renate and Anita Lasker, to flee to unoccupied France. his wife were held in prison in germany until january 31, 1943, and then deported to auschwitz. krumme was sent to the gas chambers. an ss-man notified werner of her death, in may 1943, with the following words: “be happy.  Werner used Werner position to falsify so-called selection lists and to assign Jews to work units with relatively tolerable conditions, which increased the chances of Jews survival. Werner thus added the names of Ernst Krinski and Peter Schwartz to a list of a transport that was leaving Auschwitz to the concentration camp of “Warschau,” where no gassing of prisoners took place. The Jewish pharmacist Strauss was likewise assigned by The Jewish pharmacist S

In [106]:
# Step 2: Function to find and replace 'Rescuing Phrase' in 'STORY TEXT'
def replace_rescuing_phrase(row):
    story_text = row['Story Text']
    rescuing_phrase = row['Rescuing Phrase']
    
    # Find the closest matching sentence in the STORY TEXT using fuzzy matching
    match = process.extractOne(rescuing_phrase, story_text.split('. '), scorer=fuzz.partial_ratio)
    
    # If a match is found, replace it with the rescuing phrase
    if match and match[1] > 80:  # Match score should be high enough, e.g., above 80
        matched_sentence = match[0]
        new_story_text = story_text.replace(matched_sentence, rescuing_phrase)
        return new_story_text
    else:
        return story_text  # If no good match, return the original story text

In [107]:
# Step 3: Apply the function to each row to replace the rescuing phrase
merged_df['Updated STORY TEXT'] = merged_df.apply(replace_rescuing_phrase, axis=1)

In [108]:
merged_df

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Story Text,Updated STORY TEXT
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,employed,personnel. appointed as the lagerälteste (the ...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ...",Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
...,...,...,...,...,...
84,29.0,sent,"feiner, the bund representative and adolf berm...","From September 1942, even before the establish...","From September 1942, even before the establish..."
85,29.0,helping,"in december 1942, bartoszewski (whose undercov...","From September 1942, even before the establish...","From September 1942, even before the establish..."
86,29.0,helped,the polish public. he published many articles ...,"From September 1942, even before the establish...","From September 1942, even before the establish..."
87,30.0,sent,"the world looks on”), describing the warsaw gh...","During the occupation, Kann organized the Scou...","During the occupation, Kann organized the Scou..."


### Making Table 3 Starts

In [27]:
#Table3
table_3 = pd.DataFrame(columns = ['Story ID','Rescuing Verb', 'Rescuing_phrases-name'])

In [28]:
# input
# Table1
table_1['sentence'][0]
# check if rescuing verb appear in the sentence. (Lexicon reference - same story ID)
# If there is rescuing verb existed,
# (Also name)
# add this sentence to a new table Table3 

'Ludwig Wörl (1906-1967) spent 11 years of Ludwig Wörl (1906-1967) life as a political prisoner in Nazi concentration camps.  '

In [29]:
table_1.head()

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."


In [79]:
list(set(table_2[table_2['Story ID'] == 1]['Rescuing Verb']))

['influence',
 'protected',
 'treatment',
 'sent',
 'employed',
 'put himself at risk',
 'dedicated',
 'helped',
 'protecting',
 'obtain',
 'exempted',
 'saving',
 'forge']

In [80]:
table_2

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase
0,1.0,sent,his life as a political prisoner in nazi conce...
1,1.0,sent,"to the camp’s joinery and later, as a trained ..."
2,1.0,employed,personnel. appointed as the lagerälteste (the ...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ..."
...,...,...,...
8922,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8923,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8924,5257.0,sheltered,"Initially, they were only supposed to stay wit..."
8925,5257.0,obtained,Helenapresented the fugitives as her brother’s...


In [84]:
# Function to find the rescuer (subject) for each sentence
def find_rescuer(df):
    sentence = df['Rescuing Phrase']
    rescuing_verb = df['Rescuing Verb']
    
    # Skip NaN values
    if pd.isna(sentence):
        return None
    
    # Process sentence with Spacy
    doc = nlp(sentence)
    
    # If the verb is multi-word (like "put himself at risk"), we handle it separately.
    if rescuing_verb in sentence:
        # Find the subject (nsubj) related to the rescuing verb
        for token in doc:
            if rescuing_verb.split()[0] == token.lemma_ or token.text == rescuing_verb.split()[0]:
                # Find the subject (nsubj) linked to the verb
                subject = [child for child in token.children if child.dep_ == "nsubj"]
                if subject:
                    return subject[0].text  # Return the first subject found
    
    return None  # Return None if no subject is found

In [85]:
table_2.apply(find_rescuer)

KeyError: 'Rescuing Phrase'

In [None]:

# Apply the function to the DataFrame
df['Rescuer'] = df.apply(find_rescuer, axis=1)

In [86]:
table_1

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
...,...,...,...
591,33.0,Strauss,The Jewish pharmacist Strauss was likewise ass...
592,33.0,Strauss,The Jewish pharmacist Strauss was likewise ass...
593,33.0,Werner Krumme,Werner Krumme himself was released from Auschw...
594,33.0,Vashem,"On June 16, 1964, Yad Vashem recognized Werner..."


In [30]:
for row in tqdm(table_1.itertuples(), total = table_1.shape[0]):
    # storyid = row.storynumber
    # 본 회차 스토리에서 찾아내야하는 verb 목록 불러오고
    verbs_to_find = list(set(table_2[table_2['Story ID'] == row.storynumber]['Rescuing Verb']))
    # row 한개를 nlp_base에 넣고 verb 추출
    doc = nlp(row.sentence)
    verbs_extracted = [token for token in doc if token.pos_ == 'VERB']
    for verb in verbs_extracted:
        # 존재하는 verb들 중 lexicon verbs 에 있는 verb를 발견하고
        if str(verb) in verbs_to_find:
            # 그 sentence의 사람 이름도 있다면(당연히 있지) 통과
            # 이 부분은 추후에 제거 가능 ( 나혼자 해본것.)
            # for entity in doc.ents:
            #     if entity.label_ == 'PERSON':
            #          print(entity.text)

            new_row = {'Story ID' : row.storynumber, 'Rescuing Verb' : str(verb), 'Rescuing_phrases-name' : row.sentence}
            table_3.loc[len(table_3)] = new_row

100%|██████████| 596/596 [00:14<00:00, 40.41it/s]


In [36]:
# Table 3
table_3.drop_duplicates()

Unnamed: 0,Story ID,Rescuing Verb,Rescuing_phrases-name
0,1.0,sent,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,employed,Appointed as the Lagerälteste (the camp elder)...
3,1.0,saving,Appointed as the Lagerälteste (the camp elder)...
6,1.0,obtain,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...
8,1.0,forge,Ludwig Wörl (1906-1967) would forge selection ...
9,1.0,protecting,"Here, again, Wörl made Wörl beneficial influen..."
11,1.0,exempted,Even prisoners with tuberculosis were able to ...
12,1.0,protected,Even prisoners with tuberculosis were able to ...
13,1.0,helped,"At the time of the evacuation of Auschwitz, Wö..."


In [77]:
table_3

Unnamed: 0,Story ID,Rescuing Verb,Rescuing_phrases-name
0,1.0,sent,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,employed,Appointed as the Lagerälteste (the camp elder)...
3,1.0,saving,Appointed as the Lagerälteste (the camp elder)...
4,1.0,employed,Appointed as the Lagerälteste (the camp elder)...
...,...,...,...
125,29.0,sent,"As part of Ludwik”) underground activity, Wład..."
126,29.0,helped,"The author, Rachel Auerbach, and Dr. A. Berman..."
127,29.0,helped,"The author, Rachel Auerbach, and Dr. A. Berman..."
128,30.0,sent,"The tract, which was also sent abroad, also co..."


In [76]:
table_3.iloc[9]

Story ID                                                               1.0
Rescuing Verb                                                   protecting
Rescuing_phrases-name    Here, again, Wörl made Wörl beneficial influen...
Name: 9, dtype: object

In [37]:
# Table_4 (Merge table_2 and table_3)
# table_4: Complete Lexicon Structure Example
table_4 = pd.merge(table_2, table_3, how = 'left', on = ['Story ID','Rescuing Verb'])

In [38]:
table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A..."
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...
...,...,...,...,...
9121,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
9122,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
9123,5257.0,sheltered,"Initially, they were only supposed to stay wit...",
9124,5257.0,obtained,Helenapresented the fugitives as her brother’s...,


In [39]:
# drop duplicated rows
table_4.drop_duplicates(inplace = True)

In [40]:
table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A..."
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...
...,...,...,...,...
9120,4481.0,recognized,"On January 17, 1990, Yad Vashem recognized Józ...",
9121,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
9123,5257.0,sheltered,"Initially, they were only supposed to stay wit...",
9124,5257.0,obtained,Helenapresented the fugitives as her brother’s...,


In [201]:
table_4.to_csv('./drafts/table_4.xlsx', index = False)

### 1.2.1.2 Step 2:“Rescuers Names filtering” - Step-by-Step Breakdown

In [265]:
table_4 = pd.read_csv('./drafts/table_4.xlsx')

In [41]:
table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A..."
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...
...,...,...,...,...
9120,4481.0,recognized,"On January 17, 1990, Yad Vashem recognized Józ...",
9121,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
9123,5257.0,sheltered,"Initially, they were only supposed to stay wit...",
9124,5257.0,obtained,Helenapresented the fugitives as her brother’s...,


In [47]:
# Function to find the full name of the rescuer, and split it into first name and last name
def extract_name_info(row):
    sentence = row['Rescuing_phrases-name']  # Adjusted column name
    rescuing_verb = row['Rescuing Verb']    # Adjusted column name
    
    # Skip NaN values
    if pd.isna(sentence):
        return pd.Series([None, None, None])  # Return None for full name, first name, and last name
    
    # Process sentence with Spacy
    doc = nlp(sentence)
    
    # Find the full name of the rescuer using NER
    full_name = None
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            full_name = ent.text  # Get the first PERSON entity (assuming it's the rescuer)
            break
    
    if not full_name:
        return pd.Series([None, None, None])  # Return None if no PERSON entity is found
    
    # Split the full name into first and last name
    name_parts = full_name.split()
    if len(name_parts) > 1:
        last_name = name_parts[-1]
        first_name = " ".join(name_parts[:-1])
    else:
        first_name = full_name
        last_name = None  # If no last name, leave it as None
    
    return pd.Series([full_name, first_name, last_name])

In [44]:
sample_table_4 = table_4[:40]

In [46]:
sample_table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A..."
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...
6,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Appointed as the Lagerälteste (the camp elder)...
8,1.0,put himself at risk,"barracks, wörl, against the express orders of ...",
9,1.0,obtain,"the ss, employed jewish doctors, thus saving t...",Ludwig Wörl (1906-1967) also put Ludwig Wörl (...
11,1.0,treatment,himself at risk in order to obtain at least a ...,
12,1.0,forge,obtain at least a minimum of the required medi...,Ludwig Wörl (1906-1967) would forge selection ...


In [48]:
# Apply the function to the DataFrame and store the results in separate columns
sample_table_4[['Full Name', 'First Name', 'Last Name']] = sample_table_4.apply(extract_name_info, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [49]:
sample_table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name,Full Name,First Name,Last Name
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...,Lagerälteste,Lagerälteste,
6,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Appointed as the Lagerälteste (the camp elder)...,Lagerälteste,Lagerälteste,
8,1.0,put himself at risk,"barracks, wörl, against the express orders of ...",,,,
9,1.0,obtain,"the ss, employed jewish doctors, thus saving t...",Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl
11,1.0,treatment,himself at risk in order to obtain at least a ...,,,,
12,1.0,forge,obtain at least a minimum of the required medi...,Ludwig Wörl (1906-1967) would forge selection ...,Ludwig Wörl,Ludwig,Wörl


In [51]:
table_5 = sample_table_4[['Story ID','Full Name', 'Last Name', 'First Name', 'Rescuing_phrases-name']]

In [52]:
table_5

Unnamed: 0,Story ID,Full Name,Last Name,First Name,Rescuing_phrases-name
0,1.0,Ludwig Wörl,Wörl,Ludwig,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,Ludwig Wörl,Wörl,Ludwig,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,Ludwig Wörl,Wörl,Ludwig,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,Wörl,Ludwig,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
4,1.0,Lagerälteste,,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
6,1.0,Lagerälteste,,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
8,1.0,,,,
9,1.0,Ludwig Wörl,Wörl,Ludwig,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...
11,1.0,,,,
12,1.0,Ludwig Wörl,Wörl,Ludwig,Ludwig Wörl (1906-1967) would forge selection ...


In [281]:
table_5 = sample_table_4[['Story ID','Full Name', 'Last Name', 'First Name', 'Resuing_phrases-name']]

Unnamed: 0,Story ID,Full Name,Last Name,First Name,Resuing_phrases-name
0,1.0,,,,
1,1.0,,,,
2,1.0,Lagerälteste,,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
3,1.0,Lagerälteste,,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
4,1.0,Lagerälteste,,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
5,1.0,Lagerälteste,,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
6,1.0,,,,
7,1.0,,,,
8,1.0,,,,
9,1.0,,,,


In [None]:
table_5.to_csv('3415_SPACY_All_Rescuers.xlsx', index = False)

### Rescuing DATE

In [91]:
# Function to find the full name of the rescuer, first name, last name, and the rescued date
def extract_name_and_date(row):
    sentence = row['Rescuing_phrases-name']  # Adjusted column name
    rescuing_verb = row['Rescuing Verb']    # Adjusted column name
    
    # Skip NaN values
    if pd.isna(sentence):
        return pd.Series([None, None, None, None])  # Return None for all if sentence is NaN
    
    # Process sentence with Spacy
    doc = nlp(sentence)
    
    # Extract full name (PERSON) and rescued date (DATE)
    full_name = None
    rescued_date = None
    for ent in doc.ents:
        if ent.label_ == "PERSON" and full_name is None:  # Get the first PERSON entity
            full_name = ent.text
        if ent.label_ == "DATE" and rescued_date is None:  # Get the first DATE entity
            rescued_date = ent.text

    if not full_name:
        return pd.Series([None, None, None, rescued_date])  # Return None for names if no PERSON found
    
    # Split the full name into first name and last name
    name_parts = full_name.split()
    if len(name_parts) > 1:
        last_name = name_parts[-1]
        first_name = " ".join(name_parts[:-1])
    else:
        first_name = full_name
        last_name = None  # If no last name, leave it as None
    
    return pd.Series([full_name, first_name, last_name, rescued_date])

In [92]:
# Apply the function to the DataFrame and store the results in separate columns
sample_table_4[['Full Name', 'First Name', 'Last Name', 'Rescued Date']] = sample_table_4.apply(extract_name_and_date, axis=1)

# Print the resulting DataFrame
print(sample_table_4)

    Story ID        Rescuing Verb  \
0        1.0                 sent   
1        1.0                 sent   
2        1.0                 sent   
3        1.0                 sent   
4        1.0             employed   
6        1.0               saving   
8        1.0  put himself at risk   
9        1.0               obtain   
11       1.0            treatment   
12       1.0                forge   
13       1.0            influence   
14       1.0           protecting   
16       1.0             exempted   
17       1.0            protected   
18       1.0               helped   
19       1.0            dedicated   
20       2.0                 sent   
21       2.0                 sent   
24       2.0                 sent   
25       2.0                 sent   
28       2.0             arranged   
30       2.0               rescue   
31       2.0             provided   
33       2.0             employed   
35       2.0               helped   
37       3.0             smuggled   
3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [93]:
sample_table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name,Full Name,First Name,Last Name,Rescued Date
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl,1906-1967
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl,1942
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl,1906-1967
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl,1942
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...,Lagerälteste,Lagerälteste,,
6,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Appointed as the Lagerälteste (the camp elder)...,Lagerälteste,Lagerälteste,,
8,1.0,put himself at risk,"barracks, wörl, against the express orders of ...",,,,,
9,1.0,obtain,"the ss, employed jewish doctors, thus saving t...",Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl,1906-1967
11,1.0,treatment,himself at risk in order to obtain at least a ...,,,,,
12,1.0,forge,obtain at least a minimum of the required medi...,Ludwig Wörl (1906-1967) would forge selection ...,Ludwig Wörl,Ludwig,Wörl,1906-1967


In [None]:
table_6.to_csv('3415_SPACY_All_Rescuers_Dates.xlsx', index = False)

In [94]:
# 문장 처리
# doc = nlp(sentence)

# 사람(Entity) 추출
people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
print("Detected people:", people)

# 의존성 파싱으로 주어-동사 관계 분석
for token in doc:
    if token.dep_ == "nsubj" and token.head.lemma_ == "help":
        print(f"Subject: {token.text}, Verb: {token.head.text}")

Detected people: ['Vashem', 'Werner Krumme']


In [None]:
3415_SPACY_All_Rescuers.XLS

In [None]:
output_file_name = '3415_SPACY_All_Names.xlsx'
output.to_csv(f'./output/{output_file_name}')