# 0. Read Source Data & Preprocessing

In [1]:
# Export Requirements.txt for this python env
# !pip freeze > requirements.txt

In [2]:
import pandas as pd

In [3]:
data = pd.read_excel('./data/3415_All_STORIES_Updated_ver02.xlsx', index_col = None, header = 0)

In [4]:
# Function to extract story content after the names

def extract_story(text):
    # Step 1: Split the text by line breaks
    lines = text.strip().split('\n')

    # Step 2: Detect the start of the story (the first longer segment, which should be a complete sentence)
    story_started = False
    story_lines = []
    
    for line in lines:
        # If the line contains more than a few words (let's say more than 3), assume it's the start of the story
        if len(line.split()) > 3:
            story_started = True
        # Once we detect the start of the story, append the rest of the lines
        if story_started:
            story_lines.append(line)
    
    # Step 3: Combine the story lines into a single string and return the result
    return ' '.join(story_lines).strip()

In [5]:
data['STORY TEXT'] = data['STORY TEXT'].apply(extract_story)

In [6]:
# remove '\n' and '_x000D_'
data['STORY TEXT'] = data['STORY TEXT'].str.replace("_x000D_|_x000D", " ")
data['STORY TEXT'] = data['STORY TEXT'].replace(r'\s+|\\n', ' ', regex=True)

  


In [7]:
# we have
# 3414 of stories
data

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,Ludwig Wörl (1906-1967) spent 11 years of his ...
1,2.0,Johann Pscheidt was born in the city of Radaut...
2,3.0,"Even before the war, Professor Tadeusz Czeżows..."
3,4.0,"During the occupation, Władysław Kowalski, a q..."
4,6.0,"Władysława Choms, the wife of a major in the P..."
...,...,...
3409,13524.0,"Mordechai Wulkan (b. 1910) and his wife, Chaja..."
3410,13611.0,"Sala Armel-Goldhar lived with her parents, Isr..."
3411,5257.0,Following Helena Zienowicz’s graduation from t...
3412,3053.0,"Before the war, Badowski Stefan Franciszek liv..."


# 1. Milestone 1: Find Names - NER & Co-reference Resolution

## 1.1 Find all Names: Simple NER: Start with SPACY Package and model “en_core_web_md” + "en_coreference_web_trf"

In [8]:
import spacy
import neuralcoref
from spacy import displacy

In [33]:
# !python -m spacy download en_core_web_md

Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [9]:
# Load SpaCy's English model
nlp = spacy.load('en_core_web_md')

# Add neuralcoref to SpaCy's pipeline
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7ff4a1f86b50>

## Coreference Resolution

In [10]:
# import tqdm to track a progress
from tqdm import tqdm
tqdm.pandas()

In [11]:
data['STORY TEXT'] = data['STORY TEXT'].progress_apply(lambda row : nlp(row)._.coref_resolved)

# exporting to draft folder is not the output for this project, it is just temporary file in the process.
data.to_csv('./drafts/3415_all_coref_data.csv', index = False)

100%|██████████| 3414/3414 [39:38<00:00,  1.44it/s] 


In [None]:
data = pd.read_csv('/drafts/3415_all_coref_data.csv')

### Separate into sentences, find Named Entity (PERSON) and make a dataframe

In [14]:
# Function to extract PERSON entities and their sentences from each story
def extract_person_and_sentence(row):
    story_id = row['STORY ID']
    story_text = row['STORY TEXT']
    
    # Parse the story text using Spacy
    doc = nlp(story_text)
    
    # name
    for sentence in doc.sents:
        for entity in sentence.ents:
            if entity.label_ == 'PERSON':
                results.append({
                    'storynumber': story_id,
                    'name': entity.text,
                    'sentence': sentence.text
                })

In [15]:
# Create an empty list to store results
results = []

# Apply the function to each row in the DataFrame
data.progress_apply(extract_person_and_sentence, axis=1)

100%|██████████| 3414/3414 [44:20<00:00,  1.28it/s] 


0       None
1       None
2       None
3       None
4       None
        ... 
3409    None
3410    None
3411    None
3412    None
3413    None
Length: 3414, dtype: object

In [16]:
# Convert the list of results to a DataFrame
table_1 = pd.DataFrame(results)

In [17]:
table_1

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
...,...,...,...
108159,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
108160,4481.0,Władyslawa,"On January 17,1990, Yad Vashem recognized Józe..."
108161,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
108162,4481.0,Teresa Drewek-,"On January 17,1990, Yad Vashem recognized Józe..."


In [71]:
# Extract Output
table_1.to_csv('./output/3415_SPACY_All_Names.xlsx', index = False)

## 1.2 Find Rescuers Names: Filtering Only Rescuers (Righteous Among the Nations) Names

### 1.2.1.1 Step 1: Preparing a table of filtered sentences: “Rescuing Phrases-Name” - Lexicon-based

In [70]:
# table_1 = pd.read_csv('./output/3415_SPACY_All_Names.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: './output/3415_SPACY_All_Names.xlsx'

In [19]:
# Table 1
table_1
# table1 = pd.read_excel('./output/3415_SPACY_All_Names.xlsx', index = False)

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
...,...,...,...
108159,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
108160,4481.0,Władyslawa,"On January 17,1990, Yad Vashem recognized Józe..."
108161,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
108162,4481.0,Teresa Drewek-,"On January 17,1990, Yad Vashem recognized Józe..."


In [20]:
# Table 2 (lexicon)
table_2 = pd.read_excel('./data/Final_Lexicon_Updated.xlsx')

In [21]:
table_2

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase
0,1.0,sent,his life as a political prisoner in nazi conce...
1,1.0,sent,"to the camp’s joinery and later, as a trained ..."
2,1.0,employed,personnel. appointed as the lagerälteste (the ...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ..."
...,...,...,...
8922,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8923,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8924,5257.0,sheltered,"Initially, they were only supposed to stay wit..."
8925,5257.0,obtained,Helenapresented the fugitives as her brother’s...


### Making Table 3 Starts

In [22]:
# input : Table1
# check if rescuing verb appear in the sentence. (Lexicon reference - same story ID)
# If there is rescuing verb existed, (Also name)
# add this sentence to a new table Table3 

In [23]:
#Table3
table_3 = pd.DataFrame(columns = ['Story ID','Rescuing Verb', 'Rescuing_phrases-name'])

for row in tqdm(table_1.itertuples(), total = table_1.shape[0]):
    # storyid = row.storynumber
    # 본 회차 스토리에서 찾아내야하는 verb 목록 불러오고
    verbs_to_find = list(set(table_2[table_2['Story ID'] == row.storynumber]['Rescuing Verb']))

    for verb in verbs_to_find:
        if verb in row.sentence:
            new_row = {'Story ID' : row.storynumber, 'Rescuing Verb' : str(verb), 'Rescuing_phrases-name' : row.sentence}
            table_3.loc[len(table_3)] = new_row

100%|██████████| 108164/108164 [00:53<00:00, 2030.79it/s]


In [24]:
table_3.drop_duplicates(inplace = True)

In [25]:
table_3.reset_index(drop = True, inplace = True)

In [26]:
table_3

Unnamed: 0,Story ID,Rescuing Verb,Rescuing_phrases-name
0,1.0,sent,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,saving,Appointed as the Lagerälteste (the camp elder)...
3,1.0,employed,Appointed as the Lagerälteste (the camp elder)...
4,1.0,obtain,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...
...,...,...,...
8125,4481.0,looked after,"The Matuszewskis, guided by humanitarian motiv..."
8126,4481.0,arranged,the children arranged for Regina to move in wi...
8127,4481.0,looked after,"The Kaczmareks made Regina feel at home, and w..."
8128,4481.0,help,"The Kaczmareks made Regina feel at home, and w..."


In [27]:
table_3.to_csv('./drafts/table_3.xlsx', index = False)

In [80]:
# Table_4 (Merge table_2 and table_3)
# table_4: Complete Lexicon Structure Example
# table_4 = pd.merge(table_2, table_3, how = 'left', on = ['Story ID','Rescuing Verb'])

In [81]:
# table_4
# table_4.to_csv('./drafts/table_4.xlsx', index = False)

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A..."
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...
...,...,...,...,...
31796,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
31797,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
31798,5257.0,sheltered,"Initially, they were only supposed to stay wit...",
31799,5257.0,obtained,Helenapresented the fugitives as her brother’s...,


### 1.2.1.2 Step 2:“Rescuers Names filtering” - Step-by-Step Breakdown

In [28]:
table_4 = table_3.copy()

### THE MOST IDEAL CODE SO FAR

In [29]:
# Function to find the rescuers and handle multiple people connected with 'and'
def find_rescuers(row):
    sentence = row['Rescuing_phrases-name']
    rescue_verb = row['Rescuing Verb']
    doc = nlp(sentence)
    
    # Store all rescuers (can be multiple due to 'and')
    rescuers = []
    verb_token = None
    
    # Step 1: Find the rescue verb and check for rescuers based on its position
    for token in doc:
        if token.text.lower() == rescue_verb.lower():
            verb_token = token
            
            # Special case for the verb 'recognized' to get all names after the verb
            if token.text == "recognized":
                # print('y')
                # Collect all PERSON entities that appear after the verb 'recognized'
                for ent in doc.ents:
                    if ent.label_ == "PERSON" and ent.start > token.i:
                        rescuers.append(ent.text)
            else:
                # General case: Check for nsubj (subject of the verb)
                for child in token.children:
                    if child.dep_ == "nsubj":
                        # Check if the subject is a PERSON entity
                        for ent in doc.ents:
                            if ent.text == child.text and ent.label_ == "PERSON":
                                rescuers.append(child.text)
                                # Handle 'and'-connected people (add each person separately)
                                for conjunct in child.conjuncts:  # Check conjuncts (and-connected tokens)
                                    if conjunct.ent_type_ == "PERSON":
                                        rescuers.append(conjunct.text)

    # Step 2: If no PERSON nsubj was found, find the closest PERSON before the verb
    if not rescuers and verb_token and rescue_verb.lower() != "recognized":
        closest_rescuer = None
        closest_distance = float('inf')
        
        # Find PERSON entities before the verb
        for ent in doc.ents:
            if ent.label_ == "PERSON" and ent.end <= verb_token.i:
                distance = verb_token.i - ent.end
                if distance < closest_distance:
                    closest_distance = distance
                    closest_rescuer = ent.text
        if closest_rescuer:
            rescuers.append(closest_rescuer)

    # Return each rescuer as a separate row
    return [{'Story ID': row['Story ID'], 'Rescuer': rescuer, 'Sentence': sentence, 'Rescue Verb': rescue_verb} for rescuer in rescuers]

In [30]:
# Apply the function to the DataFrame
results = table_4.progress_apply(find_rescuers, axis=1)

100%|██████████| 8130/8130 [03:17<00:00, 41.15it/s]


In [31]:
# Flatten the results (since it's a list of lists)
flattened_results = [item for sublist in results if sublist for item in sublist]

# Convert to DataFrame
fullname_df = pd.DataFrame(flattened_results)

In [32]:
fullname_df

Unnamed: 0,Story ID,Rescuer,Sentence,Rescue Verb
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...,sent
1,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",sent
2,1.0,Wörl,Appointed as the Lagerälteste (the camp elder)...,saving
3,1.0,Wörl,Appointed as the Lagerälteste (the camp elder)...,employed
4,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,obtain
...,...,...,...,...
6605,4481.0,Władyslawa,"On January 17,1990, Yad Vashem recognized Józe...",recognized
6606,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe...",recognized
6607,4481.0,Władyslawa,"On January 17,1990, Yad Vashem recognized Józe...",recognized
6608,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe...",recognized


In [59]:
table_5 = fullname_df[['Story ID', 'Sentence', 'Rescuer']]

In [60]:
table_5.columns = ['Story ID', 'Sentence', 'Fullname']

In [61]:
# Function to split full name into first and last name
def split_name(full_name):
    name_parts = full_name.split()
    if len(name_parts) > 1 :
        last_name = name_parts[-1]
        first_name = " ".join(name_parts[:-1])
    else:
        first_name = full_name
        last_name = None # If there's only one part, we consider it's their first name
    return pd.Series([first_name, last_name])

In [62]:
table_5[['Firstname', 'Lastname']] = table_5['Fullname'].progress_apply(split_name)

100%|██████████| 6610/6610 [00:00<00:00, 9547.37it/s] 


In [63]:
table_5

Unnamed: 0,Story ID,Sentence,Fullname,Firstname,Lastname
0,1.0,Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl
1,1.0,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl
2,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,
3,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,
4,1.0,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl
...,...,...,...,...,...
6605,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,
6606,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek
6607,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,
6608,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek


In [72]:
table_5.to_csv('./output/3415_SPACY_All_Rescuers.xlsx', index = False)

# Milestone 2: Find "Additional Properties" using NER

## 1.3 Adding Rescue dates (table_6)

In [65]:
table_6 = table_5.copy()

In [66]:
from datetime import datetime
import dateutil.parser

# Define the date range
start_date = datetime(1939, 9, 1)
end_date = datetime(1945, 5, 9)

# Function to extract the rescued date from the sentence using NER
def extract_rescue_date(sentence):
    doc = nlp(sentence)
    
    for ent in doc.ents:
        if ent.label_ == "DATE":
            try:
                # Parse the date using dateutil.parser
                extracted_date = dateutil.parser.parse(ent.text, fuzzy=True)
                
                # Check if the date falls within the defined range
                if start_date <= extracted_date <= end_date:
                    return ent.text  # Return the date if it's within the range
            except (ValueError, TypeError):
                # Skip dates that can't be parsed
                pass
                
    return None  # Return None if no valid date is found

In [67]:
# Apply the function to the 'Sentence' column and create a new column 'Rescue Date'
table_6['Rescue Date'] = table_6['Sentence'].progress_apply(extract_rescue_date)

100%|██████████| 6610/6610 [02:54<00:00, 37.96it/s]


In [69]:
table_6

Unnamed: 0,Story ID,Sentence,Fullname,Firstname,Lastname,Rescue Date
0,1.0,Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl,
1,1.0,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl,1942
2,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,,
3,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,,
4,1.0,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl,
...,...,...,...,...,...,...
6605,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,,
6606,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek,
6607,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,,
6608,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek,


In [73]:
table_6.to_csv('./output/3415_SPACY_All_Rescuers_Dates.xlsx', index = False)

## 1.4 Adding Additional Properties (table_7)

i. For each rescuer (“righteous”), Extract these additional properties, if possible, from the stories.  
ii. Create a separate and new Excel ("3415_SPACY_All_Rescuers_Properties.xlsx") like that:

In [74]:
table_7 = table_6.copy()

In [75]:
# List of common professions
profession_keywords = ['doctor', 'nurse', 'soldier', 'engineer', 'teacher']

# List of common religions
religion_keywords = ['Christian', 'Christianity', 'Muslim', 'Islam', 'Jewish', 'Judaism', 'Buddhist', 'Hindu', 'Hinduism']

# Function to extract information
def extract_information(sentence):
    doc = nlp(sentence)
    
    # Initialize information
    rescuer_info = {
        'Rescuer Profession': None,
        'Rescuer Birthplace': None,
        'Rescuer Age': None,
        'Rescuer Birth Date': None,
        'Rescuer Religion': None,
        'Organizational Affiliation': None,
        'Rescue Place': None,
    }
    
    # Extract NER entities and fill in the rescuer_info dictionary
    for ent in doc.ents:
        if ent.label_ == "DATE":
            if "born" in sentence or "birth" in sentence:
                rescuer_info['Rescuer Birth Date'] = ent.text
            elif "years old" in sentence:
                rescuer_info['Rescuer Age'] = ent.text
        elif ent.label_ == "ORG":
            rescuer_info['Organizational Affiliation'] = ent.text
        elif ent.label_ == "GPE":
            if "born" in sentence:
                rescuer_info['Rescuer Birthplace'] = ent.text
            else:
                # Assume GPE might refer to different place types
                rescuer_info['Rescue Place'] = ent.text
        elif ent.label_ == "NORP":
            if ent.text in religion_keywords:
                rescuer_info['Rescuer Religion'] = ent.text

    # Custom keyword-based extraction for profession
    for keyword in profession_keywords:
        if keyword in sentence.lower():
            rescuer_info['Rescuer Profession'] = keyword.capitalize()
            break
    
    return pd.Series(rescuer_info)

In [76]:
# Apply the function to extract information for each sentence
table_7 = table_7.join(table_7['Sentence'].progress_apply(extract_information))

100%|██████████| 6610/6610 [02:51<00:00, 38.57it/s]


In [77]:
table_7

Unnamed: 0,Story ID,Sentence,Fullname,Firstname,Lastname,Rescue Date,Rescuer Profession,Rescuer Birthplace,Rescuer Age,Rescuer Birth Date,Rescuer Religion,Organizational Affiliation,Rescue Place
0,1.0,Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl,,,,,,,Dachau,Munich
1,1.0,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl,1942,Nurse,,,,,Auschwitz,
2,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,,,Doctor,,,,Jewish,SS,
3,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,,,Doctor,,,,Jewish,SS,
4,1.0,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6605,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,,,,,,,,,
6606,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek,,,,,,,,
6607,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,,,,,,,,,
6608,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek,,,,,,,,


In [78]:
table_7.to_csv('./output/3415_SPACY_All_Rescuers_Properties.xlsx', index = False)

In [None]:
# END