# 0. Read Source Data & Preprocessing

In [67]:
!pip install allennlp allennlp-models

Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting allennlp-models
  Downloading allennlp_models-2.10.1-py3-none-any.whl (464 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.5/464.5 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nltk>=3.6.5
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting cached-path<1.2.0,>=1.1.3
  Downloading cached_path-1.1.6-py3-none-any.whl (26 kB)
Collecting fairscale==0.4.6
  Downloading fairscale-0.4.6.tar.gz (248 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.2/248.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requi

In [1]:
# Export Requirements.txt for this python env
# !pip freeze > requirements.txt

In [1]:
import pandas as pd

In [2]:
data = pd.read_excel('./data/3415_All_STORIES_Updated_ver02.xlsx', index_col = None, header = 0)

In [3]:
# Function to extract story content after the names
def extract_story(text):
    """
    Extracts the main content of a story from a block of text by detecting where the actual story begins, 
    typically after introductory names or short segments.

    Parameters:
    - text (str): A block of text containing names, introductions, and the main story content, typically separated 
                  by line breaks.

    Returns:
    - str: The extracted story content as a single string, starting from the first detected full sentence or longer 
           segment and including all subsequent lines.

    Description:
    - The function splits the input text into lines using line breaks.
    - It detects the start of the main story by identifying the first line that is likely a complete sentence, 
      defined here as having more than 3 words.
    - After identifying the start of the story, all subsequent lines are considered part of the story and are combined 
      into a single string.

    Examples:
    - extract_story("John Doe\nMary Smith\n\nOnce upon a time, there was a brave soldier.") ->
        "Once upon a time, there was a brave soldier."
    - extract_story("Name List:\nDr. Alice\nMr. Bob\n\nHe started his journey on a rainy day.") ->
        "He started his journey on a rainy day."
    """
    # Step 1: Split the text by line breaks
    lines = text.strip().split('\n')

    # Step 2: Detect the start of the story (the first longer segment, which should be a complete sentence)
    story_started = False
    story_lines = []

    # Iterate through each line to detect the story start
    for line in lines:
        # If the line contains more than a few words (let's say more than 3), assume it's the start of the story
        if len(line.split()) > 3:
            story_started = True
        # Once we detect the start of the story, append the rest of the lines
        if story_started:
            story_lines.append(line)
    
    # Step 3: Combine the story lines into a single string and return the result
    return ' '.join(story_lines).strip()

In [4]:
data['STORY TEXT'] = data['STORY TEXT'].apply(extract_story)

In [5]:
# remove '\n' and '_x000D_'
data['STORY TEXT'] = data['STORY TEXT'].str.replace("_x000D_|_x000D", " ")
data['STORY TEXT'] = data['STORY TEXT'].replace(r'\s+|\\n', ' ', regex=True)

  


In [6]:
# we have
# 3414 of stories
data

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,Ludwig Wörl (1906-1967) spent 11 years of his ...
1,2.0,Johann Pscheidt was born in the city of Radaut...
2,3.0,"Even before the war, Professor Tadeusz Czeżows..."
3,4.0,"During the occupation, Władysław Kowalski, a q..."
4,6.0,"Władysława Choms, the wife of a major in the P..."
...,...,...
3409,13524.0,"Mordechai Wulkan (b. 1910) and his wife, Chaja..."
3410,13611.0,"Sala Armel-Goldhar lived with her parents, Isr..."
3411,5257.0,Following Helena Zienowicz’s graduation from t...
3412,3053.0,"Before the war, Badowski Stefan Franciszek liv..."


# 1. Milestone 1: Find Names - NER & Co-reference Resolution

## 1.1 Find all Names: Simple NER: Start with SPACY Package and model “en_core_web_md” + "en_coreference_web_trf"

In [7]:
import spacy
import neuralcoref
from spacy import displacy

In [33]:
# !python -m spacy download en_core_web_md

Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [8]:
# Load SpaCy's English model
nlp = spacy.load('en_core_web_md')

# Add neuralcoref to SpaCy's pipeline
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fb5b1922610>

## Step A. Coreference Resolution

In [10]:
# import tqdm to track a progress 
from tqdm import tqdm
# Ensure tqdm progress bar works with pandas
tqdm.pandas()

In [11]:
def resolve_coreferences(data, column_name):
    """
    Applies coreference resolution on a specified column of a DataFrame using SpaCy and NeuralCoref.

    Parameters:
    - data (pd.DataFrame): The DataFrame containing the text data.
    - column_name (str): The name of the column to apply coreference resolution on.

    Returns:
    - pd.DataFrame: A DataFrame with the coreferences resolved in the specified column.
    """
    # Apply coreference resolution using the SpaCy NLP pipeline
    data[column_name] = data[column_name].progress_apply(lambda row: nlp(row)._.coref_resolved)
    
    return data

In [12]:
resolved_data = resolve_coreferences(data, 'STORY TEXT')

100%|██████████| 3414/3414 [39:45<00:00,  1.43it/s] 


In [27]:
resolved_data.to_csv('./output/3415_Coreference_Resolution.csv', index = False)

In [68]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref

  from .autonotebook import tqdm as notebook_tqdm


In [69]:
# Load the pre-trained coreference resolution model from AllenNLP
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")

def resolve_coreferences_with_allennlp(text):
    """
    Resolves coreferences in the given text using AllenNLP's coreference resolution model.

    Parameters:
    - text (str): The input text in which coreferences need to be resolved.

    Returns:
    - str: The text with resolved coreferences.
    """
    # Use the predictor to process the text and resolve coreferences
    result = predictor.predict(document=text)
    resolved_text = predictor.coref_resolved(text)
    return resolved_text

Downloading: 100%|██████████| 414/414 [00:00<00:00, 986kB/s]
Downloading: 100%|██████████| 208k/208k [00:00<00:00, 26.3MB/s]
Downloading: 100%|██████████| 634M/634M [01:03<00:00, 10.4MB/s]   
Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Spacy models 'en_core_web_sm' not found.  Downloading and installing.


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [70]:
data['STORY TEXT'][:10]

0    Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1    Johann Pscheidt was born in the city of Radaut...
2    Even before the war, Professor Tadeusz Czeżows...
3    During the occupation, Władysław Kowalski, a q...
4    Władysława Choms, the wife of a major in the P...
5    At various times during the occupation, Ludwik...
6    Hauptmann Hans Hartmann, aged 40, was one of t...
7    Gertruda Babilinska was born in 1902 in Starog...
8    Edward Chacza, who lived in Baranowicze in the...
9    During the war, Domna Semenyuk was a farmer li...
Name: STORY TEXT, dtype: object

In [71]:
sample_data = data['STORY TEXT'][:10]

In [73]:
sample_resolved = sample_data.progress_apply(resolve_coreferences_with_allennlp)

  num_effective_segments = (seq_lengths + self._max_length - 1) // self._max_length
100%|██████████| 10/10 [03:01<00:00, 18.14s/it]


In [94]:
sample_story = resolve_coreferences_with_allennlp(data['STORY TEXT'][3401])

  num_effective_segments = (seq_lengths + self._max_length - 1) // self._max_length


In [95]:
data['STORY TEXT'][3401]

'Rudolf Hermelin, born in Warsaw in 1897, found himself, together with himself wife and daughter, in the Warsaw Ghetto as soon as it was established. himself could not find a job and failed to adapt to life there. In April 1941 himself and himself family moved to the area called Plebania, which was centered around the All Saints church. the All Saints church was included in the Warsaw Ghetto and run by Rev. Marceli Godlewski (recognized by Yad Vashem as Righteous Among the Nations in 2009). the All Saints church served the ghetto’s community of converts, and there Hermelin managed to get the position as manager and community board member. During the Great Aktion (mass execution) of 1942, the Germans eradicated the converted community. the Germans members were sent to the Umschlagplatz (The departure point in Warsaw from which hundreds of thousands of Jews were deported to Nazi extermination camps for final selection and from there to Treblinka). Among Nazi extermination camps were Rudo

In [96]:
sample_story # ALLEN NLP

'Rudolf Hermelin, born in Warsaw in 1897, found Rudolf Hermelin, born in Warsaw in 1897, together with Rudolf Hermelin, born in Warsaw in 1897 wife and daughter, in the Warsaw Ghetto as soon as the Warsaw Ghetto was established. Rudolf Hermelin, born in Warsaw in 1897 could not find a job and failed to adapt to life there. In April 1941 Rudolf Hermelin, born in Warsaw in 1897 and Rudolf Hermelin, born in Warsaw in 1897 family moved to the area called Plebania, which was centered around the All Saints church. the All Saints church was included in the Warsaw Ghetto and run by Rev. Marceli Godlewski (recognized by Yad Vashem as Righteous Among the Nations in 2009). the All Saints church served the Warsaw Ghetto community of converts, and there Rudolf Hermelin, born in Warsaw in 1897 managed to get the position as manager and community board member. During the Great Aktion (mass execution) of 1942, the Germans eradicated the ghetto’s community of converts. the Germans members were sent to 

In [97]:
resolved_data['STORY TEXT'][3401] # NEURAL COREF

'Rudolf Hermelin, born in Warsaw in 1897, found himself, together with himself wife and daughter, in the Warsaw Ghetto as soon as it was established. himself could not find a job and failed to adapt to life there. In April 1941 himself and himself family moved to the area called Plebania, which was centered around the All Saints church. the All Saints church was included in the Warsaw Ghetto and run by Rev. Marceli Godlewski (recognized by Yad Vashem as Righteous Among the Nations in 2009). the All Saints church served the ghetto’s community of converts, and there Hermelin managed to get the position as manager and community board member. During the Great Aktion (mass execution) of 1942, the Germans eradicated the converted community. the Germans members were sent to the Umschlagplatz (The departure point in Warsaw from which hundreds of thousands of Jews were deported to Nazi extermination camps for final selection and from there to Treblinka). Among Nazi extermination camps were Rudo

In [None]:

# Example usage
input_text = """
Johann Pscheidt was born in the city of Radauti near Czernowitz. He helped poor Jews in the city with food and money.
"""
resolved_text = resolve_coreferences_with_allennlp(input_text)
print("Original Text:")
print(input_text)
print("\nResolved Text:")
print(resolved_text)

## Step B. find Named Entity (PERSON)(Table_1)

In [112]:
def extract_person_and_sentence(data):
    """
    Extracts PERSON entities and their corresponding sentences from each story in the DataFrame.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the stories with 'STORY ID' and 'STORY TEXT' columns.

    Returns:
    - pd.DataFrame: A DataFrame with columns ['storynumber', 'name', 'sentence'], 
                    containing PERSON entities and the sentences they appear in.
    """
    # Initialize a list to store the results
    results = []

    # Function to extract PERSON entities and their sentences from a single row
    def extract_from_row(row):
        story_id = row['STORY ID']
        story_text = row['STORY TEXT']
        
        # Parse the story text using SpaCy
        doc = nlp(story_text)
        
        # Extract PERSON entities and their sentences
        for sentence in doc.sents:
            for entity in sentence.ents:
                if entity.label_ == 'PERSON':
                    results.append({
                        'storynumber': story_id,
                        'name': entity.text,
                        'sentence': sentence.text
                    })

    # Apply the extraction function to each row in the DataFrame
    data.progress_apply(extract_from_row, axis=1)
    # data.progress_apply(extract_from_row)

    # Convert results to a DataFrame and return
    results_df = pd.DataFrame(results, columns=['storynumber', 'name', 'sentence'])
    return results_df

In [14]:
table_1 = extract_person_and_sentence(resolved_data)

100%|██████████| 3414/3414 [45:15<00:00,  1.26it/s] 


In [113]:
df = {'STORY ID' : [13253],
      'STORY TEXT' : sample_story}

In [114]:
pd.DataFrame(df)

Unnamed: 0,STORY ID,STORY TEXT
0,13253,"Rudolf Hermelin, born in Warsaw in 1897, found..."


In [115]:
sample_table_1 = extract_person_and_sentence(pd.DataFrame(df))

100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


In [116]:
sample_table_1


Unnamed: 0,storynumber,name,sentence
0,13253,Rudolf Hermelin,"Rudolf Hermelin, born in Warsaw in 1897, found..."
1,13253,Rudolf Hermelin,"Rudolf Hermelin, born in Warsaw in 1897, found..."
2,13253,Rudolf Hermelin,"Rudolf Hermelin, born in Warsaw in 1897, found..."
3,13253,Rudolf Hermelin,"Rudolf Hermelin, born in Warsaw in 1897 could ..."
4,13253,Rudolf Hermelin,"In April 1941 Rudolf Hermelin, born in Warsaw ..."
...,...,...,...
87,13253,Maria Swiader,"On April 11, 2016, Yad Vashem recognized Adam ..."
88,13253,Magdalena Miedziejewska,"On April 11, 2016, Yad Vashem recognized Adam ..."
89,13253,Marta Kielak,"On April 11, 2016, Yad Vashem recognized Adam ..."
90,13253,Adam Swiader,"On April 11, 2016, Yad Vashem recognized Adam ..."


In [15]:
table_1

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
...,...,...,...
108159,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
108160,4481.0,Władyslawa,"On January 17,1990, Yad Vashem recognized Józe..."
108161,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
108162,4481.0,Teresa Drewek-,"On January 17,1990, Yad Vashem recognized Józe..."


In [16]:
# Extract Output
table_1.to_csv('./output/3415_SPACY_All_Names.csv', index = False)

## Step C. extract_rescuing_phrases

## 1.2 Find Rescuers Names: Filtering Only Rescuers (Righteous Among the Nations) Names

### 1.2.1.1 Step 1: Preparing a table of filtered sentences: “Rescuing Phrases-Name” - Lexicon-based

In [17]:
# Table 2 (lexicon)
table_2 = pd.read_excel('./data/Final_Lexicon_Updated.xlsx')

In [18]:
table_2

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase
0,1.0,sent,his life as a political prisoner in nazi conce...
1,1.0,sent,"to the camp’s joinery and later, as a trained ..."
2,1.0,employed,personnel. appointed as the lagerälteste (the ...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ..."
...,...,...,...
8922,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8923,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8924,5257.0,sheltered,"Initially, they were only supposed to stay wit..."
8925,5257.0,obtained,Helenapresented the fugitives as her brother’s...


### Table 3 (Extract_Rescuing_Phrases from table_1 using table_2)

In [117]:
def extract_rescuing_phrases(table_1, table_2):
    """
    Extracts rescuing phrases from table_1 based on the rescuing verbs found in table_2 and
    constructs a new DataFrame (table_3) with the matching Story ID, Rescuing Verb, and corresponding phrases.

    Parameters:
    - table_1 (pd.DataFrame): DataFrame containing story numbers and sentences.
    - table_2 (pd.DataFrame): DataFrame containing story IDs and rescuing verbs.

    Returns:
    - pd.DataFrame: A new DataFrame (table_3) with columns ['Story ID', 'Rescuing Verb', 'Rescuing_phrases-name'].
    """
    # Initialize the resulting DataFrame
    table_3 = pd.DataFrame(columns=['Story ID', 'Rescuing Verb', 'Rescuing_phrases-name'])

    # Iterate through each row of table_1
    for row in tqdm(table_1.itertuples(), total=table_1.shape[0]):
        # Extract the list of verbs to find from table_2 based on the Story ID
        verbs_to_find = list(set(table_2[table_2['Story ID'] == row.storynumber]['Rescuing Verb']))

        # Check if each verb is present in the current sentence
        for verb in verbs_to_find:
            if verb in row.sentence:
                # Create a new row with the matching story ID, verb, and sentence
                new_row = {
                    'Story ID': row.storynumber,
                    'Rescuing Verb': str(verb),
                    'Rescuing_phrases-name': row.sentence
                }
                # Append the new row to table_3
                table_3.loc[len(table_3)] = new_row

    return table_3

In [118]:
sample_table_3 = extract_rescuing_phrases(sample_table_1, table_2)

100%|██████████| 92/92 [00:00<00:00, 407.18it/s]


In [53]:
table_3 = extract_rescuing_phrases(table_1, table_2)

100%|██████████| 108164/108164 [00:56<00:00, 1927.91it/s]


In [54]:
table_3.drop_duplicates(inplace = True)
table_3.reset_index(drop = True, inplace = True)

In [119]:
sample_table_3

Unnamed: 0,Story ID,Rescuing Verb,Rescuing_phrases-name
0,13253,sent,the Germans members were sent to the Umschlagp...
1,13253,sent,"In February 1943, Rev. Czarnecki, Rev. Godlews..."
2,13253,sent,"In February 1943, Rev. Czarnecki, Rev. Godlews..."
3,13253,sent,"In February 1943, Rev. Czarnecki, Rev. Godlews..."
4,13253,sent,"In February 1943, Rev. Czarnecki, Rev. Godlews..."
...,...,...,...
95,13253,provided,"When Adam Swiader sister, Franciszka Setkowska..."
96,13253,to take care of,"When Adam Swiader sister, Franciszka Setkowska..."
97,13253,cared for,"Rudolf Hermelin, born in Warsaw in 1897 dedica..."
98,13253,cared for,"Rudolf Hermelin, born in Warsaw in 1897 dedica..."


In [55]:
table_3

Unnamed: 0,Story ID,Rescuing Verb,Rescuing_phrases-name
0,1.0,sent,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,saving,Appointed as the Lagerälteste (the camp elder)...
3,1.0,employed,Appointed as the Lagerälteste (the camp elder)...
4,1.0,treatment,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...
...,...,...,...
8125,4481.0,looked after,"The Matuszewskis, guided by humanitarian motiv..."
8126,4481.0,arranged,the children arranged for Regina to move in wi...
8127,4481.0,help,"The Kaczmareks made Regina feel at home, and w..."
8128,4481.0,looked after,"The Kaczmareks made Regina feel at home, and w..."


In [56]:
samples = table_3[table_3['Story ID'] == 13253]

In [66]:
samples.to_csv('./drafts_csv/samples.csv', index = False)

## Step D. Find Rescuer (Table_5)

### 1.2.1.2 Step 2:“Rescuers Names filtering” - Step-by-Step Breakdown

In [120]:
def find_rescuers(row):
    """
    Identifies rescuers in a sentence by analyzing the rescuing verb and extracting PERSON entities 
    associated with it, including handling cases where multiple people are connected with 'and'.

    Parameters:
    - row (pd.Series): A row from a DataFrame containing at least the following columns:
                       - 'Rescuing_phrases-name': A sentence that describes the rescuing action.
                       - 'Rescuing Verb': The verb that indicates the rescuing action.
                       - 'Story ID': A unique identifier for the story the sentence belongs to.

    Returns:
    - list of dict: A list of dictionaries, each containing the following keys:
        - 'Story ID': The ID of the story to which the sentence belongs.
        - 'Rescuer': The name of the rescuer identified in the sentence.
        - 'Sentence': The original sentence from which the rescuer was extracted.
        - 'Rescue Verb': The rescuing verb used to identify the rescuer.
    """
    sentence = row['Rescuing_phrases-name']
    rescue_verb = row['Rescuing Verb']
    doc = nlp(sentence)
    
    # Store all rescuers (can be multiple due to 'and')
    rescuers = []
    verb_token = None
    
    # Step 1: Find the rescue verb and check for rescuers based on its position
    for token in doc:
        if token.text.lower() == rescue_verb.lower():
            verb_token = token
            
            # Special case for the verb 'recognized' to get all names after the verb
            if token.text == "recognized":
                # Collect all PERSON entities that appear after the verb 'recognized'
                for ent in doc.ents:
                    if ent.label_ == "PERSON" and ent.start > token.i:
                        rescuers.append(ent.text)
            else:
                # General case: Check for nsubj (subject of the verb)
                for child in token.children:
                    if child.dep_ == "nsubj":
                        # Check if the subject is a PERSON entity and likely a rescuer
                        for ent in doc.ents:
                            if ent.text == child.text and ent.label_ == "PERSON":
                                # Add context filtering to ensure it's a rescuer, not a rescued person
                                if "help" in child.head.lemma_ or "rescue" in child.head.lemma_:
                                    rescuers.append(child.text)
                                # Handle 'and'-connected people (add each person separately)
                                for conjunct in child.conjuncts:
                                    if conjunct.ent_type_ == "PERSON":
                                        rescuers.append(conjunct.text)

    # Step 2: If no PERSON nsubj was found, find the closest PERSON before the verb
    if not rescuers and verb_token and rescue_verb.lower() != "recognized":
        closest_rescuer = None
        closest_distance = float('inf')
        
        # Find PERSON entities before the verb, with additional validation
        for ent in doc.ents:
            if ent.label_ == "PERSON" and ent.end <= verb_token.i:
                distance = verb_token.i - ent.end
                if distance < closest_distance:
                    # Validate the closest entity to avoid including rescued persons
                    if not any(rescue_verb in sent for sent in ent.sent.text):
                        closest_distance = distance
                        closest_rescuer = ent.text
        if closest_rescuer:
            rescuers.append(closest_rescuer)

    # Return each rescuer as a separate row
    return [{'Story ID': row['Story ID'], 'Rescuer': rescuer, 'Sentence': sentence, 'Rescue Verb': rescue_verb} for rescuer in rescuers]

In [121]:
# Apply the function to the DataFrame
sample_results = sample_table_3.progress_apply(find_rescuers, axis=1)

100%|██████████| 100/100 [00:08<00:00, 12.33it/s]


In [122]:
# Flatten the results (since it's a list of lists)
flattened_results = [item for sublist in sample_results if sublist for item in sublist]

# Convert to DataFrame
fullname_df = pd.DataFrame(flattened_results)

In [135]:
fullname_df.drop_duplicates(inplace = True)

In [65]:
fullname_df

Unnamed: 0,Story ID,Rescuer,Sentence,Rescue Verb
0,13253.0,Godlewski,"In February 1943, Rev. Czarnecki, Rev. Godlews...",sent
1,13253.0,Rudolf,"two Poles to him, Adam Swiader and another man...",arranged
2,13253.0,Adam Swiader,"two Poles to him, Adam Swiader and another man...",transfer
3,13253.0,Rudolf,Adam Swiader took Rudolf home while The plan w...,developed
4,13253.0,Rudolf,"Between then and the liberation, Rudolf change...",aiding
5,13253.0,Rudolf,"Between then and the liberation, Rudolf change...",safe
6,13253.0,Rudolf,"Between then and the liberation, Rudolf change...",helping
7,13253.0,Rudolf,"Between then and the liberation, Rudolf change...",dedicating
8,13253.0,Rudolf,"Though she worked for a German woman, she took...",provided
9,13253.0,Adam Swiader,"Adam Swiader came to the rescue again, placing...",rescue


In [136]:
table_5 = fullname_df[['Story ID', 'Sentence', 'Rescuer']]

In [137]:
table_5.columns = ['Story ID', 'Sentence', 'Fullname']

In [126]:
# Function to split full name into first and last name
def split_name(full_name):
    """
    Splits a full name into first and last names.

    Parameters:
    - full_name (str): The full name of a person, which may consist of one or more parts.

    Returns:
    - pd.Series: A pandas Series with two elements:
        - First element: First name (str), containing all parts of the name except the last one.
        - Second element: Last name (str or None), containing the last part of the name if present, 
                          or None if the full name consists of a single part.
                          
    Examples:
    - split_name('John Doe') -> Series(['John', 'Doe'])
    - split_name('Jane') -> Series(['Jane', None])
    - split_name('Alice Marie Smith') -> Series(['Alice Marie', 'Smith'])
    """
    # Split the full name into parts using spaces as separators
    name_parts = full_name.split()

    # If there are multiple parts, assign the last part as the last name and the rest as the first name
    if len(name_parts) > 1:
        last_name = name_parts[-1]  # The last part of the split is assumed to be the last name
        first_name = " ".join(name_parts[:-1])  # Join all parts except the last one as the first name
    else:
        # If there's only one part, consider it as the first name with no last name
        first_name = full_name
        last_name = None

    # Return the first and last names as a pandas Series
    return pd.Series([first_name, last_name])

In [138]:
table_5[['Firstname', 'Lastname']] = table_5['Fullname'].progress_apply(split_name)

100%|██████████| 11/11 [00:00<00:00, 4029.11it/s]


In [139]:
table_5

Unnamed: 0,Story ID,Sentence,Fullname,Firstname,Lastname
0,13253,"In February 1943, Rev. Czarnecki, Rev. Godlews...",Godlewski,Godlewski,
4,13253,another man mission was to transfer Rudolf Her...,Rudolf Hermelin,Rudolf,Hermelin
6,13253,"Adam Swiader took Rudolf Hermelin, born in War...",Rudolf Hermelin,Rudolf,Hermelin
8,13253,"Between then and the liberation, Rudolf Hermel...",Rudolf Hermelin,Rudolf,Hermelin
9,13253,"Between then and the liberation, Rudolf Hermel...",Rudolf Hermelin,Rudolf,Hermelin
10,13253,"Between then and the liberation, Rudolf Hermel...",Rudolf Hermelin,Rudolf,Hermelin
11,13253,"Between then and the liberation, Rudolf Hermel...",Rudolf Hermelin,Rudolf,Hermelin
44,13253,"Rudolf Hermelin, born in Warsaw in 1897 then f...",Rudolf Hermelin,Rudolf,Hermelin
46,13253,Though Magdalena Miedziejewska worked for a Ge...,Magdalena Miedziejewska,Magdalena,Miedziejewska
52,13253,"Adam Swiader came to the rescue again, placing...",Adam Swiader,Adam,Swiader


In [35]:
table_5.to_csv('./output/3415_SPACY_All_Rescuers.csv', index = False)

## Step E. Find Rescue Dates

# Milestone 2: Find "Additional Properties" using NER

## 1.3 Adding Rescue dates (table_6)

In [36]:
table_6 = table_5.copy()

In [37]:
from datetime import datetime
import dateutil.parser

# Define the date range
start_date = datetime(1939, 9, 1)
end_date = datetime(1945, 5, 9)

# Function to extract the rescued date from the sentence using NER
def extract_rescue_date(sentence):
    """
    Extracts a date from the given sentence using Named Entity Recognition (NER) 
    and checks if the extracted date falls within the specified range.

    Parameters:
    - sentence (str): A sentence from which the function will attempt to extract a date.

    Returns:
    - str or None: The extracted date as a string if it falls within the defined date range 
                   (from September 1, 1939, to May 9, 1945). Returns None if no valid date 
                   is found or if the date is outside the specified range.
    """
    # Parse the sentence using SpaCy to extract entities
    doc = nlp(sentence)
    
    # Iterate over the identified entities
    for ent in doc.ents:
        # Check if the entity is labeled as a DATE
        if ent.label_ == "DATE":
            try:
                # Parse the date using dateutil.parser
                extracted_date = dateutil.parser.parse(ent.text, fuzzy=True)
                
                # Check if the parsed date is within the defined range
                if start_date <= extracted_date <= end_date:
                    return ent.text  # Return the date if it's within the range
            except (ValueError, TypeError):
                # Skip dates that cannot be parsed
                pass
    
    # Return None if no valid date is found or it's outside the range
    return None

In [38]:
# Apply the function to the 'Sentence' column and create a new column 'Rescue Date'
table_6['Rescue Date'] = table_6['Sentence'].progress_apply(extract_rescue_date)

100%|██████████| 6610/6610 [02:48<00:00, 39.24it/s]


In [44]:
table_6

Unnamed: 0,Story ID,Sentence,Fullname,Firstname,Lastname,Rescue Date
0,1.0,Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl,
1,1.0,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl,1942
2,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,,
3,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,,
4,1.0,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl,
...,...,...,...,...,...,...
6605,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,,
6606,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek,
6607,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,,
6608,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek,


In [45]:
table_6.to_csv('./output/3415_SPACY_All_Rescuers_Dates.csv', index = False)

## Step F. Extract other information

## 1.4 Adding Additional Properties (table_7)

i. For each rescuer (“righteous”), Extract these additional properties, if possible, from the stories.  
ii. Create a separate and new Excel ("3415_SPACY_All_Rescuers_Properties.xlsx") like that:

In [41]:
table_7 = table_6.copy()

In [42]:
# List of common professions
profession_keywords = ['doctor', 'nurse', 'soldier', 'engineer', 'teacher']

# List of common religions
religion_keywords = ['Christian', 'Christianity', 'Muslim', 'Islam', 'Jewish', 'Judaism', 'Buddhist', 'Hindu', 'Hinduism']

# Function to extract information
def extract_information(sentence):
    """
    Extracts various attributes related to rescuers from a given sentence using Named Entity Recognition (NER) 
    and custom keyword matching.

    Parameters:
    - sentence (str): A sentence containing potential information about a rescuer, such as profession, birthplace, age, 
                      birth date, religion, organizational affiliation, and rescue place.

    Returns:
    - pd.Series: A pandas Series containing the extracted information with the following fields:
        - 'Rescuer Profession' (str or None): Extracted profession of the rescuer based on predefined keywords.
        - 'Rescuer Birthplace' (str or None): Place of birth if identified in the sentence.
        - 'Rescuer Age' (str or None): Age of the rescuer if specified in the sentence.
        - 'Rescuer Birth Date' (str or None): Birth date of the rescuer if found in the sentence.
        - 'Rescuer Religion' (str or None): Religion of the rescuer matched from the predefined list of religions.
        - 'Organizational Affiliation' (str or None): Name of the organization the rescuer is associated with.
        - 'Rescue Place' (str or None): Location related to the rescue operation.
    """
    # Parse the sentence using SpaCy to extract entities
    doc = nlp(sentence)
    
    # Initialize information dictionary
    rescuer_info = {
        'Rescuer Profession': None,
        'Rescuer Birthplace': None,
        'Rescuer Age': None,
        'Rescuer Birth Date': None,
        'Rescuer Religion': None,
        'Organizational Affiliation': None,
        'Rescue Place': None,
    }
    
    # Extract NER entities and fill in the rescuer_info dictionary
    for ent in doc.ents:
        if ent.label_ == "DATE":
            # Check for context indicating birth date or age
            if "born" in sentence or "birth" in sentence:
                rescuer_info['Rescuer Birth Date'] = ent.text
            elif "years old" in sentence:
                rescuer_info['Rescuer Age'] = ent.text
        elif ent.label_ == "ORG":
            # Extract organizational affiliation
            rescuer_info['Organizational Affiliation'] = ent.text
        elif ent.label_ == "GPE":
            # Distinguish between birthplace and rescue place based on context
            if "born" in sentence:
                rescuer_info['Rescuer Birthplace'] = ent.text
            else:
                rescuer_info['Rescue Place'] = ent.text
        elif ent.label_ == "NORP":
            # Match with predefined religions
            if ent.text in religion_keywords:
                rescuer_info['Rescuer Religion'] = ent.text

    # Custom keyword-based extraction for profession
    for keyword in profession_keywords:
        if keyword in sentence.lower():
            rescuer_info['Rescuer Profession'] = keyword.capitalize()
            break
    
    # Return the extracted information as a pandas Series
    return pd.Series(rescuer_info)

In [43]:
# Apply the function to extract information for each sentence
table_7 = table_7.join(table_7['Sentence'].progress_apply(extract_information))

100%|██████████| 6610/6610 [02:56<00:00, 37.55it/s]


In [46]:
table_7

Unnamed: 0,Story ID,Sentence,Fullname,Firstname,Lastname,Rescue Date,Rescuer Profession,Rescuer Birthplace,Rescuer Age,Rescuer Birth Date,Rescuer Religion,Organizational Affiliation,Rescue Place
0,1.0,Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl,,,,,,,Dachau,Munich
1,1.0,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl,1942,Nurse,,,,,Auschwitz,
2,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,,,Doctor,,,,Jewish,SS,
3,1.0,Appointed as the Lagerälteste (the camp elder)...,Wörl,Wörl,,,Doctor,,,,Jewish,SS,
4,1.0,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6605,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,,,,,,,,,
6606,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek,,,,,,,,
6607,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Władyslawa,Władyslawa,,,,,,,,,
6608,4481.0,"On January 17,1990, Yad Vashem recognized Józe...",Franciszek Kaczmarek,Franciszek,Kaczmarek,,,,,,,,


In [47]:
table_7.to_csv('./output/3415_SPACY_All_Rescuers_Properties.csv', index = False)

In [None]:
# END