In [None]:
# ENV (NLP)

# 0. Read Source Data & Preprocessing

In [1]:
import pandas as pd

In [2]:
data = pd.read_excel('./data/3415_All_STORIES_Updated_ver02.xlsx', index_col = None, header = 0)

In [3]:
# Function to extract story content after the names

def extract_story(text):
    # Step 1: Split the text by line breaks
    lines = text.strip().split('\n')

    # Step 2: Detect the start of the story (the first longer segment, which should be a complete sentence)
    story_started = False
    story_lines = []
    
    for line in lines:
        # If the line contains more than a few words (let's say more than 3), assume it's the start of the story
        if len(line.split()) > 3:
            story_started = True
        # Once we detect the start of the story, append the rest of the lines
        if story_started:
            story_lines.append(line)
    
    # Step 3: Combine the story lines into a single string and return the result
    return ' '.join(story_lines).strip()

In [4]:
data['STORY TEXT'] = data['STORY TEXT'].apply(extract_story)

In [5]:
# we have
# 3414 of stories
data

Unnamed: 0,STORY ID,STORY TEXT
0,1.0,Ludwig Wörl (1906-1967) spent 11 years of his ...
1,2.0,Johann Pscheidt was born in the city of Radaut...
2,3.0,"Even before the war, Professor Tadeusz Czeżows..."
3,4.0,"During the occupation, Władysław Kowalski, a q..."
4,6.0,"Władysława Choms, the wife of a major in the P..."
...,...,...
3409,13524.0,"Mordechai Wulkan (b. 1910) and his wife, Chaja..."
3410,13611.0,"Sala Armel-Goldhar lived with her parents, Isr..."
3411,5257.0,Following Helena Zienowicz’s graduation from t...
3412,3053.0,"Before the war, Badowski Stefan Franciszek liv..."


# 1. Milestone 1: Find Names - NER & Co-reference Resolution

## 1.1 Find all Names: Simple NER: Start with SPACY Package and model “en_core_web_md” + "en_coreference_web_trf"

In [6]:
import spacy
import neuralcoref
from spacy import displacy

In [33]:
# !python -m spacy download en_core_web_md

Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [7]:
# Load SpaCy's English model
nlp = spacy.load('en_core_web_md')

# Add neuralcoref to SpaCy's pipeline
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f9118bbc490>

## Coreference Resolution

In [8]:
# import tqdm to track a progress
from tqdm import tqdm
tqdm.pandas()

In [9]:
data['STORY TEXT'] = data['STORY TEXT'].progress_apply(lambda row : nlp(row)._.coref_resolved)
data.to_csv('./drafts/3415_all_coref_data.csv', index = False)

100%|██████████| 3414/3414 [1:39:23<00:00,  1.75s/it]    


In [None]:
data = pd.read_csv('/drafts/3415_all_coref_data.csv')

### Separate into sentences, find Named Entity (PERSON) and make a dataframe

In [22]:
# Function to extract PERSON entities and their sentences from each story
def extract_person_and_sentence(row):
    story_id = row['STORY ID']
    story_text = row['STORY TEXT']
    
    # Parse the story text using Spacy
    doc = nlp(story_text)
    
    # For each sentence in the document, check for PERSON entities and store the result
    for sentence in doc.sents:
        for entity in sentence.ents:
            if entity.label_ == 'PERSON':
                results.append({
                    'storynumber': story_id,
                    'name': entity.text,
                    'sentence': sentence.text
                })

In [23]:
# Create an empty list to store results
results = []

# Apply the function to each row in the DataFrame
data.progress_apply(extract_person_and_sentence, axis=1)

100%|██████████| 3414/3414 [1:28:26<00:00,  1.55s/it]    


0       None
1       None
2       None
3       None
4       None
        ... 
3409    None
3410    None
3411    None
3412    None
3413    None
Length: 3414, dtype: object

In [24]:
# Convert the list of results to a DataFrame
table_1 = pd.DataFrame(results)

In [25]:
table_1

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
...,...,...,...
107690,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
107691,4481.0,Władyslawa,"On January 17,1990, Yad Vashem recognized Józe..."
107692,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
107693,4481.0,Teresa Drewek-,"On January 17,1990, Yad Vashem recognized Józe..."


In [26]:
# Extract Output
table_1.to_csv('./output/3415_SPACY_All_Names.xlsx', index = False)

## 1.2 Find Rescuers Names: Filtering Only Rescuers (Righteous Among the Nations) Names

### 1.2.1.1 Step 1: Preparing a table of filtered sentences: “Rescuing Phrases-Name” - Lexicon-based

In [None]:
table_1 = pd.read_csv('./output/3415_SPACY_All_Names.xlsx')

In [28]:
# Table 1
table_1
# table1 = pd.read_excel('./output/3415_SPACY_All_Names.xlsx', index = False)

Unnamed: 0,storynumber,name,sentence
0,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,Ludwig Wörl,Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,Ludwig Wörl,After spending some nine months in a dark dete...
4,1.0,Ludwig Wörl,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
...,...,...,...
107690,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
107691,4481.0,Władyslawa,"On January 17,1990, Yad Vashem recognized Józe..."
107692,4481.0,Franciszek Kaczmarek,"On January 17,1990, Yad Vashem recognized Józe..."
107693,4481.0,Teresa Drewek-,"On January 17,1990, Yad Vashem recognized Józe..."


In [29]:
# Table 2 (lexicon)
table_2 = pd.read_excel('./data/Final_Lexicon_Updated.xlsx')

In [30]:
table_2

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase
0,1.0,sent,his life as a political prisoner in nazi conce...
1,1.0,sent,"to the camp’s joinery and later, as a trained ..."
2,1.0,employed,personnel. appointed as the lagerälteste (the ...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ..."
...,...,...,...
8922,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8923,5257.0,looked after,"Helena looked after the children, obtaining ""A..."
8924,5257.0,sheltered,"Initially, they were only supposed to stay wit..."
8925,5257.0,obtained,Helenapresented the fugitives as her brother’s...


#### Trying rescuing verb 등장하는 부분을 lexicon 의 Rescuing Phrase 활용해서 Story 에서 찾기 중인데 쉽지않음

In [98]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [118]:
# Step 1: Merge the two dataframes on 'STORY ID'
merged_df = pd.merge(table_2, data, on='Story ID')

In [119]:
# Step 2: Function to find and replace 'Rescuing Phrase' with the best match in 'STORY TEXT'
def update_rescuing_phrase(row):
    story_text = row['Story Text']
    rescuing_phrase = row['Rescuing Phrase']
    
    # Split the STORY TEXT into sentences
    sentences = story_text.split('. ')
    
    # Split the Rescuing Phrase into its own sentences for matching
    rescuing_sentences = rescuing_phrase.split('. ')
    
    # Track which sentences should be updated
    updated_sentences = []
    
    # Iterate through each rescuing sentence and match it to sentences in the story
    for resc_sentence in rescuing_sentences:
        match = process.extractOne(resc_sentence, sentences, scorer=fuzz.partial_ratio)
        
        # If a good match is found, replace the matched sentence with the rescuing sentence
        if match and match[1] > 80:  # You can adjust the threshold as needed
            matched_sentence = match[0]
            updated_sentences.append(matched_sentence)
            sentences[sentences.index(matched_sentence)] = resc_sentence  # Replace in STORY TEXT
            
    # Join the updated sentences back together
    return '. '.join(sentences)

In [120]:
# Step 3: Apply the function to each row to update the 'Rescuing Phrase'
merged_df['Updated Rescuing Phrase'] = merged_df.apply(update_rescuing_phrase, axis=1)

In [121]:
merged_df

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Story Text,Updated Rescuing Phrase
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,his life as a political prisoner in nazi conce...
1,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,employed,personnel. appointed as the lagerälteste (the ...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ...",Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
...,...,...,...,...,...
84,29.0,sent,"feiner, the bund representative and adolf berm...","From September 1942, even before the establish...",the. When Zegota was established in December 1...
85,29.0,helping,"in december 1942, bartoszewski (whose undercov...","From September 1942, even before the establish...","From September 1942, even before the establish..."
86,29.0,helped,the polish public. he published many articles ...,"From September 1942, even before the establish...","From September 1942, even before the establish..."
87,30.0,sent,"the world looks on”), describing the warsaw gh...","During the occupation, Kann organized the Scou...","During the occupation, Kann organized the Scou..."


In [122]:
merged_df.iloc[-1]['Rescuing Phrase']

'his wife were held in prison in germany until january 31, 1943, and then deported to auschwitz. mrs. krumme was sent to the gas chambers. an ss-man notified werner of her death, in may 1943, with the following words: “be happy'

In [123]:
merged_df.iloc[-1]['Updated Rescuing Phrase']

'Three months after Hitler’s rise to power, in May 1933, Werner Krumme (b. 1909 in Dortmund) married a Jewish wife. Despite official discrimination and frequent harassment by the Gestapo, Krumme stood by Krumme and refused to divorce a Jewish wife. mrs. Krumme, Renate and Anita Lasker, to flee to unoccupied France. his wife were held in prison in germany until january 31, 1943, and then deported to auschwitz. krumme was sent to the gas chambers. an ss-man notified werner of her death, in may 1943, with the following words: “be happy.  Werner used Werner position to falsify so-called selection lists and to assign Jews to work units with relatively tolerable conditions, which increased the chances of Jews survival. Werner thus added the names of Ernst Krinski and Peter Schwartz to a list of a transport that was leaving Auschwitz to the concentration camp of “Warschau,” where no gassing of prisoners took place. The Jewish pharmacist Strauss was likewise assigned by The Jewish pharmacist S

In [106]:
# Step 2: Function to find and replace 'Rescuing Phrase' in 'STORY TEXT'
def replace_rescuing_phrase(row):
    story_text = row['Story Text']
    rescuing_phrase = row['Rescuing Phrase']
    
    # Find the closest matching sentence in the STORY TEXT using fuzzy matching
    match = process.extractOne(rescuing_phrase, story_text.split('. '), scorer=fuzz.partial_ratio)
    
    # If a match is found, replace it with the rescuing phrase
    if match and match[1] > 80:  # Match score should be high enough, e.g., above 80
        matched_sentence = match[0]
        new_story_text = story_text.replace(matched_sentence, rescuing_phrase)
        return new_story_text
    else:
        return story_text  # If no good match, return the original story text

In [107]:
# Step 3: Apply the function to each row to replace the rescuing phrase
merged_df['Updated STORY TEXT'] = merged_df.apply(replace_rescuing_phrase, axis=1)

In [108]:
merged_df

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Story Text,Updated STORY TEXT
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
1,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
2,1.0,employed,personnel. appointed as the lagerälteste (the ...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
3,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
4,1.0,put himself at risk,"barracks, wörl, against the express orders of ...",Ludwig Wörl (1906-1967) spent 11 years of Ludw...,Ludwig Wörl (1906-1967) spent 11 years of Ludw...
...,...,...,...,...,...
84,29.0,sent,"feiner, the bund representative and adolf berm...","From September 1942, even before the establish...","From September 1942, even before the establish..."
85,29.0,helping,"in december 1942, bartoszewski (whose undercov...","From September 1942, even before the establish...","From September 1942, even before the establish..."
86,29.0,helped,the polish public. he published many articles ...,"From September 1942, even before the establish...","From September 1942, even before the establish..."
87,30.0,sent,"the world looks on”), describing the warsaw gh...","During the occupation, Kann organized the Scou...","During the occupation, Kann organized the Scou..."


### Making Table 3 Starts

In [32]:
# input
# Table1
table_1['sentence'][0]
# check if rescuing verb appear in the sentence. (Lexicon reference - same story ID)
# If there is rescuing verb existed,
# (Also name)
# add this sentence to a new table Table3 

'Ludwig Wörl (1906-1967) spent 11 years of Ludwig Wörl (1906-1967) life as a political prisoner in Nazi concentration camps.  '

In [144]:
#Table3
table_3 = pd.DataFrame(columns = ['Story ID','Rescuing Verb', 'Rescuing_phrases-name'])

for row in tqdm(table_1.itertuples(), total = table_1.shape[0]):
    # storyid = row.storynumber
    # 본 회차 스토리에서 찾아내야하는 verb 목록 불러오고
    verbs_to_find = list(set(table_2[table_2['Story ID'] == row.storynumber]['Rescuing Verb']))
    verbs_to_find += ['Righteous Among the Nations']

    for verb in verbs_to_find:
        if verb in row.sentence:
            new_row = {'Story ID' : row.storynumber, 'Rescuing Verb' : str(verb), 'Rescuing_phrases-name' : row.sentence}
            table_3.loc[len(table_3)] = new_row

100%|██████████| 107695/107695 [01:18<00:00, 1373.60it/s]


In [145]:
table_3.drop_duplicates(inplace = True)

In [146]:
table_3.reset_index(drop = True, inplace = True)

In [147]:
table_3

Unnamed: 0,Story ID,Rescuing Verb,Rescuing_phrases-name
0,1.0,sent,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,saving,Appointed as the Lagerälteste (the camp elder)...
3,1.0,employed,Appointed as the Lagerälteste (the camp elder)...
4,1.0,treatment,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...
...,...,...,...
11731,4481.0,arranged,the children arranged for Regina to move in wi...
11732,4481.0,looked after,"The Kaczmareks made Regina feel at home, and w..."
11733,4481.0,help,"The Kaczmareks made Regina feel at home, and w..."
11734,4481.0,recognized,"On January 17,1990, Yad Vashem recognized Józe..."


In [148]:
table_3.to_csv('./drafts/table_3.xlsx', index = False)

In [80]:
# Table_4 (Merge table_2 and table_3)
# table_4: Complete Lexicon Structure Example
# table_4 = pd.merge(table_2, table_3, how = 'left', on = ['Story ID','Rescuing Verb'])

In [81]:
# table_4
# table_4.to_csv('./drafts/table_4.xlsx', index = False)

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A..."
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...
...,...,...,...,...
31796,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
31797,5257.0,looked after,"Helena looked after the children, obtaining ""A...",
31798,5257.0,sheltered,"Initially, they were only supposed to stay wit...",
31799,5257.0,obtained,Helenapresented the fugitives as her brother’s...,


### 1.2.1.2 Step 2:“Rescuers Names filtering” - Step-by-Step Breakdown

In [149]:
table_4 = table_3.copy()

In [165]:
table_4 = table_3.copy()[:100]

#### 0917 Dependency Parsing.. hmm

In [235]:
table_4.dropna(inplace = True)

In [204]:
table_4.iloc[0]

Story ID                                                               1.0
Rescuing Verb                                                         sent
Rescuing Phrase          his life as a political prisoner in nazi conce...
Rescuing_phrases-name    Ludwig Wörl (1906-1967) was first arrested by ...
Name: 0, dtype: object

In [223]:
table_4.iloc[4]['Rescuing_phrases-name']

'Appointed as the Lagerälteste (the camp elder) of the hospital barracks, Wörl, against the express orders of the SS, employed Jewish doctors, thus saving Jewish doctors from certain death.'

In [206]:
txt = 'Ludwig Wörl (1906-1967) was first arrested by the Gestapo in 1934, and sent to Dachau for distributing a pamphlet in which the citizens of Munich were informed about the horrors of the camp.'


In [207]:
doc = nlp(txt)

In [212]:
for token in doc :
    if token.text.lower() == 'sent':
        print(token.dep_)
        print([token for token in token.children])
        print([token.dep_ for token in token.children])

conj
[to, for]
['dative', 'prep']


In [None]:
# 09-17 !!!!!!!!
# Function to find the rescuer (subject) for each sentence
def find_rescuer(df):
    sentence = df['Rescuing Phrase']
    rescuing_verb = df['Rescuing Verb']
    
    # Skip NaN values
    if pd.isna(sentence):
        return None
    
    # Process sentence with Spacy
    doc = nlp(sentence)
    
    # If the verb is multi-word (like "put himself at risk"), we handle it separately.
    if rescuing_verb in sentence:
        # Find the subject (nsubj) related to the rescuing verb
        for token in doc:
            if rescuing_verb.split()[0] == token.lemma_ or token.text == rescuing_verb.split()[0]:
                # Find the subject (nsubj) linked to the verb
                subject = [child for child in token.children if child.dep_ == "nsubj"]
                if subject:
                    return subject[0].text  # Return the first subject found
    
    return None  # Return None if no subject is found

In [107]:
# 주어진 rescuing verb와 관련된 주어(구조하는 사람) 찾기
def find_rescuer(df):
    # 문장과 rescuing verb
    sentence = df['Rescuing_phrases-name']
    rescuing_verb = df['Rescuing Verb']
    
    # 문장을 Spacy로 파싱
    doc = nlp(sentence)
    
    # 구조하는 사람(주어)을 저장할 리스트
    # rescuers = []
    
    # 문장에서 주어진 rescuing verb의 텍스트와 일치하는 주어 찾기
    # subject = None
    rescuer = None
    for token in doc:
        
        # if token.dep_ == 'nsubj':
        #     subject = token.text

        # token.text가 주어진 rescuing verb와 일치할 경우
        if token.text.lower() == rescuing_verb.lower():  # 대소문자 구분 무시
            # 해당 동사에 연결된 주어 찾기 (nsubj)
            for child in token.children:
                if child.dep_ == "nsubj":
                    rescuer = child.text
    
    # 결과 반환 (찾은 구조하는 사람 또는 None)
    if rescuer:
        return rescuer
    # else:
    #     return subject
    # if rescuers else None

In [108]:
sample_table_4 = table_3[:40]

In [109]:
# 데이터프레임에서 rescuing verb와 관련된 주어(구조하는 사람) 찾기
sample_table_4['Rescuer'] = sample_table_4.apply(find_rescuer, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [110]:
sample_table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing_phrases-name,Rescuer
0,1.0,sent,Ludwig Wörl (1906-1967) was first arrested by ...,
1,1.0,sent,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",
2,1.0,saving,Appointed as the Lagerälteste (the camp elder)...,
3,1.0,employed,Appointed as the Lagerälteste (the camp elder)...,
4,1.0,treatment,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,
5,1.0,obtain,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,
6,1.0,forge,Ludwig Wörl (1906-1967) would forge selection ...,Wörl
7,1.0,treatment,"Here, again, Wörl made Wörl beneficial influen...",
8,1.0,protecting,"Here, again, Wörl made Wörl beneficial influen...",
9,1.0,influence,"Here, again, Wörl made Wörl beneficial influen...",


### 그냥 이걸로 갈까봐... 첫번째 이름 찾기

In [166]:
# 가장 이름이 많이 찾아짐. (but not sure if it's 100% Rescuer)
# Function to find the full name of the rescuer, and split it into first name and last name
def extract_name_info(row):
    sentence = row['Rescuing_phrases-name']  # Adjusted column name
    rescuing_verb = row['Rescuing Verb']    # Adjusted column name
    
    # Skip NaN values
    if pd.isna(sentence):
        return pd.Series([None, None, None])  # Return None for full name, first name, and last name
    
    # Process sentence with Spacy
    doc = nlp(sentence)
    
    # Find the full name of the rescuer using NER
    full_name = None
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            full_name = ent.text  # Get the first PERSON entity (assuming it's the rescuer)
            break
    
    if not full_name:
        return pd.Series([None, None, None])  # Return None if no PERSON entity is found
    
    # Split the full name into first and last name
    name_parts = full_name.split()
    if len(name_parts) > 1:
        last_name = name_parts[-1]
        first_name = " ".join(name_parts[:-1])
    else:
        first_name = full_name
        last_name = None  # If no last name, leave it as None
    
    return pd.Series([full_name, first_name, last_name])

In [167]:
# Apply the function to the DataFrame and store the results in separate columns
table_4[['Full Name', 'First Name', 'Last Name']] = table_4.progress_apply(extract_name_info, axis=1)

100%|██████████| 100/100 [00:02<00:00, 38.18it/s]


In [168]:
# 이상적인 방법. but 잘 안찾아짐.
# Function to find rescuer in the sentence
def find_rescuer(row):
    sentence = row['Rescuing_phrases-name']
    rescue_verb = row['Rescuing Verb']
    doc = nlp(sentence)
    rescuer = None
    
    # Iterate through tokens in the parsed sentence
    for token in doc:
        # Match the rescue verb
        if token.text.lower() == rescue_verb.lower():
            # Find the subject (rescuer) connected to the verb
            for child in token.children:
                if child.dep_ == "nsubj":
                    rescuer = child.text
                    return rescuer
                    # for ent in doc.ents:
                    #     if ent.text == child.text and ent.label_ == 'PERSON' :
                    #         rescuer = child.text
                    #         return rescuer
    return None

In [169]:
table_4['NAME'] = table_4.progress_apply(find_rescuer, axis = 1)

100%|██████████| 100/100 [00:02<00:00, 40.05it/s]


In [176]:
# 가장 베스트 코드 so far 
# Function to find the rescuer: First check for nsubj, if not a PERSON, find the closest PERSON before the verb
def find_rescuer(row):
    sentence = row['Rescuing_phrases-name']
    rescue_verb = row['Rescuing Verb']
    doc = nlp(sentence)
    
    rescuer = None
    verb_token = None
    
    # Step 1: Find the rescue verb and check its nsubj
    for token in doc:
        if token.text.lower() == rescue_verb.lower():
            verb_token = token
            # Check for nsubj (subject of the verb)
            for child in token.children:
                if child.dep_ == "nsubj":
                    # Check if the subject is a PERSON entity
                    for ent in doc.ents:
                        if ent.text == child.text and ent.label_ == "PERSON":
                            rescuer = child.text
                            return rescuer  # If found, return the rescuer immediately
    
    # Step 2: If no PERSON nsubj was found, find the closest PERSON before the verb
    if verb_token:
        closest_rescuer = None
        closest_distance = float('inf')
        
        # Find PERSON entities before the verb
        for ent in doc.ents:
            if ent.label_ == "PERSON" and ent.end <= verb_token.i:
                distance = verb_token.i - ent.end
                if distance < closest_distance:
                    closest_distance = distance
                    closest_rescuer = ent.text
        
        return closest_rescuer
    
    return None  # If no rescuer is found

In [177]:
# Apply the function to the DataFrame
table_4['rescuer2'] = table_4.apply(find_rescuer, axis=1)

In [178]:
table_4[50:]

Unnamed: 0,Story ID,Rescuing Verb,Rescuing_phrases-name,Full Name,First Name,Last Name,NAME,rescuer,rescuer2
50,11.0,arranged,Since immigration to British controlled Palest...,Michael,Michael,,,Michael,Michael
51,11.0,Righteous Among the Nations,"On June 4, 1963, Yad Vashem recognized Gertrud...",Vashem,Vashem,,,,
52,13.0,rescue,From the beginning of 1942 until Edward Chacza...,Edward Chacza,Edward,Chacza,,Edward Chacza,Edward Chacza
53,13.0,shelter,Chacza address was known to Jewish fugitives a...,Chacza,Chacza,,,Chacza,Chacza
54,13.0,Righteous Among the Nations,"On March 24, 1964, Yad Vashem recognized Edwar...",Vashem,Vashem,,,,
55,14.0,discovered,"One night in September 1942, Domna Semenyuk we...",Domna Semenyuk,Domna,Semenyuk,Semenyuk,Domna Semenyuk,Domna Semenyuk
56,14.0,invited,"After some deliberation, Domna Semenyuk invite...",Domna Semenyuk,Domna,Semenyuk,Semenyuk,Domna Semenyuk,Domna Semenyuk
57,14.0,Righteous Among the Nations,"On July 16, 1963, Yad Vashem recognized Domna ...",Vashem,Vashem,,,,
58,14.0,Righteous Among the Nations,"On November 12, 1995, Yad Vashem recognized Na...",Vashem,Vashem,,,,
59,15.0,Righteous Among the Nations,"On June 4, 1963, Yad Vashem recognized Bronisł...",Vashem,Vashem,,,,


In [180]:
table_4.iloc[97]['Rescuing_phrases-name']

'The author, Rachel Auerbach, and Dr. A. Berman published many articles and essays noted for their objectivity and sympathy toward the Jewish people, and helped promote harmony between Poles and Jews.  '

In [115]:
table_5 = table_3[['Story ID','Full Name', 'Last Name', 'First Name', 'Rescuing_phrases-name']]

In [120]:
table_5

Unnamed: 0,Story ID,Full Name,Last Name,First Name,Rescuing_phrases-name
0,1.0,Ludwig Wörl,Wörl,Ludwig,Ludwig Wörl (1906-1967) was first arrested by ...
1,1.0,Ludwig Wörl,Wörl,Ludwig,"In 1942, Ludwig Wörl (1906-1967) was sent to A..."
2,1.0,Lagerälteste,,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
3,1.0,Lagerälteste,,Lagerälteste,Appointed as the Lagerälteste (the camp elder)...
4,1.0,Ludwig Wörl,Wörl,Ludwig,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...
...,...,...,...,...,...
11731,4481.0,Franciszek,,Franciszek,the children arranged for Regina to move in wi...
11732,4481.0,Kaczmareks,,Kaczmareks,"The Kaczmareks made Regina feel at home, and w..."
11733,4481.0,Kaczmareks,,Kaczmareks,"The Kaczmareks made Regina feel at home, and w..."
11734,4481.0,Vashem,,Vashem,"On January 17,1990, Yad Vashem recognized Józe..."


In [134]:
table_3[table_3['Full Name'].isna()].iloc[4]

Story ID                                                              77.0
Rescuing Verb                                                     arranged
Rescuing_phrases-name    Helena took it upon Helena to look after the r...
Full Name                                                             None
First Name                                                            None
Last Name                                                             None
Name: 171, dtype: object

In [142]:
# 분석할 문장
sentence = "Helena took it upon Helena to look after the refugees and when extortionists and informers began harassing the refugees, Helena arranged alternative shelters for the refugees."
rescue_verb = 'arranged'

# 문장을 Spacy로 파싱
# doc = nlp(sentence)

# 'arranged' 동사에 연결된 주어(구조자) 찾기
def find_rescuer(sentence, rescue_verb):
    doc = nlp(sentence)
    rescuer = None
    for token in doc:
        # if token.lemma_ == verb_lemma and token.pos_ == "VERB":
        # token.text가 주어진 rescuing verb와 일치할 경우
        if token.text.lower() == rescue_verb.lower():  # 대소문자 구분 무시
            # 동사에 연결된 주어(nsubj) 찾기
            for child in token.children:
                if child.dep_ == "nsubj":
                    rescuer = child.text
                    
                    print(f"Found rescuer: {rescuer} for verb: {token.text}")
    return rescuer

# 'arranged' 동사의 주어(구조자) 찾기
rescuer = find_rescuer(sentence, rescue_verb)

Found rescuer: Helena for verb: arranged


In [133]:
# out of ,
table_3[table_3['Full Name'].isna()].iloc[4]['Rescuing_phrases-name']

'Helena took it upon Helena to look after the refugees and when extortionists and informers began harassing the refugees, Helena arranged alternative shelters for the refugees.'

In [117]:
table_5.to_csv('./output/3415_SPACY_All_Rescuers.xlsx', index = False)

### Rescuing DATE

In [164]:
# Function to find the full name of the rescuer, first name, last name, and the rescued date
def extract_name_and_date(row):
    sentence = row['Rescuing_phrases-name']  # Adjusted column name
    rescuing_verb = row['Rescuing Verb']    # Adjusted column name
    
    # Skip NaN values
    if pd.isna(sentence):
        return pd.Series([None, None, None, None])  # Return None for all if sentence is NaN
    
    # Process sentence with Spacy
    doc = nlp(sentence)
    
    # Extract full name (PERSON) and rescued date (DATE)
    full_name = None
    rescued_date = None
    for ent in doc.ents:
        if ent.label_ == "PERSON" and full_name is None:  # Get the first PERSON entity
            full_name = ent.text
        if ent.label_ == "DATE" and rescued_date is None:  # Get the first DATE entity
            rescued_date = ent.text

    if not full_name:
        return pd.Series([None, None, None, rescued_date])  # Return None for names if no PERSON found
    
    # Split the full name into first name and last name
    name_parts = full_name.split()
    if len(name_parts) > 1:
        last_name = name_parts[-1]
        first_name = " ".join(name_parts[:-1])
    else:
        first_name = full_name
        last_name = None  # If no last name, leave it as None
    
    return pd.Series([full_name, first_name, last_name, rescued_date])

In [165]:
# Apply the function to the DataFrame and store the results in separate columns
sample_table_4[['Full Name', 'First Name', 'Last Name', 'Rescued Date']] = sample_table_4.apply(extract_name_and_date, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [166]:
sample_table_4

Unnamed: 0,Story ID,Rescuing Verb,Rescuing Phrase,Rescuing_phrases-name,Full Name,First Name,Last Name,Rescued Date
0,1.0,sent,his life as a political prisoner in nazi conce...,Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl,1906-1967
1,1.0,sent,his life as a political prisoner in nazi conce...,"In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl,1942
2,1.0,sent,"to the camp’s joinery and later, as a trained ...",Ludwig Wörl (1906-1967) was first arrested by ...,Ludwig Wörl,Ludwig,Wörl,1906-1967
3,1.0,sent,"to the camp’s joinery and later, as a trained ...","In 1942, Ludwig Wörl (1906-1967) was sent to A...",Ludwig Wörl,Ludwig,Wörl,1942
4,1.0,employed,personnel. appointed as the lagerälteste (the ...,Appointed as the Lagerälteste (the camp elder)...,Lagerälteste,Lagerälteste,,
6,1.0,saving,lagerälteste (the camp elder) of the hospital ...,Appointed as the Lagerälteste (the camp elder)...,Lagerälteste,Lagerälteste,,
8,1.0,put himself at risk,"barracks, wörl, against the express orders of ...",,,,,
9,1.0,obtain,"the ss, employed jewish doctors, thus saving t...",Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl,1906-1967
11,1.0,treatment,himself at risk in order to obtain at least a ...,Ludwig Wörl (1906-1967) also put Ludwig Wörl (...,Ludwig Wörl,Ludwig,Wörl,1906-1967
13,1.0,treatment,himself at risk in order to obtain at least a ...,"Here, again, Wörl made Wörl beneficial influen...",Wörl,Wörl,,


In [None]:
table_6.to_csv('3415_SPACY_All_Rescuers_Dates.xlsx', index = False)

In [94]:
# 문장 처리
# doc = nlp(sentence)

# 사람(Entity) 추출
people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
print("Detected people:", people)

# 의존성 파싱으로 주어-동사 관계 분석
for token in doc:
    if token.dep_ == "nsubj" and token.head.lemma_ == "help":
        print(f"Subject: {token.text}, Verb: {token.head.text}")

Detected people: ['Vashem', 'Werner Krumme']


In [None]:
3415_SPACY_All_Rescuers.XLS

In [None]:
output_file_name = '3415_SPACY_All_Names.xlsx'
output.to_csv(f'./output/{output_file_name}')