In [1]:
### IMPORTING LIBRARIES

In [2]:
import pandas as pd
import spacy
from spacy import displacy
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

 Load and Prepare Text

In [4]:
with open(r"C:\Users\henry\OneDrive\20th-century\Key_Events_20th_Century.txt", "r", encoding="utf-8", errors="ignore") as file:
    text = file.read().replace("\n", " ")


Evaluate and Clean the Text

In [5]:
# Display the first 500 characters
print(text[:500])


  Key events of the 20th century - Wikipedia                            Jump to content        Main menu      Main menu move to sidebar hide    		Navigation 	   Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us      		Contribute 	   HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages                    Search            Search                       Appearance      Appearance move to sidebar hide  TextSmallStandardLargeThis page always uses small font size


In [6]:
# Create the NER Object
doc = nlp(text)

In [7]:
# Visualize Sample Entities
displacy.render(doc[300:1000], style="ent", jupyter=True)

In [8]:
# Extract Entities per Sentence
sent_data = []
for sent in doc.sents:
    ents = [ent.text for ent in sent.ents]
    sent_data.append({"sentence": sent.text, "entities": ents})

df_sentences = pd.DataFrame(sent_data)
df_sentences.head()

Unnamed: 0,sentence,entities
0,Key events of the 20th century - Wikipedia ...,[the 20th century]
1,Main menu Main menu move to sidebar hide ...,"[Navigation \t Main, Contribute, HelpLearn]"
2,Search Search ...,[]
3,Color (beta)AutomaticLightDarkThis page is alw...,[]
4,Donate Create account Log in Persona...,[Log in Personal]


In [9]:
# Define Country List
countries = [
    "United States", "Germany", "France", "Russia", "United Kingdom", "Japan",
    "Italy", "China", "India", "Canada", "Austria", "Korea", "Vietnam",
    "Poland", "Spain", "Cuba", "Turkey", "Iraq", "Iran", "Afghanistan"
]

In [10]:
# Filter Sentences for Country Mentions
def filter_entities(ent_list, country_list):
    return [ent for ent in ent_list if ent in country_list]

df_sentences["country_entities"] = df_sentences["entities"].apply(lambda x: filter_entities(x, countries))

In [11]:
# Keep Only Sentences with Countries
df_filtered = df_sentences[df_sentences["country_entities"].map(len) > 0]
df_filtered.head()

Unnamed: 0,sentence,entities,country_entities
32,After a period of diplomatic and military esca...,"[the July Crisis, the end of July 1914, Britis...",[France]
33,"In 1917, Russia ended hostile actions against ...","[1917, Russia, the Central Powers, Tsar]",[Russia]
34,The Bolsheviks negotiated the Treaty of Brest-...,"[the Treaty of Brest-Litovsk, Germany, Russia]","[Germany, Russia]"
35,"In the treaty, Bolshevik Russia ceded the Balt...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
36,It also recognized the independence of Ukraine...,"[Germany, Allied, American, 1918.[4]",[Germany]


In [13]:
# Simplify country names
df_filtered["country_entities"] = df_filtered["country_entities"].apply(lambda x: [item.split()[0] for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["country_entities"] = df_filtered["country_entities"].apply(lambda x: [item.split()[0] for item in x])


In [14]:
# Create Relationship Pair
relationships = []
for i in range(df_filtered.index[-1]):
    end_i = min(i + 5, df_filtered.index[-1])
    entity_window = sum(df_filtered.loc[i:end_i].country_entities, [])
    unique_entities = [entity_window[i] for i in range(len(entity_window)) if i == 0 or entity_window[i] != entity_window[i-1]]

    if len(unique_entities) > 1:
        for idx, a in enumerate(unique_entities[:-1]):
            b = unique_entities[idx + 1]
            relationships.append({"source": a, "target": b})

In [15]:
rel_df = pd.DataFrame(relationships)
rel_df = pd.DataFrame(np.sort(rel_df.values, axis=1), columns=rel_df.columns)
rel_df["value"] = 1
rel_df = rel_df.groupby(["source", "target"], sort=False, as_index=False).sum()
rel_df.to_csv("C:/Users/henry/OneDrive/20th-century/country_relationships.csv", index=False)