# Step 1: Install Necessary Libraries

In [1]:
!pip install -r requirements.txt

Collecting spacy (from -r requirements.txt (line 9))
  Using cached spacy-3.8.2-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting pyvis (from -r requirements.txt (line 11))
  Using cached pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting python-igraph (from -r requirements.txt (line 12))
  Using cached python_igraph-0.11.8-py3-none-any.whl.metadata (2.8 kB)
Collecting leidenalg (from -r requirements.txt (line 13))
  Using cached leidenalg-0.10.2-cp38-abi3-win_amd64.whl.metadata (10 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy->-r requirements.txt (line 9))
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy->-r requirements.txt (line 9))
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy->-r requirements.txt (line 9))
  Using cached murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (f

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.59.1 requires numpy<1.27,>=1.22, but you have numpy 2.0.2 which is incompatible.
pywavelets 1.5.0 requires numpy<2.0,>=1.22.4, but you have numpy 2.0.2 which is incompatible.
streamlit 1.32.0 requires numpy<2,>=1.19.3, but you have numpy 2.0.2 which is incompatible.


# Step 2: Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import datetime as dt
import os
import glob
import re

# Step 3: Load the Text File

In [3]:
# Load the text file
with open('key_events_20th_century.txt', 'r') as file:
    text = file.read()

# Step 4: Evaluate and Clean the Text

In [4]:
# Display the first 1000 characters for a quick check
print(text[:1000])

Jump to content
Main menu
Search
Donate
Appearance
Create account
Log in
Personal tools
Key Events of the 20th Century
Add languages
Article
Talk
Tools
From Wikipedia, the free encyclopedia
Look for Key Events of the 20th Century on one of Wikipedia's sister projects:
Wiktionary (dictionary)
Wikibooks (textbooks)
Wikiquote (quotations)
Wikisource (library)
Wikiversity (learning resources)
Commons (media)
Wikivoyage (travel guide)
Wikinews (news source)
Wikidata (linked database)
Wikispecies (species directory)
Wikipedia does not have an article with this exact name. Please search for Key Events of the 20th Century in Wikipedia to check for alternative titles or spellings.
You need to log in or create an account and be autoconfirmed to create new articles. Alternatively, you can use the article wizard to submit a draft for review, or request a new article.
Search for "Key Events of the 20th Century" in existing articles.
Look for pages within Wikipedia that link to this title.
Other rea

# Text Cleaning Example:

In [5]:
# Replace unusual characters
text = text.replace("—", "-").replace("‘", "'").replace("’", "'")

# Save the cleaned text to a new file

In [6]:
# Save the cleaned text to a new file
with open('twentieth-century-cleaned.txt', 'w') as file:
    file.write(text)

# Step 5: Create a Named Entity Recognition (NER) Object

In [10]:
# Load SpaCy's English NER model
nlp = spacy.load("en_core_web_sm")

In [11]:
# Create the NER object
doc = nlp(text)

# Step 6: Extract Sentences and Entities

In [12]:
# List to store sentences and entities
sentences_entities = []

# Iterate through sentences in the NER document
for sentence in doc.sents:
    # Extract entities in each sentence
    entities = [(ent.text, ent.label_) for ent in sentence.ents]
    sentences_entities.append((sentence.text, entities))

# Display the first few sentences with their entities
print(sentences_entities[:5])

[("Jump to content\nMain menu\nSearch\nDonate\nAppearance\nCreate account\nLog in\nPersonal tools\nKey Events of the 20th Century\nAdd languages\nArticle\nTalk\nTools\nFrom Wikipedia, the free encyclopedia\nLook for Key Events of the 20th Century on one of Wikipedia's sister projects:\n", [('the 20th Century\nAdd', 'DATE'), ('Wikipedia', 'ORG'), ('Look for Key Events of', 'WORK_OF_ART'), ('the 20th Century', 'DATE'), ('one', 'CARDINAL'), ('Wikipedia', 'ORG')]), ('Wiktionary (dictionary)\nWikibooks (textbooks)\nWikiquote (quotations)\nWikisource (library)\nWikiversity (learning resources)\nCommons (media)\nWikivoyage (travel guide)\nWikinews (news source)\nWikidata (linked database)\nWikispecies (species directory)\nWikipedia does not have an article with this exact name.', [('Wikivoyage', 'PERSON'), ('Wikipedia', 'ORG')]), ('Please search for Key Events of the 20th Century in Wikipedia to check for alternative titles or spellings.\n', [('Key Events', 'ORG'), ('the 20th Century', 'DATE'

# Step 7: Entity Extraction and Filtering

In [15]:
# Define a list of countries
countries_list = ['United States', 'Canada', 'UK', 'France', 'Germany', 'India', 'China', 'Japan', 'Russia', 'Italy']

# Normalize country names (e.g., "USA" -> "United States")
country_name_map = {
    'USA': 'United States',
    'U.S.A.': 'United States',
    'UK': 'United Kingdom',
    'Republic of Korea': 'South Korea',
    # Add more mappings as necessary
}

# Filter the sentences_entities to include only sentences with countries from the list
filtered_sentences = []

for sentence, entities in sentences_entities:
    filtered_entities = [ent for ent, label in entities if ent in countries_list or ent in country_name_map.keys()]
    # Replace any mapped country names with their normalized version
    normalized_entities = [country_name_map.get(ent, ent) for ent in filtered_entities]
    
    if normalized_entities:  # If any entity matches the countries list, include it
        filtered_sentences.append((sentence, normalized_entities))

# Display the first few filtered sentences with entities
print(filtered_sentences[:5])

[]


# 8. Create the Relationships DataFrame

In [16]:
relationships = []

for sentence, entities in filtered_sentences:
    if len(entities) > 1:  # Only create relationships if there are at least 2 entities
        for i in range(len(entities) - 1):
            relationships.append({'Entity1': entities[i], 'Entity2': entities[i + 1], 'Sentence': sentence})

# Create a DataFrame from the relationships list
relationships_df = pd.DataFrame(relationships)

# Display the first few rows of the relationships DataFrame
print(relationships_df.head())

Empty DataFrame
Columns: []
Index: []


# 9. Save and Export Your DataFrame

In [17]:
# Save the relationships DataFrame to a CSV file
relationships_df.to_csv('relationships.csv', index=False)
print("Relationships DataFrame saved to 'relationships.csv'")

# Optionally, save the filtered sentences with entities to a CSV file
filtered_sentences_df = pd.DataFrame(filtered_sentences, columns=['Sentence', 'Entities'])
filtered_sentences_df.to_csv('filtered_sentences.csv', index=False)
print("Filtered sentences DataFrame saved to 'filtered_sentences.csv'")

Relationships DataFrame saved to 'relationships.csv'
Filtered sentences DataFrame saved to 'filtered_sentences.csv'
