# World Events Dataset Preprocessing

## imports

In [11]:
#libraries to handle rdf file format
%pip install numpy pandas rdflib tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
import numpy as np
import pandas as pd

import re
from pathlib import Path

from rdflib import Graph, Namespace
from rdflib.namespace import RDFS

# rdflib namespace for Dublin Core Terms IRIs; the description predicate matched
# later in this notebook ("http://purl.org/dc/terms/description") lives under this prefix.
DCT = Namespace("http://purl.org/dc/terms/")


# Notebook display settings: show up to 50 columns and use a 120-character display width.
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)

## load data

In [None]:
def clean_nt_file(src_path, dst_path):
    """Copy the well-formed N-Triples lines of ``src_path`` into ``dst_path``.

    A line is kept when it looks like ``<subject> <predicate> <object> .``.
    Blank lines and ``#`` comment lines are dropped silently (they count as
    neither kept nor malformed); every other non-matching line is counted as
    malformed and dropped. A one-line summary is printed at the end.

    Args:
        src_path: Location (str or Path) of the raw ``.nt`` file, read as UTF-8.
        dst_path: Location (str or Path) of the cleaned file; it is overwritten
            if it already exists.
    """
    src_path = Path(src_path)
    dst_path = Path(dst_path)

    # nt file format: <subject> <predicate> <object> .
    # The trailing " ." is part of the pattern, so no separate line-ending check is needed.
    triple_pat = re.compile(r'^<[^>]+>\s+<[^>]+>\s+.+\s\.\s*$')

    kept = 0
    bad = 0

    with src_path.open("r", encoding="utf-8") as fin, dst_path.open("w", encoding="utf-8") as fout:
        for line in fin:
            stripped = line.strip()

            # skip empty lines + comment lines
            if not stripped or stripped.startswith("#"):
                continue

            if not triple_pat.match(stripped):
                bad += 1
                continue

            # Write the original line (not the stripped copy) so kept lines stay untouched.
            fout.write(line)
            kept += 1

    # Report the number of lines actually written, not "everything that was not malformed".
    print(f"{src_path.name}: kept {kept} lines, skipped {bad} malformed lines")


In [14]:
# Directory holding the raw N-Triples dumps and their cleaned copies.
data_dir = Path("./data")

# Raw input files.
events_path = data_dir / "events.nt"
first_sent_path = data_dir / "events_first_sentences.nt"
relations_path = data_dir / "relations_events_base.nt"
labels_path = data_dir / "preferred_labels.nt"

# Cleaned output files, written next to the raw ones.
events_clean = data_dir / "events_clean.nt"
first_sent_clean = data_dir / "events_first_sentences_clean.nt"
relations_clean = data_dir / "relations_events_base_clean.nt"
labels_clean = data_dir / "preferred_labels_clean.nt"

# Clean every raw dump into its *_clean.nt counterpart (same order as listed above).
for raw_nt, clean_nt in (
    (events_path, events_clean),
    (first_sent_path, first_sent_clean),
    (relations_path, relations_clean),
    (labels_path, labels_clean),
):
    clean_nt_file(raw_nt, clean_nt)


events.nt: kept 8814753 lines, skipped 0 malformed lines
events_first_sentences.nt: kept 1251819 lines, skipped 0 malformed lines
relations_events_base.nt: kept 4928718 lines, skipped 0 malformed lines


KeyboardInterrupt: 

In [None]:
# Parse the cleaned first-sentence triples into an in-memory rdflib graph.
g_desc = Graph()
g_desc.parse(source=str(first_sent_clean), format="nt")

# Number of triples loaded (shown as the cell output).
len(g_desc)


1251819

In [None]:
# Map each event IRI to its English description. Only literals tagged "en" are
# kept, since the site is tailored for an English-speaking audience; if an IRI
# has several English literals, the last one iterated wins.
event_desc = {
    str(subj): str(obj)
    for subj, _pred, obj in g_desc
    if getattr(obj, "language", None) == "en"
}

len(event_desc)


371221

In [None]:
# Sanity check: peek at the first five (event IRI, description) pairs.
list(event_desc.items())[:5]

[('https://eventkg.l3s.uni-hannover.de/resource/event_148186',
  "The 2002 Women's British Open was held 8–11 August at the Ailsa Course at Turnberry Golf Club in South Ayrshire, Scotland."),
 ('https://eventkg.l3s.uni-hannover.de/resource/event_1011202',
  'Local elections were held in Baringo County on 4 March 2013 to elect a Governor and County Assembly.'),
 ('https://eventkg.l3s.uni-hannover.de/resource/event_2794267',
  'United Nations Security Council resolution 744, adopted without a vote on 25 February 1992, after examining the application of the Republic of San Marino for membership in the United Nations, the Council recommended to the General Assembly that San Marino be admitted.'),
 ('https://eventkg.l3s.uni-hannover.de/resource/event_1838594',
  'The Guldbagge for Best Cinematography is a Swedish film award presented annually by the Swedish Film Institute as part of the Guldbagge Awards to cinematographers working in the Swedish motion picture industry.'),
 ('https://eventk

In [None]:
# Build the HistoricalEvents table straight from the cleaned .nt files using
# line-level regexes (one sequential pass per file).

# regex helpers for weird bracketed urls in nt files
# <s> <p> "text"@lang .
literal_pat = re.compile(
    r'^<([^>]+)>\s+<([^>]+)>\s+"(.*)"@([a-zA-Z\-]+)\s+\.\s*$'
)

# <s> <p> "2020-09-27"^^<...> .
date_pat = re.compile(
    r'^<([^>]+)>\s+<([^>]+)>\s+"([^"]+)"\^\^<[^>]+>\s+\.\s*$'
)

# N-Triples string escapes: \uXXXX, \UXXXXXXXX and the single-character escapes.
_nt_escape_pat = re.compile(r'\\(?:u([0-9A-Fa-f]{4})|U([0-9A-Fa-f]{8})|([tbnrf"\'\\]))')
_nt_echar = {
    "t": "\t", "b": "\b", "n": "\n", "r": "\r", "f": "\f",
    '"': '"', "'": "'", "\\": "\\",
}


def _unescape_nt_literal(text):
    """Decode N-Triples escape sequences in the raw text captured by ``literal_pat``.

    The regex captures the literal exactly as written in the file, so quotes,
    backslashes and non-ASCII characters may still be in their escaped form.
    Escapes that do not denote a valid character (surrogates, code points above
    U+10FFFF) are left as written rather than raising.
    """
    if "\\" not in text:
        return text  # fast path: nothing to decode

    def _decode(match):
        hex4, hex8, echar = match.groups()
        if echar is not None:
            return _nt_echar[echar]
        codepoint = int(hex4 or hex8, 16)
        if codepoint > 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
            return match.group(0)
        return chr(codepoint)

    return _nt_escape_pat.sub(_decode, text)


def _read_lang_literals(nt_path, predicate, key_col, value_col, lang="en"):
    """Collect language-tagged literals of one predicate from an .nt file.

    Args:
        nt_path: Path object of the N-Triples file (read as UTF-8).
        predicate: Full predicate IRI to keep (without angle brackets).
        key_col: Name of the output key that receives the subject IRI.
        value_col: Name of the output key that receives the decoded literal text.
        lang: Language tag a literal must carry exactly to be kept.

    Returns:
        A list of ``{key_col: subject, value_col: text}`` dicts in file order.
        Lines that do not match ``literal_pat`` are ignored.
    """
    rows = []
    with nt_path.open("r", encoding="utf-8") as f:
        for line in f:
            m = literal_pat.match(line.strip())
            if not m:
                continue
            subj, pred, text, literal_lang = m.groups()
            if pred == predicate and literal_lang == lang:
                rows.append({key_col: subj, value_col: _unescape_nt_literal(text)})
    return rows


# Descriptions from events_first_sentences_clean.nt
first_sent_rows = _read_lang_literals(
    first_sent_clean, "http://purl.org/dc/terms/description", "EventIRI", "Description"
)

# Explicit columns keep the frame mergeable even when no row matched.
event_desc = pd.DataFrame(first_sent_rows, columns=["EventIRI", "Description"])
print("event_desc rows:", len(event_desc))
display(event_desc.head())

# Event titles from events_clean.nt
title_rows = _read_lang_literals(
    events_clean, "http://www.w3.org/2000/01/rdf-schema#label", "EventIRI", "Title"
)

event_labels = pd.DataFrame(title_rows, columns=["EventIRI", "Title"])
print("event_labels rows:", len(event_labels))
display(event_labels.head())

# labels from preferred_labels_clean.nt
label_rows = _read_lang_literals(
    labels_clean, "http://www.w3.org/2004/02/skos/core#prefLabel", "ResourceIRI", "PrefLabel"
)

pref_labels = pd.DataFrame(label_rows, columns=["ResourceIRI", "PrefLabel"])
print("pref_labels rows:", len(pref_labels))
display(pref_labels.head())

# dates, start and end from relations_events_base_clean.nt
start_rows = []
end_rows   = []

# predicate IRI -> (row list to append to, name of the date column)
timestamp_targets = {
    "http://semanticweb.cs.vu.nl/2009/11/sem/hasBeginTimeStamp": (start_rows, "StartDate"),
    "http://semanticweb.cs.vu.nl/2009/11/sem/hasEndTimeStamp": (end_rows, "EndDate"),
}

with relations_clean.open("r", encoding="utf-8") as f:
    for line in f:
        m = date_pat.match(line.strip())
        if not m:
            continue
        subj, pred, date_str = m.groups()
        target = timestamp_targets.get(pred)
        if target is not None:
            rows, date_col = target
            rows.append({"EventIRI": subj, date_col: date_str})

event_start = pd.DataFrame(start_rows, columns=["EventIRI", "StartDate"])
event_end   = pd.DataFrame(end_rows, columns=["EventIRI", "EndDate"])

print("event_start rows:", len(event_start))
print("event_end rows:", len(event_end))
display(event_start.head())
display(event_end.head())

# join into single HistoricalEvents table
# NOTE(review): these are plain left joins on EventIRI, so an event with several
# English labels (or several start/end timestamps) produces several rows, each
# of which gets its own EventID below - confirm that this is intended.
events = (
    event_labels
    .merge(event_desc, on="EventIRI", how="left")
    .merge(event_start, on="EventIRI", how="left")
    .merge(event_end, on="EventIRI", how="left")
)

# Add integer EventID (1-based, one per output row)
events["EventID"] = range(1, len(events) + 1)

# Convert to datetime; errors="coerce" turns every value pandas cannot represent
# as a nanosecond Timestamp into NaT.
# NOTE(review): that covers not only far-future years but also anything before
# roughly 1677, so older events lose their dates here - confirm this is acceptable.
events["StartDate"] = pd.to_datetime(events["StartDate"], errors="coerce")
events["EndDate"]   = pd.to_datetime(events["EndDate"], errors="coerce")
events["StartYear"] = events["StartDate"].dt.year
events["EndYear"]   = events["EndDate"].dt.year

print("HistoricalEvents shape:", events.shape)
display(events.head())

# save to csv files
events.to_csv("HistoricalEvents.csv", index=False)
event_desc.to_csv("EventDescriptions.csv", index=False)
event_labels.to_csv("EventLabels.csv", index=False)
pref_labels.to_csv("PreferredLabels.csv", index=False)

print("intermediate csvs created!")


Using files:
  descriptions: data/events_first_sentences_clean.nt
  events: data/events.nt
  pref labels: data/preferred_labels_clean.nt
  relations: data/relations_events_base_clean.nt
event_desc rows: 371221


Unnamed: 0,EventIRI,Description
0,https://eventkg.l3s.uni-hannover.de/resource/e...,Lindsay Davenport and Corina Morariu were the ...
1,https://eventkg.l3s.uni-hannover.de/resource/e...,The 2019 China Open was a badminton tournament...
2,https://eventkg.l3s.uni-hannover.de/resource/e...,"Rehaif v. United States, 588 U.S. ___, was a c..."
3,https://eventkg.l3s.uni-hannover.de/resource/e...,The 2019 Ken Galluccio Cup will be the 11th ed...
4,https://eventkg.l3s.uni-hannover.de/resource/e...,"The 2019 Danish Handball Cup, known as Santand..."


event_labels rows: 910482


Unnamed: 0,EventIRI,Title
0,https://eventkg.l3s.uni-hannover.de/resource/e...,2018 New Zealand First Party deputy leadership...
1,https://eventkg.l3s.uni-hannover.de/resource/e...,Happiness
2,https://eventkg.l3s.uni-hannover.de/resource/e...,happiness
3,https://eventkg.l3s.uni-hannover.de/resource/e...,Data Science Day Berlin
4,https://eventkg.l3s.uni-hannover.de/resource/e...,Triskaidekaphobia


pref_labels rows: 21045267


Unnamed: 0,ResourceIRI,PrefLabel
0,https://eventkg.l3s.uni-hannover.de/resource/e...,Battle of Tulagi and Gavutu–Tanambogo
1,https://eventkg.l3s.uni-hannover.de/resource/e...,"South Australian state election in Black, 2018"
2,https://eventkg.l3s.uni-hannover.de/resource/e...,Ech Du
3,https://eventkg.l3s.uni-hannover.de/resource/e...,2021 African Minifootball Cup
4,https://eventkg.l3s.uni-hannover.de/resource/e...,2008 Taiwanese United Nations membership refer...


event_start rows: 720325
event_end rows: 622970


Unnamed: 0,EventIRI,StartDate
0,https://eventkg.l3s.uni-hannover.de/resource/e...,2020-09-27
1,https://eventkg.l3s.uni-hannover.de/resource/e...,2021-01-24
2,https://eventkg.l3s.uni-hannover.de/resource/e...,2020-11-03
3,https://eventkg.l3s.uni-hannover.de/resource/e...,2021-01-01
4,https://eventkg.l3s.uni-hannover.de/resource/e...,2008-05-06


Unnamed: 0,EventIRI,EndDate
0,https://eventkg.l3s.uni-hannover.de/resource/e...,2021-05-30
1,https://eventkg.l3s.uni-hannover.de/resource/e...,2021-01-24
2,https://eventkg.l3s.uni-hannover.de/resource/e...,2008-05-06
3,https://eventkg.l3s.uni-hannover.de/resource/e...,2021-10-18
4,https://eventkg.l3s.uni-hannover.de/resource/e...,2021-06-26


HistoricalEvents shape: (910482, 8)


Unnamed: 0,EventIRI,Title,Description,StartDate,EndDate,EventID,StartYear,EndYear
0,https://eventkg.l3s.uni-hannover.de/resource/e...,2018 New Zealand First Party deputy leadership...,An election for the parliamentary Deputy leade...,2018-02-27,2018-02-27,1,2018.0,2018.0
1,https://eventkg.l3s.uni-hannover.de/resource/e...,Happiness,"Happiness, in the context of mental or emotion...",NaT,NaT,2,,
2,https://eventkg.l3s.uni-hannover.de/resource/e...,happiness,"Happiness, in the context of mental or emotion...",NaT,NaT,3,,
3,https://eventkg.l3s.uni-hannover.de/resource/e...,Data Science Day Berlin,,2012-05-01,NaT,4,2012.0,
4,https://eventkg.l3s.uni-hannover.de/resource/e...,Triskaidekaphobia,Triskaidekaphobia is fear or avoidance of the ...,NaT,NaT,5,,


intermediate csvs created!


## format for final csv

In [17]:
# Reload the merged table written by the extraction step.
events_raw = pd.read_csv("HistoricalEvents.csv")
print("Raw HistoricalEvents.csv:")
display(events_raw.head())

# Work on a copy restricted to the columns the SQL table needs.
needed_cols = ["EventID", "Title", "Description", "StartDate", "EndDate"]
events = events_raw.loc[:, needed_cols].copy()

# Normalise both date columns to plain dates (YYYY-MM-DD, the MySQL DATE format);
# errors="coerce" maps values that cannot be parsed to NaT.
for date_col in ("StartDate", "EndDate"):
    events[date_col] = pd.to_datetime(events[date_col], errors="coerce").dt.date

# NOTE: no location column appears to exist in the EventKG data (unconfirmed),
# so none is emitted here.
# Reorder the columns for the SQL table and keep only rows that have both dates.
sql_cols = ["EventID", "Title", "StartDate", "EndDate", "Description"]
historical_events_sql = (
    events.loc[:, sql_cols]
    .dropna(subset=["StartDate", "EndDate"])
    .reset_index(drop=True)
)

print("Formatted HistoricalEvents table (for SQL):")
display(historical_events_sql.head())

# Write the SQL-ready table to its own csv.
historical_events_sql.to_csv("historical_events.csv", index=False)
print("finished!")


Raw HistoricalEvents.csv:


Unnamed: 0,EventIRI,Title,Description,StartDate,EndDate,EventID,StartYear,EndYear
0,https://eventkg.l3s.uni-hannover.de/resource/e...,2018 New Zealand First Party deputy leadership...,An election for the parliamentary Deputy leade...,2018-02-27,2018-02-27,1,2018.0,2018.0
1,https://eventkg.l3s.uni-hannover.de/resource/e...,Happiness,"Happiness, in the context of mental or emotion...",,,2,,
2,https://eventkg.l3s.uni-hannover.de/resource/e...,happiness,"Happiness, in the context of mental or emotion...",,,3,,
3,https://eventkg.l3s.uni-hannover.de/resource/e...,Data Science Day Berlin,,2012-05-01,,4,2012.0,
4,https://eventkg.l3s.uni-hannover.de/resource/e...,Triskaidekaphobia,Triskaidekaphobia is fear or avoidance of the ...,,,5,,


Formatted HistoricalEvents table (for SQL):


Unnamed: 0,EventID,Title,StartDate,EndDate,Description
0,1,2018 New Zealand First Party deputy leadership...,2018-02-27,2018-02-27,An election for the parliamentary Deputy leade...
1,7,Cave Creek Complex Wildfire,2005-01-01,2005-12-31,The Cave Creek Complex Wildfire was the third ...
2,8,Cave Creek Complex fire,2005-01-01,2005-12-31,The Cave Creek Complex Wildfire was the third ...
3,9,United Nations Security Council Resolution 1994,2011-06-30,2011-12-31,United Nations Security Council Resolution 199...
4,10,World Youth Day 2000,2000-08-15,2000-08-20,World Youth Day 2000 was a Catholic youth fest...


finished!


## adding location information (no location data was found in the original dataset)

In [None]:
#try to extract location information from name or description (find country names)
%pip install pycountry pandas
import pandas as pd
import pycountry
country_names = [country.name for country in pycountry.countries]
print(country_names[:10]) 
location_list = []
numLocations = 0
historical_events_raw = pd.read_csv("historical_events.csv")
historical_events_raw['Title'] = historical_events_raw['Title'].fillna('No Title Provided')
historical_events_raw['Description'] = historical_events_raw['Description'].fillna('No Description Provided')
for index, row in historical_events_raw.iterrows():
    title = row['Title']
    description = row['Description']
    location_found = 'Location Unknown'
    
    for country in country_names:
        if isinstance(country, str) and (country in title or country in description):
            #check if country is a string
            numLocations += 1
            location_found = country
            break
    
    location_list.append(location_found)
historical_events_raw['Location'] = location_list
display(historical_events_raw.head())
print(f"Number of events with locations: {numLocations}")
historical_events_raw.to_csv("historical_events_with_location.csv", index=False)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
['Aruba', 'Afghanistan', 'Angola', 'Anguilla', 'Åland Islands', 'Albania', 'Andorra', 'United Arab Emirates', 'Argentina', 'Armenia']


Unnamed: 0,EventID,Title,StartDate,EndDate,Description,Location
0,1,2018 New Zealand First Party deputy leadership...,2018-02-27,2018-02-27,An election for the parliamentary Deputy leade...,New Zealand
1,7,Cave Creek Complex Wildfire,2005-01-01,2005-12-31,The Cave Creek Complex Wildfire was the third ...,
2,8,Cave Creek Complex fire,2005-01-01,2005-12-31,The Cave Creek Complex Wildfire was the third ...,
3,9,United Nations Security Council Resolution 1994,2011-06-30,2011-12-31,United Nations Security Council Resolution 199...,
4,10,World Youth Day 2000,2000-08-15,2000-08-20,World Youth Day 2000 was a Catholic youth fest...,Italy


Number of events with locations: 179444


In [10]:
# Sanity check: number of missing values per column after the location step.
null_counts = historical_events_raw.isna().sum()
print("Null counts in each column:")
print(null_counts)

Null counts in each column:
EventID             0
Title               0
StartDate           0
EndDate             0
Description         0
Location       401126
dtype: int64
