In [6]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 882.6 kB/s eta 0:00:14
     - ------------------------------------- 0.5/12.8 MB 882.6 kB/s eta 0:00:14
     -- ------------------------------------ 0.8/12.8 MB 798.0 kB/s eta 0:00:16
     --- ----------------------------------- 1.0/12.8 MB 882.6 kB/s eta 0:00:14
     --- ----------------------------------- 1.0/12.8 MB 882.6 kB/s eta 0:00:14
     --- ----------------------------------- 1.3/12.8 MB 780.2 kB/s eta 0:00:15
     ---- ---------------------------------- 1.6/12.8 MB 814.1 kB/s eta 0:00:14
     ---- ---------------------------------- 1.6

In [9]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

In [10]:
# Load the book

with open('20th_Century.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

book = NER(data)

In [11]:
# Visualize identified entities

displacy.render(book[273:500], style = "ent", jupyter = True)

In [17]:
df_sentences = []

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [18]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , Key, events, of, the, 20th, century, -,...",[the 20th century -]
1,"(articleAbout, WikipediaContact, us, \t\t...",[Search Search ...
2,"(1.1.2, Russian, Revolution, and, communism, ...","[1.2, 1.2.1 Economic depression , 1.3,..."
3,"(1.3.1, The, war, in, Europe, , 1.3.2,...","[1.3.1, Europe, 1.3.2, Blitzkrieg 1.3.3]"
4,"(Operation, Overlord, , 1.3.6, Final, ...","[the Pacific , 1.3.7.1, 1.3.8, Japanese,..."
5,"(Allied, offensive, , 1.3.10, Final, d...",[]
6,"(The, Nuclear, Age, begins, , 1.4)","[The Nuclear Age, 1.4]"
7,"(The, post, -, war, world, , 1.4.1)",[1.4.1]
8,"(The, end, of, empires, :, decolonization, ...","[1.4.2, The Cold War, 1947â€“1991]"
9,"(The, space, race, , 1.4.5, The, end, ...","[the Cold War 1.4.6 Information, 1.5, ..."


In [21]:
countries_df = pd.read_csv("countries_list_20th_century_1.5.csv",index_col=0)

In [27]:
countries_df["country_name"]=countries_df["country_name"].str.strip()

In [28]:
# Function to filter out entities not of interest

def filter_entity(ent_list, countries_df):
    return [ent for ent in ent_list
                  if ent in list(countries_df['country_name'])]

In [29]:
df_sentences['country_name'] = df_sentences['entities'].apply(lambda x: filter_entity(x, countries_df))

In [30]:
df_sentences

Unnamed: 0,sentence,entities,country_name
0,"( , Key, events, of, the, 20th, century, -,...",[the 20th century -],[]
1,"(articleAbout, WikipediaContact, us, \t\t...",[Search Search ...,[]
2,"(1.1.2, Russian, Revolution, and, communism, ...","[1.2, 1.2.1 Economic depression , 1.3,...",[]
3,"(1.3.1, The, war, in, Europe, , 1.3.2,...","[1.3.1, Europe, 1.3.2, Blitzkrieg 1.3.3]",[]
4,"(Operation, Overlord, , 1.3.6, Final, ...","[the Pacific , 1.3.7.1, 1.3.8, Japanese,...",[]
...,...,...,...
1593,"(Art, Christianity, Literature, Music, classic...","[Art Christianity Literature Music, 1970s 1980...",[]
1594,"(Text, is, available, under, the, Creative, Co...",[the Creative Commons Attribution-ShareAlike 4...,[]
1595,"(By, using, this, site, ,, you, agree, to, the...",[the Terms of Use and Privacy Policy],[]
1596,"(WikipediaÂ, ®, is, a, registered, trademark, ...","[WikipediaÂ®, the Wikimedia Foundation, Inc.]",[]


In [31]:
# Filter out sentences 

df_sentences_filtered = df_sentences[df_sentences['country_name'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_name
1088,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1093,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, Partition between In...",[Pakistan]
1102,"(^, "", The, Philippines, ,, 1898â€“1946, |, US...","[Philippines, 1898â€“1946, US House of Represe...",[Philippines]
1132,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Postcolonial Borders,...",[Afghanistan]
1168,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1232,"("", Selling, "", Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1263,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Stuck in Endless Preliminaries, Vietnam, the ...",[Vietnam]
1526,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1532,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1533,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


In [33]:
df_sentences_filtered['country_name'] = df_sentences_filtered['country_name'].apply(lambda x: [item.split()[0]
                                                                                                    for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentences_filtered['country_name'] = df_sentences_filtered['country_name'].apply(lambda x: [item.split()[0]


In [35]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_name
1088,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1093,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, Partition between In...",[Pakistan]
1102,"(^, "", The, Philippines, ,, 1898â€“1946, |, US...","[Philippines, 1898â€“1946, US House of Represe...",[Philippines]
1132,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Postcolonial Borders,...",[Afghanistan]
1168,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1232,"("", Selling, "", Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1263,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Stuck in Endless Preliminaries, Vietnam, the ...",[Vietnam]
1526,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1532,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1533,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


In [52]:
# Defining relationships

# window size = 5 : this defines how many sentences will be looked at simultaneously
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    country_name = sum((df_sentences_filtered.loc[i: end_i].country_name), [])

     # Remove duplicated characters that are next to each other
    country_unique = [country_name[i] for i in range(len(country_name))
                             if (i==0) or country_name[i] != country_name[i-1]]

    if len(country_unique) > 1:
        for idx, a in enumerate( country_unique[ :-1]):
          b = country_unique[idx + 1]
          relationships.append({"source": a, "target": b})

In [53]:
relationship_df = pd.DataFrame(relationships)

In [54]:
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Austria
4,Austria,Hungary
...,...,...
591,India,Singapore
592,India,Singapore
593,India,Singapore
594,India,Singapore


In [55]:
# Sort the cases with a- >b and b- >a
relationships_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationships_df.head(5)

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Germany,Russia
3,Austria,Germany
4,Austria,Hungary


In [56]:
relationships_df["value"] = 1
relationships_df = relationships_df.groupby(["source","target"], sort=False, as_index=False).sum()

relationships_df.head(10)

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,26
2,Austria,Germany,17
3,Austria,Hungary,6
4,Bulgaria,Hungary,6
5,Bulgaria,Russia,6
6,Germany,Italy,25
7,Germany,Spain,2
8,France,Poland,14
9,France,Germany,24
