# Name Entity Recognition(NER) Network Analysis

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module of spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 2.4 MB/s eta 0:00:06
     ---- ----------------------------------- 1.3/12.8 MB 2.7 MB/s eta 0:00:05
     ---- ----------------------------------- 1.3/12.8 MB 2.7 MB/s eta 0:00:05
     ---- ----------------------------------- 1.3/12.8 MB 2.7 MB/s eta 0:00:05
     ----- ---------------------------------- 1.8/12.8 MB 1.5 MB/s eta 0:00:08
     -------- ------------------------------- 2.6/12.8 MB 1.9 MB/s eta 0:00:06
     ---------- ----------------------------- 3.4/12.8 MB 2.2 MB/s eta 0:00:05
     ------------- -------------------------- 4.2/12.8 MB 2.4 MB/s eta 0:00:04
     ---------------- ----------------------- 5.

In [3]:
# Load spacy English module
NER = spacy.load("en_core_web_sm")

## Load 20th Century Events Text

In [4]:
# Defining path
path = r'C:\Users\jboer\20th-Century\20th_Century_Wiki_cleaned.txt'
# Loading text
with open(path, 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [5]:
# Creating NER object
text = NER(data)

In [6]:
# Visualize identified entities
displacy.render(text[273:20000], style = "ent", jupyter = True)

## Get sentence entities from NER object

In [7]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in text.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [8]:
# Checking df_sentences dataframe
df_sentences.head()

Unnamed: 0,sentence,entities
0,"(20th, century, changed, world, unprecedented,...","[20th century, World Wars, Cold War led, 21st ..."
1,"(The, new, beginning, 20th, century, marked, s...","[20th century, 1900s, decade, 1914, Panama Can..."
2,"(Russia, ended, hostile, actions, Central, Pow...","[Russia, Central Powers, Tsar Bolsheviks, Trea..."
3,"(Division, Austria, Hungary, World, War, I, Wh...","[Division Austria Hungary World War I, 1918, n..."
4,"(New, states, like, Yugoslavia, Czechoslovakia...","[Yugoslavia, Czechoslovakia, Austro Hungarian ..."


## Load country names

In [15]:
# Import country names
path = r'C:\Users\jboer\20th-Century\countries_list_20th_century_1.5.csv'
country = pd.read_csv(path, index_col = 0)

In [16]:
# Checking dataframe
print(country.head())

  country_name
1  Afghanistan
2      Albania
3      Algeria
4      Andorra
5       Angola


In [11]:
# Mapping variations of country names
country_var = {
    "United States": ['United States', 'US', 'USA', 'America', 'American', 'Americans', 'States', 'U'],
    "Germany": ['Germany', 'Germans', 'German'],
    "United Kingdom": ['United Kingdom', 'Britain', 'Kingdom', 'England', 'British'],
    "Australia": ['Australia', 'Australians'],
    "Japan": ['Japan', 'Japanese'],
    "China, People's Republic of": ['China', 'Chinese'],
    "Korea, North": [('North Korea'), ('North Korean'), ('North Koreans')],
    "Korea, South": [('South Korea'), ('South Korean'), ('South Koreans')],
    "France": ['France', 'French'],
    "Vietnam": ['Vietnam', 'Vietnamese'],
    "Russia": ['Russia', 'Russian', 'Russians', 'USSR', 'Soviet'],
    "Czech Republic": ['Czech Republic', 'Czechoslovakia', 'Czech'],
    "Hungary": ['Hungary', 'Hungarian', 'Hungarians'],
    "Finland": ['Finland', 'Finnish'],
    "Sweden": ['Sweden', 'Swedish']
}

#### By creating a map of all the variations a country could have in the text, I am ensuring that every mention of a country, whether by it's formal name or some other variation of it's name, will be counted. This will give me a more accurate representation of the number of times a country is mentioned.

In [12]:
# Creating dataframe for country variations
data = []
for country, variations in country_var.items():
    for variation in variations:
        data.append([country, variation])
        
df_var = pd.DataFrame(data, columns=['country_name', 'variation'])

In [13]:
df_var.head()

Unnamed: 0,country_name,variation
0,United States,United States
1,United States,US
2,United States,USA
3,United States,America
4,United States,American


In [17]:
# Merging country dataframe with df_var dataframe
df_merged = pd.merge(country, df_var, on='country_name', how='left')

In [18]:
print(df_merged)

                         country_name variation
0                         Afghanistan       NaN
1                             Albania       NaN
2                             Algeria       NaN
3                             Andorra       NaN
4                              Angola       NaN
..                                ...       ...
236  Sahrawi Arab Democratic Republic       NaN
237                        Somaliland       NaN
238                     South Ossetia       NaN
239                            Taiwan       NaN
240                      Transnistria       NaN

[241 rows x 2 columns]


In [19]:
# Filling null values in variation column
df_merged['variation'] = df_merged['variation'].fillna(df_merged['country_name'])

In [20]:
print(df_merged)

                         country_name                         variation
0                         Afghanistan                       Afghanistan
1                             Albania                           Albania
2                             Algeria                           Algeria
3                             Andorra                           Andorra
4                              Angola                            Angola
..                                ...                               ...
236  Sahrawi Arab Democratic Republic  Sahrawi Arab Democratic Republic
237                        Somaliland                        Somaliland
238                     South Ossetia                     South Ossetia
239                            Taiwan                            Taiwan
240                      Transnistria                      Transnistria

[241 rows x 2 columns]


## Filtering entities from text

In [21]:
# Function to filter out entities not of interest
def filter_entity(ent_list, df_merged):
    return [ent for ent in ent_list 
            if ent in list(df_merged['variation'])]

In [22]:
# Check
filter_entity(["Germany", "CF", "2"], df_merged)

['Germany']

In [23]:
# Creating list of entities
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, df_merged))

In [24]:
# Checking list
df_sentences['country_entities'].head(20)

0                                                    []
1           [British, France, German, Austria, Hungary]
2     [Russia, Germany, Russia, Germany, Ukraine, Ge...
3                           [Germans, Germans, Germany]
4                                      [Czechoslovakia]
5                                             [Russian]
6                                                    []
7                                                    []
8                                                    []
9                                                    []
10    [Germans, Germany, Italy, Germany, Germany, Ge...
11    [German, German, Germans, United States, Germany]
12                        [German, Czech, Czech, Spain]
13                                                   []
14                                                   []
15                                                   []
16                                                   []
17    [Czechoslovakia, Britain, France, Poland, 

In [25]:
# Filter out sentences that don't have any country entities
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [26]:
# Checking filtered sentences
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
76,"(Korean, War, Vietnam, War, Cuban, Missile, Cr...","[Korean War Vietnam War Cuban Missile Crisis, ...","[China, North Korea, United States, South Kore..."
78,"(After, French, withdrawal, former, colony, 21...","[French, 21 July 1954, Vietnam, two, Korea, 17...","[French, Vietnam, United States, American, US,..."
79,"(This, Nixon, Doctrine, involved, gradual, pul...","[Nixon, American, South Vietnamese, American, ...","[American, American, Cambodia, US, US, US]"
80,"(Vietnam, unified, Communist, rule, year, late...","[Vietnam, Communist, year later, one, Cuban Mi...","[Vietnam, United States, Cuba, Soviet, US, Sov..."
81,"(Five, landings, astronauts, followed, Apollo,...","[Five, Apollo, 13, US, Soviet, early 21st cent...","[US, Soviet, United States, Russia, Japan, Can..."
83,"(By, 1980s, Soviet, Union, weakening, Sino, So...","[1980s Soviet Union, Sino Soviet, USSR, People...",[USSR]
84,"(Its, arms, race, US, draining, country, funds...","[US, Mikhail Gorbachev, Solidarity, Berlin, So...","[US, Soviet, Lithuania, Russia]"
96,"(By, end, century, technological, advances, sc...","[end century, Europe, first, Indian, sixth, en...","[China, US]"
97,"(This, led, anti, Western, anti, American, fee...","[Western, American, Middle East, China, India,...","[American, China, India]"
100,"(Third, World, However, developing, countries,...","[South Africa, Nelson Mandela, first, four hal...","[South Africa, Rwanda, North Korea]"


## Create relationships

In [27]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    country_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated countries that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list)) 
                   if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [28]:
# Creating a dataframe for relationships
relationship_df = pd.DataFrame(relationships)

In [29]:
# Checking relationships dataframe
relationship_df

Unnamed: 0,source,target
0,British,France
1,France,German
2,German,Austria
3,Austria,Hungary
4,Hungary,Russia
...,...,...
1758,Rwanda,North Korea
1759,South Africa,Rwanda
1760,Rwanda,North Korea
1761,South Africa,Rwanda


In [30]:
# Flattening variation dictionary to create a mapping dictionary
# Flatten the dictionary to create a mapping dictionary
mapping_dict = {variation: country for country, variations in country_var.items() for variation in variations}
print(mapping_dict)

{'United States': 'United States', 'US': 'United States', 'USA': 'United States', 'America': 'United States', 'American': 'United States', 'Americans': 'United States', 'States': 'United States', 'U': 'United States', 'Germany': 'Germany', 'Germans': 'Germany', 'German': 'Germany', 'United Kingdom': 'United Kingdom', 'Britain': 'United Kingdom', 'Kingdom': 'United Kingdom', 'England': 'United Kingdom', 'British': 'United Kingdom', 'Australia': 'Australia', 'Australians': 'Australia', 'Japan': 'Japan', 'Japanese': 'Japan', 'China': "China, People's Republic of", 'Chinese': "China, People's Republic of", 'North Korea': 'Korea, North', 'North Korean': 'Korea, North', 'North Koreans': 'Korea, North', 'South Korea': 'Korea, South', 'South Korean': 'Korea, South', 'South Koreans': 'Korea, South', 'France': 'France', 'French': 'France', 'Vietnam': 'Vietnam', 'Vietnamese': 'Vietnam', 'Russia': 'Russia', 'Russian': 'Russia', 'Russians': 'Russia', 'USSR': 'Russia', 'Soviet': 'Russia', 'Czech Rep

In [31]:
# Replacing variations in relationship_df with main country names using mapping dictionary
relationship_df['source'] = relationship_df['source'].replace(mapping_dict)
relationship_df['target'] = relationship_df['target'].replace(mapping_dict)

In [32]:
# Checking dataframe
print(relationship_df.head())

           source   target
0  United Kingdom   France
1          France  Germany
2         Germany  Austria
3         Austria  Hungary
4         Hungary   Russia


#### Since I used the varaiation map before to make sure I counted each mention of a country, I will use it again to replace the varaiations in the dataframe with the actual country name, so that when I summarize the interactions, each variation will be counted under the proper country name.

In [33]:
# Sort the cases with a->b and b->a
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,United Kingdom
1,France,Germany
2,Austria,Germany
3,Austria,Hungary
4,Hungary,Russia


In [34]:
# Summarize the interactions
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [35]:
# Checking the summary of interactions
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,United Kingdom,55
1,France,Germany,42
2,Austria,Germany,2
3,Austria,Hungary,2
4,Hungary,Russia,8
5,Germany,Russia,57
6,Germany,Ukraine,6
7,Germany,United States,82
8,United Kingdom,United States,27
9,Germany,United Kingdom,73


#### Now I have a dataframe that is showing the interactions of each country in the text referred to by their actual country name, but still accouting for every varaition a country name could have had in the text.
#### This dataframe will be useful to see how countries interact with each other in the text, ultimately telling us how closely related two countries are in the events of the 20th century.

In [36]:
# Export dataframe
relationship_df.to_csv('country_relationship.csv')