In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import networkx as nx

plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Reading in the data 

In [None]:
df = pd.read_csv('/Users/KStamp/Desktop/Katie Stamp - DSI20 - LDN/Cleaned Datasets/final_gtd.csv', index_col=['Unnamed: 0'])

In [None]:
df.head(2)

In [None]:
# Map the numeric country codes back to readable names so that network nodes
# can be labelled by country.
# Underscores stand in for spaces in most multi-word names; the spelling of each
# name is kept exactly as it is used for node lookups later in the notebook
# (e.g. 'United_States', 'United_Kingdom').
# NOTE(review): codes absent from this mapping are left as numbers by
# Series.replace, so `country_txt` can end up with mixed types -- confirm that
# every code present in the data is listed here.
COUNTRY_NAMES = {
    4: 'Afghanistan',
    5: 'Albania',  # was ' Albania' (leading space), which created a mis-named node
    6: 'Algeria',
    7: 'Andorra',
    8: 'Angola',
    10: 'Antigua and Barbuda',
    11: 'Argentina',
    12: 'Armenia',
    14: 'Australia',
    15: 'Austria',
    16: 'Azerbaijan',
    17: 'Bahamas',
    18: 'Bahrain',
    19: 'Bangladesh',
    20: 'Barbados',
    21: 'Belgium',
    22: 'Belize',
    23: 'Benin',
    24: 'Bermuda',
    25: 'Bhutan',
    26: 'Bolivia',
    28: 'Bosnia-Herzegovina',
    29: 'Botswana',
    30: 'Brazil',
    31: 'Brunei',
    32: 'Bulgaria',
    33: 'Burkina_Faso',
    34: 'Burundi',
    35: 'Belarus',
    36: 'Cambodia',
    37: 'Cameroon',
    38: 'Canada',
    41: 'Central_African_Republic',
    42: 'Chad',
    43: 'Chile',
    44: 'China',
    45: 'Colombia',
    46: 'Comoros',
    47: 'Republic_of_the_Congo',
    49: 'Costa_Rica',
    50: 'Croatia',
    51: 'Cuba',
    53: 'Cyprus',
    54: 'Czech_Republic',
    55: 'Denmark',
    56: 'Djibouti',
    57: 'Dominica',
    58: 'Dominican_Republic',
    59: 'Ecuador',
    60: 'Egypt',
    61: 'El_Salvador',
    62: 'Equatorial_Guinea',
    63: 'Eritrea',
    64: 'Estonia',
    65: 'Ethiopia',
    66: 'Falkland_Islands',
    67: 'Fiji',
    68: 'Finland',
    69: 'France',
    70: 'French_Guiana',
    71: 'French_Polynesia',
    72: 'Gabon',
    73: 'Gambia',
    74: 'Georgia',
    75: 'Germany',
    76: 'Ghana',
    78: 'Greece',
    79: 'Greenland',
    80: 'Grenada',
    81: 'Guadeloupe',
    83: 'Guatemala',
    84: 'Guinea',
    85: 'Guinea-Bissau',
    86: 'Guyana',
    87: 'Haiti',
    88: 'Honduras',
    89: 'Hong_Kong',
    90: 'Hungary',
    91: 'Iceland',
    92: 'India',
    93: 'Indonesia',
    94: 'Iran',
    95: 'Iraq',
    96: 'Ireland',
    97: 'Israel',
    98: 'Italy',
    99: 'Ivory_Coast',
    100: 'Jamaica',
    101: 'Japan',
    102: 'Jordan',
    103: 'Kazakhstan',
    104: 'Kenya',
    106: 'Kuwait',
    107: 'Kyrgyzstan',
    108: 'Laos',
    109: 'Latvia',
    110: 'Lebanon',
    111: 'Lesotho',
    112: 'Liberia',
    113: 'Libya',
    114: 'Liechtenstein',
    115: 'Lithuania',
    116: 'Luxembourg',
    117: 'Macau',
    118: 'Macedonia',
    119: 'Madagascar',
    120: 'Malawi',
    121: 'Malaysia',
    122: 'Maldives',
    123: 'Mali',
    124: 'Malta',
    125: 'Isle_of_Man',
    126: 'Marshall_Islands',
    127: 'Martinique',
    128: 'Mauritania',
    129: 'Mauritius',
    130: 'Mexico',
    132: 'Moldova',
    134: 'Mongolia',
    136: 'Morocco',
    137: 'Mozambique',
    138: 'Myanmar',
    139: 'Namibia',
    141: 'Nepal',
    142: 'Netherlands',
    143: 'New_Caledonia',
    144: 'New_Zealand',
    145: 'Nicaragua',
    146: 'Niger',
    147: 'Nigeria',
    149: 'North_Korea',
    151: 'Norway',
    152: 'Oman',
    153: 'Pakistan',
    155: 'West_Bank and Gaza_Strip',
    156: 'Panama',
    157: 'Papua_New_Guinea',
    158: 'Paraguay',
    159: 'Peru',
    160: 'Philippines',
    161: 'Poland',
    162: 'Portugal',
    163: 'Puerto_Rico',
    164: 'Qatar',
    166: 'Romania',
    167: 'Russia',
    168: 'Rwanda',
    169: 'Saba(Netherlands_Antilles)',
    173: 'Saudi_Arabia',
    174: 'Senegal',
    175: 'Serbia-Montenegro',
    176: 'Seychelles',
    177: 'Sierra_Leone',
    178: 'Singapore',
    179: 'Slovak_Republic',
    180: 'Slovenia',
    181: 'Solomon_Islands',
    182: 'Somalia',
    183: 'South_Africa',
    184: 'South_Korea',
    185: 'Spain',
    186: 'Sri_Lanka',
    189: 'St.Kitts and Nevis',
    190: 'St.Lucia',
    192: 'St.Martin',
    195: 'Sudan',
    196: 'Suriname',
    197: 'Swaziland',
    198: 'Sweden',
    199: 'Switzerland',
    200: 'Syria',
    201: 'Taiwan',
    202: 'Tajikistan',
    203: 'Tanzania',
    204: 'Togo',
    205: 'Thailand',
    206: 'Tonga',
    207: 'Trinidad and Tobago',
    208: 'Tunisia',
    209: 'Turkey',
    210: 'Turkmenistan',
    212: 'Tuvalu',
    213: 'Uganda',
    214: 'Ukraine',
    215: 'United_Arab_Emirates',
    216: 'Great_Britain',
    217: 'United_States',
    218: 'Uruguay',
    219: 'Uzbekistan',
    220: 'Vanuatu',
    221: 'Vatican_City',
    222: 'Venezuela',
    223: 'Vietnam',
    225: 'Virgin_Islands_(U.S)',
    226: 'Wallis and Futuna',
    228: 'Yemen',
    603: 'United_Kingdom',
    362: 'West_Germany',
    604: 'Zaire',
    406: 'South_Yemen',
    230: 'Zambia',
    1003: 'Kosovo',
    229: 'Democratic Republic of the Congo',
}

df['country_txt'] = df['country'].replace(COUNTRY_NAMES)

In [None]:
df

In [None]:
df.country_txt.unique()

In [None]:
df.head()

# Creating a bipartite network between terrorist group and country

In [None]:
from networkx.algorithms import bipartite

# Build the two-mode graph: terrorist groups are tagged bipartite=0 and
# countries bipartite=1; each row of df contributes one group-country edge
# (repeated pairs collapse into a single edge in an nx.Graph).
B = nx.Graph()
for column, side in (('gname', 0), ('country_txt', 1)):
    B.add_nodes_from(df[column], bipartite=side)
B.add_edges_from(zip(df['gname'], df['country_txt']))

In [None]:
# Summary of the bipartite graph. nx.info() was removed in NetworkX 3.0, so the
# counts are printed explicitly, which works on every NetworkX version.
print("Number of nodes:", B.number_of_nodes())
print("Number of edges:", B.number_of_edges())

In [None]:
# Recover the two node sets from the `bipartite` attribute stored when B was
# built (0 = terrorist groups, 1 = countries). bipartite.sets(B) infers the sets
# by two-colouring instead: it raises AmbiguousSolution when B is disconnected
# and does not promise which set is returned first.
bottom_nodes = {node for node, side in B.nodes(data='bipartite') if side == 0}
top_nodes = set(B) - bottom_nodes
print("First group:", bottom_nodes, "\n")
print("Second group:", top_nodes)

In [None]:
# Two-column layout: x = 0 for the first node set and x = 1 for the second;
# y is simply the node's index within its own set.
position = [(0, row) for row in range(len(bottom_nodes))] + \
    [(1, row) for row in range(len(top_nodes))]

positions = {}
offset = len(bottom_nodes)  # where the second set's coordinates start in `position`
positions.update((node, np.array(position[i])) for i, node in enumerate(bottom_nodes))
positions.update((node, np.array(position[i + offset])) for i, node in enumerate(top_nodes))

In [None]:
positions

In [None]:
# plotting positions to show connections
plt.figure(figsize=(62,48))

# Edges and labels for the whole bipartite graph in the two-column layout held in
# `positions`; nodes are drawn white here so the coloured overlays below show through.
nx.draw(B, pos=positions, with_labels=True, node_color='white', font_size=15, edge_color='gray', style='dashed')
# Colour each side by passing its nodes as `nodelist`. This avoids building two
# projected graphs just to get at their node sets, and drops the spring layout
# that was computed here but never used.
nx.draw_networkx_nodes(B, pos=positions, nodelist=list(bottom_nodes),
                       node_color='red', node_size=1000, alpha=0.5)
nx.draw_networkx_nodes(B, pos=positions, nodelist=list(top_nodes),
                       node_color='gold', node_size=1000, alpha=0.5)
# This is a bipartite (two-mode) network, not a neural network.
plt.title("Terrorist Group vs Country Bipartite Network", fontsize=50)
plt.show()

We can see that groups such as Muslim extremists have a large number of links, meaning they have attacked multiple countries. We could therefore hypothesise that they will be connected to a larger network of terrorist groups.

In [None]:
# One-mode projections of the bipartite graph B: in a projection, two nodes of
# the same set are linked when they share at least one neighbour in B.
# G_top is drawn below as the network of countries and G_bottom as the network
# of terrorist groups.
G_top = bipartite.projected_graph(B, top_nodes)
G_bottom = bipartite.projected_graph(B, bottom_nodes)

In [None]:
# Plotting the network between terrorist groups
plt.figure(figsize=(32,28))
nx.draw(G_bottom, with_labels=True, node_color='red', edge_color='gray', style='dashed', font_size=25)

As predicted, we can see a large number of links to and from Muslim extremists, pointing to a network of terrorism. Other groups, such as the National Liberation Front, are linked to only one other group (the Nicaraguan Democratic Front); we can hypothesise that such groups attack a single country and that their agenda is more localised.

In [None]:
# Plotting the network of countries
plt.figure(figsize=(32,28))
nx.draw(G_top, with_labels=True, node_color='gold', edge_color='gray', style='dashed', font_size=25)

The network is not arranged by latitude or longitude at the moment, yet we can already see connections within continents: South American countries are connected in the bottom right, and a collection of European countries sits in the middle of the network.

In [None]:
# plotting degree distribution of Terrorist Groups
# Degrees are taken from the group-group projection, and the bin range is derived
# from those same degrees. (Taking the maximum from B instead would silently drop
# any projected degree larger than the largest degree in the bipartite graph.)
group_degrees = [degree for _, degree in bipartite.projected_graph(B, bottom_nodes).degree()]
d_max = max(group_degrees, default=0)
# Unit-width bins with edges 0, 1, ..., d_max + 1; align='left' centres each bar
# on its integer degree, and the top degree gets a bin of its own.
plt.hist(group_degrees, bins=np.arange(d_max + 2), align='left', rwidth=0.5)
plt.xlabel('Degree')
plt.ylabel('Counts')
plt.title('Degree Distribution Terrorist Groups')
plt.show()

In [None]:
# plotting degree distribution of countries
# Degrees are taken from the country-country projection, and the bin range is
# derived from those same degrees. (Taking the maximum from B instead would
# silently drop any projected degree larger than the largest degree in the
# bipartite graph.)
country_degrees = [degree for _, degree in bipartite.projected_graph(B, top_nodes).degree()]
d_max = max(country_degrees, default=0)
# Unit-width bins with edges 0, 1, ..., d_max + 1; align='left' centres each bar
# on its integer degree, and the top degree gets a bin of its own.
plt.hist(country_degrees, bins=np.arange(d_max + 2), align='left', rwidth=0.5)
plt.xlabel('Degree')
plt.ylabel('Counts')
plt.title('Degree Distribution Countries')
plt.show()

The degree distributions show closer links within the terrorist groups than the countries themselves, the highest number of connections within the terrorist groups is 25, whilst the countries have the larger count of 50.

It also appears that the smaller the degrees of connections within the terrorist group the higher the count, whereas the degrees of connections with countries are opposed to this.

In [None]:
# computing the clustering coefficient for nodes
nx.clustering(bipartite.projected_graph(B, bottom_nodes))

In [None]:
nx.average_clustering(bipartite.projected_graph(B, bottom_nodes))

In [None]:
nx.clustering(bipartite.projected_graph(B, top_nodes))

In [None]:
nx.average_clustering(bipartite.projected_graph(B, top_nodes))

In [None]:
# plotting clustering coefficient distribution of Terrorist Groups
plt.hist(list(nx.clustering(bipartite.projected_graph(B, bottom_nodes)).values()),
         align='left', rwidth=0.5)
plt.xlabel('Clustering coefficient')
plt.ylabel('Counts')
plt.title('Clustering coefficient distribution - Terrorist Groups')
plt.show()

In [None]:
# plotting clustering coefficient distribution of Countries
plt.hist(list(nx.clustering(bipartite.projected_graph(B, top_nodes)).values()),
         align='left', rwidth=0.5)
plt.xlabel('Clustering coefficient')
plt.ylabel('Counts')
plt.title('Clustering coefficient distribution - Countries')
plt.show()

Both histograms show how well connected the neighbourhood of each node is. The most frequent coefficient in both groups is 1, showing that there are large quantities of connections within each group: for many nodes, every pair of their neighbours is itself connected.

In [None]:
print("Adjacency matrix of the bipartite graph:")
print(nx.adjacency_matrix(B).todense())

In [None]:
# plotting adjacency matrix
plt.figure(figsize=(32,28))
plt.spy(nx.adjacency_matrix(B), markersize=20)
plt.show()

The matrix is perfectly symmetric about the diagonal, which means the graph is undirected: every link between a group and a country is recorded once in each direction.

In [None]:
 nx.clustering(B)

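The bipartite structure is confirmed above: every node has a clustering coefficient of 0, because links only run between a group and a country, never between two nodes of the same set, so no triangles can form.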

In [None]:
# reviewing the paths between terrorist groups
nx.shortest_path_length(B, source='Taliban')

We can use shortest path length to show connections between terrorist groups. For example, above, the Taliban is 1 step away from Turkmenistan, Pakistan and Afghanistan, denoting that these are the countries most likely to be attacked by the Taliban. We can also say that there are 6 groups only 2 steps away from the Taliban, denoting the fact that they have attacked the same countries and may therefore have similar agendas.

In [None]:
# reviewing the paths between countries
nx.shortest_path_length(B, source='United_States')

We can use shortest path length to show connections between countries. For example, above, the United States is 1 step away from 5 groups, such as the IRA, denoting that these are the groups most likely to attack it. We can also say that there are over 50 countries only 2 steps away from the United States, denoting the fact that they have been attacked by the same groups.

In [None]:
# evaluating 'betweenness'
node_centrality = nx.betweenness_centrality(B)
edge_centrality = nx.edge_betweenness_centrality(B)

In [None]:
node_centrality

In [None]:
node_corr = pd.DataFrame(list(node_centrality.items()), columns=['node', 'centrality'])

In [None]:
edge_corr = pd.DataFrame(list(edge_centrality.items()), columns=['edge', 'centrality'])

In [None]:
# 'node' holds labels rather than numbers, so restrict the correlation to the
# numeric column (DataFrame.corr raises on non-numeric columns in pandas >= 2.0).
# NOTE(review): with a single numeric column this is a trivial 1x1 matrix --
# confirm what this heatmap is meant to show.
sns.heatmap(node_corr[['centrality']].corr());

In [None]:
# 'edge' holds (node, node) tuples rather than numbers, so restrict the
# correlation to the numeric column (DataFrame.corr raises on non-numeric
# columns in pandas >= 2.0).
# NOTE(review): with a single numeric column this is a trivial 1x1 matrix --
# confirm what this heatmap is meant to show.
sns.heatmap(edge_corr[['centrality']].corr());

In [None]:
edge_centrality

In [None]:
# Horizontal bar chart of node betweenness: one bar per node, in dictionary order,
# labelled with the node name.
node_names = list(node_centrality.keys())
node_scores = list(node_centrality.values())
bar_slots = list(range(len(node_scores)))

plt.figure(figsize=(52,38))
plt.barh(bar_slots, node_scores)
plt.yticks(bar_slots, node_names)
plt.title('Node betweenness centrality')
plt.show()

As predicted previously, we can see that Muslim extremists have the highest betweenness, meaning this group serves as a bridge from one part of the graph to another.

In [None]:
# Horizontal bar chart of edge betweenness: one bar per edge, in dictionary order,
# labelled with the edge's (node, node) pair.
edge_names = list(edge_centrality.keys())
edge_scores = list(edge_centrality.values())
bar_slots = list(range(len(edge_scores)))

plt.figure(figsize=(52,38))
plt.barh(bar_slots, edge_scores)
plt.yticks(bar_slots, edge_names)
plt.title('Edge betweenness centrality')
plt.show()

Here the highest edge betweenness belongs to the link between the terrorist group M-19 (Movement of April '19) and the country Lebanon, with the link between Muslim extremists and Mali as the second most central edge.

In [None]:
import copy

def breaking_graph(H, node_list):
    """Remove nodes from ``H`` until it is no longer a single connected component.

    Nodes are taken from the END of ``node_list`` first, so a list sorted by
    ascending importance removes the most important node first, while a
    shuffled list gives a random attack.

    Parameters
    ----------
    H : networkx.Graph
        Graph to attack. It is modified IN PLACE, so pass a copy to keep the original.
    node_list : sequence
        Nodes of ``H`` in removal order (last element first). Not modified.

    Returns
    -------
    int
        Number of nodes removed. At least one node is always removed when
        ``node_list`` is non-empty, even if ``H`` was already disconnected.
    """
    # A shallow copy is enough: the list is only popped from, never edited element-wise.
    remaining = list(node_list)
    num_components = 1
    count = 0
    # Stop once the graph splits (or becomes empty, which reports 0 components),
    # or when there is nothing left to remove, instead of popping from an empty list.
    while num_components == 1 and remaining:
        count += 1
        node_to_delete = remaining.pop()
        H.remove_node(node_to_delete)
        num_components = nx.number_connected_components(H)
    return count

In [None]:
# Single random attack: count how many randomly chosen nodes have to be removed
# before the network stops being one connected component.
G_GCC = B.copy()
random_list = list(G_GCC.nodes())  # fresh list, so shuffling leaves the graph untouched
np.random.shuffle(random_list)

c = breaking_graph(G_GCC, random_list)
print("num of iterations:", c)

In [None]:
plt.figure(figsize=(32,28))
graphviz_pos = nx.drawing.layout.spring_layout(G_GCC)
nx.draw(G_GCC, graphviz_pos, node_size=200,
        with_labels=True)

In [None]:
# Average, over n_iter independent random attacks, of the number of node removals
# needed to break the network.
n_iter = 1000
removals_per_run = []
for _ in range(n_iter):
    G_GCC = B.copy()  # each attack starts from the intact network
    random_list = list(G_GCC.nodes())
    np.random.shuffle(random_list)
    removals_per_run.append(breaking_graph(G_GCC, random_list))

print("average iterations to break GCC:", sum(removals_per_run)/n_iter)

On average it takes 7.577 random nodes to break the network (splitting mostly countries from the group) 

7.577/139 nodes = 5% 

In [None]:
import operator

# Targeted attack: remove nodes in order of decreasing betweenness centrality.
G_GCC = B.copy()
node_centrality = nx.betweenness_centrality(G_GCC)

# Ascending by score, so the most central node sits last; breaking_graph pops
# from the end of the list.
sorted_bc = sorted(node_centrality.items(), key=lambda pair: pair[1])
node_ranking = [name for name, _ in sorted_bc]

c = breaking_graph(G_GCC, node_ranking)
print("num of iterations:", c)

In [None]:
# Repeat the targeted attack n_iter times and average the removal counts.
# NOTE(review): node_ranking is fixed and each pass starts from a fresh copy of B,
# so every pass removes the same nodes and returns the same count; the average
# equals the result of a single run.
n_iter = 1000
count = 0.0
for i in range(n_iter):
    G_GCC = B.copy()
    c = breaking_graph(G_GCC, node_ranking)
    count += c

print("average iterations to break GCC:", count/n_iter)

When nodes are removed in order of centrality, it takes only 1 iteration to break the network, every time (1000 runs), because the function always removes the node with the highest centrality first.

In [None]:
# plotting betweenness centrality
# Histogram of the node betweenness values using 20 logarithmically spaced bins.
# NOTE(review): np.logspace interprets its arguments as base-10 exponents by
# default, while the exponents here are built from natural logs; the bins
# therefore span roughly 1.5e-5 to 4.1e-2. Values outside that range (including
# exact zeros) are not drawn -- confirm this is the intended range.
plt.hist(list(node_centrality.values()),
         bins=np.logspace(-3*np.log(5), -1*np.log(4), 20))
plt.title('Betweenness centrality')
plt.show()

We can see large quantities of nodes with low betweenness centrality, meaning that overall most nodes have little influence over the flow of information in the graph.

In [None]:
# plotting the network with drop in components
def Girvan_Newman(G_1, plot_component_limit=22):
    """Run Girvan-Newman edge removal on a copy of ``G_1``, printing and plotting progress.

    The edge with the highest betweenness centrality is removed repeatedly
    (centrality is recomputed after every removal) until no edges remain.
    Each removal is printed together with the resulting number of connected
    components. Whenever the component count rises above the previous maximum
    (starting from 1) and is still below ``plot_component_limit``, the current
    graph is drawn using a layout computed once from the intact graph.

    Parameters
    ----------
    G_1 : networkx.Graph
        Input graph; it is copied and never modified.
    plot_component_limit : int, default 22
        Figures are drawn only while the number of components is below this value.

    Returns
    -------
    None
    """
    G = G_1.copy()
    pos = nx.drawing.spring_layout(G)
    actual_number_components = 1
    # Looping on the edge count also handles a graph that starts with no edges.
    while G.number_of_edges() > 0:
        d_edge = nx.edge_betweenness_centrality(G)
        # Only the maximum is needed, so skip the full sort. Scanning in reverse
        # makes ties resolve to the edge that comes last in iteration order,
        # which is the edge a stable ascending sort followed by pop() selects.
        edge, _ = max(reversed(list(d_edge.items())), key=lambda item: item[1])
        print("deleting edge:", edge, end=' ')
        G.remove_edge(*edge)
        num_comp = nx.number_connected_components(G)
        print("...we have now ", num_comp, " components")
        if num_comp > actual_number_components:
            actual_number_components = num_comp
            if num_comp < plot_component_limit:
                plt.figure(figsize=(28,20))
                nx.draw(G, pos, with_labels=True)
                plt.show()

In [None]:
Girvan_Newman(B)

In [None]:
GN_communities = list(nx.community.girvan_newman(B))
GN_communities

In [None]:
# plotting communities in clusters
def plot_communities(G, number_of_clusters=2, seed=1):
    """Draw ``G`` with nodes coloured by their Girvan-Newman community.

    Parameters
    ----------
    G : networkx.Graph
        Graph to partition and draw. The level lookup assumes ``G`` is connected,
        so that the first Girvan-Newman level has two communities.
    number_of_clusters : int, default 2
        Number of communities wanted; values above the number of nodes are
        capped at the number of nodes.
    seed : int, default 1
        Seed for the spring layout, so repeated calls give the same picture.

    Raises
    ------
    ValueError
        If ``number_of_clusters`` is below 2 or ``G`` has fewer than two nodes.
    IndexError
        If the Girvan-Newman hierarchy has no level with that many communities.
    """
    from itertools import islice

    # Level k of the hierarchy (0-based) holds k + 2 communities for a connected graph.
    index = min(number_of_clusters, G.number_of_nodes()) - 2
    if index < 0:
        raise ValueError("number_of_clusters must be at least 2 and G must have at least two nodes")

    # girvan_newman is a lazy generator: advance only as far as the requested
    # level instead of computing the whole hierarchy.
    communities = next(islice(nx.community.girvan_newman(G), index, None), None)
    if communities is None:
        raise IndexError("the Girvan-Newman hierarchy has no level with that many communities")

    pos = nx.drawing.spring_layout(G, seed=seed)

    # Colour index of each node = position of its community within the partition.
    label_dict = {}
    for i, nodes in enumerate(communities):
        for node in nodes:
            label_dict[node] = i
    label_list = [label_dict[node] for node in G.nodes()]

    plt.figure(figsize=(32,28))

    nx.draw(G, pos=pos, node_color = label_list, cmap='rainbow', with_labels=True)

    plt.show()

In [None]:
plot_communities(B, number_of_clusters=2)

In [None]:
plot_communities(B, number_of_clusters=6)

In [None]:
plot_communities(B, number_of_clusters=12)

# Shortest Connections In Network

In [None]:
# shortest connection between UK AND US is IRA
nx.shortest_path(B, 'United_Kingdom', 'United_States')

In [None]:
def shortest_connection(G, start, end):
    """Print the length of the shortest path between two nodes of ``G`` and the path itself.

    Parameters
    ----------
    G : networkx.Graph
        Graph to search.
    start, end : node
        Endpoints of the path.

    networkx raises its own exception when either node is missing or when no
    path connects them.
    """
    # Search once: on an unweighted path the number of hops is one less than the
    # number of nodes, so there is no need to run a second shortest-path query.
    path = nx.shortest_path(G, start, end)
    print("The Network Path Length Between Them Is:", len(path) - 1)
    print("Shortest Connection Between Them Is:", path)

In [None]:
# The helper defined above is named shortest_connection; the previous call used
# the undefined name shortest_connection_path and raised NameError.
shortest_connection(B, 'United_Kingdom', 'United_States')

Here we can look at the groups that were incorrectly guessed by our models and their connections.

In [None]:
shortest_connection(B, 'National Liberation Army of Colombia (ELN)', 
                         'Revolutionary Armed Forces of Colombia (FARC)')

In [None]:
shortest_connection(B, 'Abu Sayyaf Group (ASG)', "New People's Army (NPA)")

We can see that both of these pairs of groups have a short connection between them, with just 1 country as their link, so we can begin to understand why the model incorrectly predicted one as the other. (Visually shown further down.)

In [None]:
# The diameter is the longest of all shortest paths between any two points.
nx.diameter(B, e=None, usebounds=False)

# Calling in The Dataset With Latitude and Longitude

In [None]:
df2 = pd.read_csv('/Users/KStamp/Desktop/geo_code_save.csv', index_col='Unnamed: 0')

In [None]:
df.head()

In [None]:
df2.head()[['latitude', 'longitude', 'country_txt']]

In [None]:
# merging latitude, longitude, country and terrorist group into one df 
df3 = df.merge(df2, left_index=True, right_index=True)
df3[['latitude', 'longitude', 'country_txt_x', 'gname_x']]

In [None]:
# One (longitude, latitude) position per country: the mean over all of that
# country's rows in df3. Building the dict straight from the row-level columns
# would keep only the last row for each country and discard the rest.
country_centroids = df3.groupby('country_txt_x', sort=False)[['longitude', 'latitude']].mean()
locations = dict(zip(country_centroids.index, country_centroids.to_numpy()))

In [None]:
# Using the option pos in the drawing function to position the nodes according to their 
# longitudinal and latitudinal coordinates. 
plt.figure(figsize=(32,28))
nx.draw(G_top, pos=locations, with_labels=True, node_color='gold', edge_color='gray', style='dashed', font_size=25)

Here we can see countries have been grouped together by location on a map much clearer than before, Europe is visible in mid-centre, South Africa at the bottom of the network and South American countries under the United States.

In [None]:
# One (longitude, latitude) position per terrorist group: the mean over all of
# that group's rows in df3. Building the dict straight from the row-level columns
# would keep only the last row for each group and discard the rest.
group_centroids = df3.groupby('gname_x', sort=False)[['longitude', 'latitude']].mean()
locations_gname = dict(zip(group_centroids.index, group_centroids.to_numpy()))

In [None]:
# Using the option pos in the drawing function to position the nodes according to their 
# longitudinal and latitudinal coordinates for terrorist groups.
plt.figure(figsize=(32,28))
nx.draw(G_bottom, pos=locations_gname, with_labels=True, node_color='red', edge_color='gray', style='dashed', font_size=25)

We can see that terrorist groups have been grouped together by location on a map; e.g. the African National Congress is placed where Africa would be on this latitude/longitude map. You can start to see more of the relations between groups that attack similar areas.

In [None]:
loc_all = locations.copy()

In [None]:
loc_all

In [None]:
loc_all.update(locations_gname)
print(loc_all) 

In [None]:
# plotting countries and terrorist groups by latitude and longitude of attacks 
plt.figure(figsize=(62,58))
nx.draw(B, pos=loc_all, with_labels=True, node_color='red', edge_color='gray', style='dashed', font_size=10)

This is where we can visualise the relationships between terrorist group, country and the latitude / longitude of the attacks. For example, the connection between Abu Sayyaf Group and New People’s Army (which were two groups our classification model incorrectly predicted) shows us that both organisations performed attacks in the Philippines.

In [None]:
# Ideally I would like to take these networks a step further and look at other connections - such as attack 
# type / weapon type / target type to build up a layer of networks.