In [1]:
# Render our plots inline
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (30, 14)

In [2]:
#Read in the data set
gtd = pd.read_csv('gtdDataSet.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
gtd_vra = gtd[['eventid', 'iyear', 'imonth', 'summary', 'doubtterr', 'weaptype1', 'weaptype1_txt', 'weapsubtype1', 'weapsubtype1_txt', 'nperps', 'targtype1', 'targtype1_txt', 'targsubtype1', 'targsubtype1_txt', 'nkill']]

In [6]:
gtd_vra.shape

(201183, 15)

In [8]:
#Select for rows where the attack used either a knife or a vehicle
gtd_vra = gtd_vra[(gtd_vra['weapsubtype1_txt'] == 'Knife or Other Sharp Object') | (gtd_vra['weaptype1'] == 10)]

In [9]:
gtd_vra.shape

(3071, 15)

In [29]:
#Select for rows where the target type is one of the top 7 most frequently occurring 
gtd_vra = gtd_vra[(gtd_vra['targtype1_txt'] == 'Private Citizens & Property') | (gtd_vra['targtype1_txt'] == 'Military') | 
                  (gtd_vra['targtype1_txt'] == 'Police') | (gtd_vra['targtype1_txt'] == 'Government (General)') 
                 | (gtd_vra['targtype1_txt'] == 'Journalists & Media') | (gtd_vra['targtype1_txt'] == 'Religious Figures/Institutions') |
                   (gtd_vra['targtype1_txt'] == 'Business')] 

In [30]:
gtd_vra.shape

(1585, 15)

In [31]:
#ID baseline by checking which feature has the highest percentage when normalized.
#Note that we have calculated the baseline earlier than normal, but we're doing it here so that it's before the feature is split into dummy variables.
gtd_vra["targtype1_txt"].value_counts(normalize = True)

Private Citizens & Property       0.559621
Military                          0.142587
Police                            0.141325
Government (General)              0.077603
Religious Figures/Institutions    0.051104
Business                          0.027760
Name: targtype1_txt, dtype: float64

In [32]:
#Drop any duplicate rows, since we subsetted columns there should be no duplicates
gtd_vra.drop_duplicates(inplace=True)
gtd_vra.dropna(inplace=True)

TO DO: Write code to normalize all variables and look for outliers

In [33]:
#Inpsect the number of records after the selection
gtd_vra.shape

(1585, 15)

In [34]:
#Partition the data set without balancing. We initially started at an 80/20 split but found that 70/30 assisted
#in model performance
gtd_train1, gtd_test1 = train_test_split(gtd_vra, test_size =0.3, random_state = 1888)

In [35]:
#Check the data set to verify that no records are missing
print('GTD shape:', gtd_vra.shape)
print('Train shape: ',gtd_train1.shape)
print('Test shape: ', gtd_test1.shape)

GTD shape: (1585, 15)
Train shape:  (1109, 15)
Test shape:  (476, 15)


1238 + 531 = 1769. It appears that all records were succesfully carried over during partitioning

The baseline for our model is 50% since we see the most frequently occurring feature, Private citizens & property accounting for roughly 50.1% of occurrences.

In [36]:
#Set up our x training set
x = gtd_train1[['iyear', 'imonth', 'doubtterr', 'weaptype1_txt', 'weapsubtype1_txt', 'nperps', 'nkill']]

In [37]:
#Set up our y training set
y = gtd_train1[['targtype1_txt']]

In [38]:
#Set up our x test set
x_test = gtd_test1[['iyear', 'imonth', 'doubtterr', 'weaptype1_txt', 'weapsubtype1_txt', 'nperps', 'nkill']]

In [39]:
#Set up our y test set
y_test = gtd_test1[['targtype1_txt']]

In [40]:
#Code x training set with dummys for appropriate categorical variables
x = pd.get_dummies(x, columns=['weaptype1_txt','weapsubtype1_txt'], prefix = ['dummy','dummy'])

In [41]:
#Code x_test training set with dummys for appropriate categorical variables
x_test = pd.get_dummies(x_test, columns=['weaptype1_txt','weapsubtype1_txt'], prefix = ['dummy','dummy'])

In [42]:
# verify that the two sets do not contain the same columns
print (x_test.isin(x).all().all())
# list the columns
print(x_test.columns)
print(x.columns)

# get a list of which columns are in the training set but not testing
col_compare  = x.columns.isin(x_test)
# iterate to list missing columns 
print('Not in testing ---> ')
for index in range (0, len(col_compare)):
       if not col_compare[index]:
            print(x.columns[index])
            
# get a list of which columns are in the testing set but not training
col_compare  = x_test.columns.isin(x)
# iterate to list missing columns
print('Not in training ---> ')
for index in range (0, len(col_compare)):
       if not col_compare[index]:
            print(x_test.columns[index])

False
Index(['iyear', 'imonth', 'doubtterr', 'nperps', 'nkill', 'dummy_Melee',
       'dummy_Knife or Other Sharp Object'],
      dtype='object')
Index(['iyear', 'imonth', 'doubtterr', 'nperps', 'nkill', 'dummy_Melee',
       'dummy_Knife or Other Sharp Object'],
      dtype='object')
Not in testing ---> 
Not in training ---> 


In [43]:
#Verify that the columns now have the same shape
print(x.shape)
print(x_test.shape)

(1109, 7)
(476, 7)


In [44]:
#Create cart model using max 15 nodes
cart01 = DecisionTreeClassifier(criterion = "gini", max_leaf_nodes=15).fit(x,y)

#Create our c50 model using max 15 nodes
c50_01 = DecisionTreeClassifier(criterion="entropy",max_leaf_nodes=15).fit(x,y)

#Create our random forest model using max estimators 100
rf01 = RandomForestClassifier(n_estimators = 100,criterion="gini").fit(x,y)

  rf01 = RandomForestClassifier(n_estimators = 100,criterion="gini").fit(x,y)


In [45]:
print("Training scores:")
print(cart01.score(x,y))
print(c50_01.score(x,y))
print(rf01.score(x,y))
print("----------------")
print("Test scores:")
print(cart01.score(x_test,y_test))
print(c50_01.score(x_test,y_test))
print(rf01.score(x_test,y_test))

Training scores:
0.6681695220919748
0.6663660955816051
0.8827772768259693
----------------
Test scores:
0.6470588235294118
0.6533613445378151
0.569327731092437


In [None]:
Begin creating visualizations for report and presentation

In [None]:
#Distribution of Target for Low barrier to entry attack
gtd_vra["targtype1_txt"].value_counts().plot(kind='bar')
plt.title("Distribution of vehicle and knife attack targets")
plt.xlabel('Attack Targets')
plt.ylabel('Counts')
plt.show()