# Identifying Features Associated with Groups and First Model Attempts

## Load Data

In [1]:
# Imports
import pandas as pd
import numpy as np

# columns of interest
cols_to_extract = ['eventid', 'iyear',
 'country_txt','gname','attacktype1_txt',
 'success','suicide',
 'weaptype1_txt','weapsubtype1_txt',
 'targtype1_txt','targsubtype1_txt',
 'individual','nperps','claimed',
 'nkill','nwound',
 'property','propextent_txt',
 'ishostkid','nhostkid','hostkidoutcome_txt','ransom']

# Load data
df = pd.read_excel('GTD_0617dist/globalterrorismdb_0617dist.xlsx',
                   index='eventid',
                   usecols=cols_to_extract,
                   na_values = ['Unknown','-99','-9','Not Applicable'])

# setting index in read statement doesn't seem to work, so do it here
df.set_index('eventid',inplace=True) 

# replace unwanted _txt suffix from column names
df.columns = df.columns.str.replace('_txt','')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170350 entries, 197000000001 to 201701270001
Data columns (total 21 columns):
iyear             170350 non-null int64
country           170350 non-null object
success           170350 non-null int64
suicide           170350 non-null int64
attacktype1       163925 non-null object
targtype1         165477 non-null object
targsubtype1      161005 non-null object
gname             92044 non-null object
individual        170350 non-null int64
nperps            26173 non-null float64
claimed           102742 non-null float64
weaptype1         156498 non-null object
weapsubtype1      150924 non-null object
nkill             160668 non-null float64
nwound            155025 non-null float64
property          170350 non-null int64
propextent        41479 non-null object
ishostkid         169903 non-null float64
nhostkid          11358 non-null float64
ransom            74955 non-null float64
hostkidoutcome    6651 non-null object
dtypes: float64(

## Extract events associated with groups of interest

In [2]:
# remove events with no group affiliation
no_grp = df.gname.isnull() | df.individual
with_grp = ~no_grp
df = df[with_grp]

# don't need the 'individual' column any more
df.drop('individual',axis=1,inplace=True)

# only keep the top n groups with the most incidents
n_groups = 50 #for all groups: df['gname'].nunique()

top_grps = df['gname'].value_counts().head(n_groups).index
df = df[df.gname.isin(top_grps)]

print('Number of events affiliated with individuals or unknown group: ',sum(no_grp))
print('Number of events affiliated with a group: ',sum(with_grp))
print('Number of events affiliated with top {} groups: {}'.format(n_groups,len(df)))

Number of events affiliated with individuals or unknown group:  78620
Number of events affiliated with a group:  91730
Number of events affiliated with top 50 groups: 60221


## Modify some unwanted columns and values

In [3]:
# replace some values not correctly dealt with by pandas import
df.replace(-9,np.nan,inplace=True)
df.replace(-99,np.nan,inplace=True)

# rename year column
df.rename(columns={'iyear':'year'}, inplace=True)

# If no claimed info - treat as not claimed
df['claimed'].fillna(0,inplace=True)

# remove some values that don't give useful information
df['weaptype1'].replace('Other',np.nan,inplace=True)

df['weapsubtype1'].replace(['Unknown Gun Type', 'Unknown Explosive Type',
                                'Other Explosive Type', 'Unknown Weapon Type',
                                'Other Gun Type'], np.nan, inplace=True)

df['targtype1'].replace('Other',np.nan,inplace=True)

df['targsubtype1'].replace(['Other Personnel', 'Other (including online news agencies)', 'Other Facility'],
                               np.nan,inplace=True)


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60221 entries, 197001310001 to 201612310043
Data columns (total 20 columns):
year              60221 non-null int64
country           60221 non-null object
success           60221 non-null int64
suicide           60221 non-null int64
attacktype1       56890 non-null object
targtype1         59024 non-null object
targsubtype1      57737 non-null object
gname             60221 non-null object
nperps            10123 non-null float64
claimed           60221 non-null float64
weaptype1         53851 non-null object
weapsubtype1      27327 non-null object
nkill             55491 non-null float64
nwound            52350 non-null float64
property          53930 non-null float64
propextent        14450 non-null object
ishostkid         60070 non-null float64
nhostkid          4991 non-null float64
ransom            29995 non-null float64
hostkidoutcome    3166 non-null object
dtypes: float64(8), int64(3), object(9)
memory usage: 9.6+ MB


## Categorise some columns to reduce no. features

In [4]:
# Numeric columns - convert to values for 0, 1, 2-10, and more than 10 
conv_numeric = ['nkill','nwound','nperps','nhostkid']

for col in conv_numeric:
    df[col] = pd.cut(df[col],
                        [-0.1,0.9,1.9,10.9,max(df[col])+0.1],
                        labels=['0','1','2to10','11'])

# boolean columns - convert to yes/no to help identification in dummy variables later
conv_bool = ['success','suicide','claimed','property','ishostkid','ransom']

for col in conv_bool:
    df[col].replace({0:'no',1:'yes'},inplace=True)

# bin year in to decades
df['year'] = pd.cut(df['year'],
                        [1969.9,1979.9,1989.9,1999.9,2009.9,2019.9],
                        labels=['1970s','1980s','1990s','2000s','2010s'])  

display(df.head())

# warning message below r.e. empty bins

  if (np.diff(bins) < 0).any():


Unnamed: 0_level_0,year,country,success,suicide,attacktype1,targtype1,targsubtype1,gname,nperps,claimed,weaptype1,weapsubtype1,nkill,nwound,property,propextent,ishostkid,nhostkid,ransom,hostkidoutcome
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
197001310001,1970s,Philippines,no,no,,Military,"Military Personnel (soldiers, troops, officers...",New People's Army (NPA),,no,,,0,1.0,no,,no,,no,
197004020001,1970s,Philippines,yes,no,,Military,"Military Personnel (soldiers, troops, officers...",New People's Army (NPA),,no,,,2to10,0.0,no,,no,,no,
197004250001,1970s,Turkey,yes,no,Bombing/Explosion,Airports & Aircraft,Airline Officer/Personnel,Palestinians,,no,Explosives/Bombs/Dynamite,,0,0.0,yes,,no,,no,
197005040001,1970s,Paraguay,yes,no,Assassination,Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",Palestinians,2to10,no,Firearms,Automatic Weapon,1,1.0,no,,no,,no,
197006260001,1970s,United Kingdom,no,no,Armed Assault,Terrorists/Non-State Militia,Terrorist,Irish Republican Army (IRA),,no,Incendiary,,2to10,,yes,,no,,no,


## Create training and testing datasets

In [5]:
from sklearn.model_selection import train_test_split

# labels
y = df['gname']
display(y.head())

# features
X = df.drop('gname',axis=1)
display(X.head())

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=42)
print('Train: X shape =',X_train.shape,', Y shape=',y_train.shape)
print('Test: X shape =',X_test.shape,', Y shape=',y_test.shape)

# make some combined data frames with both labels and features. Useful later.
df_train = X_train.copy()
df_train['gname'] =  y_train

df_test = X_test.copy()
df_test['gname'] = y_test

eventid
197001310001        New People's Army (NPA)
197004020001        New People's Army (NPA)
197004250001                   Palestinians
197005040001                   Palestinians
197006260001    Irish Republican Army (IRA)
Name: gname, dtype: object

Unnamed: 0_level_0,year,country,success,suicide,attacktype1,targtype1,targsubtype1,nperps,claimed,weaptype1,weapsubtype1,nkill,nwound,property,propextent,ishostkid,nhostkid,ransom,hostkidoutcome
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
197001310001,1970s,Philippines,no,no,,Military,"Military Personnel (soldiers, troops, officers...",,no,,,0,1.0,no,,no,,no,
197004020001,1970s,Philippines,yes,no,,Military,"Military Personnel (soldiers, troops, officers...",,no,,,2to10,0.0,no,,no,,no,
197004250001,1970s,Turkey,yes,no,Bombing/Explosion,Airports & Aircraft,Airline Officer/Personnel,,no,Explosives/Bombs/Dynamite,,0,0.0,yes,,no,,no,
197005040001,1970s,Paraguay,yes,no,Assassination,Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",2to10,no,Firearms,Automatic Weapon,1,1.0,no,,no,,no,
197006260001,1970s,United Kingdom,no,no,Armed Assault,Terrorists/Non-State Militia,Terrorist,,no,Incendiary,,2to10,,yes,,no,,no,


Train: X shape = (42154, 19) , Y shape= (42154,)
Test: X shape = (18067, 19) , Y shape= (18067,)


## Create Dummy variables for Each Category Value

Gives a bool column for each unique value.

In [6]:
# labels
# do on full dataset first to make sure same columns in test and train
y_dum = pd.get_dummies(y)
y_dum_train = y_dum.loc[y_train.index]
y_dum_test = y_dum.loc[y_test.index]

display(y_dum_train.head())

# features
# do on full dataset first to make sure same columns in test and train
X_dum = pd.get_dummies(X)
X_dum_train = X_dum.loc[X_train.index]
X_dum_test = X_dum.loc[X_test.index]

# NaN values in test data can destroy predictions so remove them
X_dum_test.fillna(0,inplace=True) 

display(X_dum_train.head())

Unnamed: 0_level_0,Abu Sayyaf Group (ASG),African National Congress (South Africa),Al-Qaida in Iraq,Al-Qaida in the Arabian Peninsula (AQAP),Al-Shabaab,Algerian Islamic Extremists,Bangsamoro Islamic Freedom Movement (BIFM),Basque Fatherland and Freedom (ETA),Boko Haram,Chechen Rebels,...,Revolutionary Armed Forces of Colombia (FARC),Separatists,Shining Path (SL),Sikh Extremists,Sinai Province of the Islamic State,Taliban,Tehrik-i-Taliban Pakistan (TTP),Tripoli Province of the Islamic State,Tupac Amaru Revolutionary Movement (MRTA),United Liberation Front of Assam (ULFA)
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201606300009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
197601050002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199201160007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198508160030,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199608220002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,year_1970s,year_1980s,year_1990s,year_2000s,year_2010s,country_Afghanistan,country_Algeria,country_Angola,country_Argentina,country_Austria,...,nhostkid_2to10,nhostkid_11,ransom_no,ransom_yes,hostkidoutcome_Attempted Rescue,hostkidoutcome_Combination,hostkidoutcome_Hostage(s) escaped (not during rescue attempt),hostkidoutcome_Hostage(s) killed (not during rescue attempt),hostkidoutcome_Hostage(s) released by perpetrators,hostkidoutcome_Successful Rescue
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201606300009,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197601050002,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199201160007,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
198508160030,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199608220002,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## Calculate Characterising Values for each Group
i.e. the feature values that are most characteristic of an attack by each group

In [7]:
# no. incidents associated with each group
grp_incs = y.value_counts()

# add group name column to the data frame of dummies
df_dum_train = X_dum_train.copy()
df_dum_train['gname'] =  y_train

# loop over all columns excluding gname
col_tfidf = dict()

for col in df_dum_train.drop('gname',axis=1).columns:
    # for each group, how many times this unique value appears
    grp_cnts = df_dum_train.groupby('gname')[col].sum()
    
    # count how many groups have an incident including this unique value
    # convert this for tf-idf weight using log(n_groups/count)
    if (grp_cnts>0).sum()==0:
        print('Warning: No instances of',col)
        
    w_col = np.log(n_groups/((grp_cnts>0).sum()))
    
    # multiply w_col by no. occurences each column value to get weight for each group
    col_tfidf[col] = (grp_cnts*w_col)
    
# merge unique values for each column in to one large data frame
w_tfidf = pd.DataFrame(col_tfidf,index=top_grps)

# get rid of NaN weights
w_tfidf.fillna(0,inplace=True)

w_tfidf.info()





<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, Taliban to Lord's Resistance Army (LRA)
Columns: 312 entries, attacktype1_Armed Assault to year_2010s
dtypes: float64(312)
memory usage: 122.3+ KB


## Print Top n Features for Top m Groups

In [20]:
n_print_grp=10
n_print_feat=7

for grp in grp_incs.head(n_print_grp).index:    
    print(w_tfidf.loc[grp].sort_values(ascending=False).head(n_print_feat))
    print('----------------------------------------------')

country_Afghanistan                                     12719.429850
year_2010s                                               1345.377889
targsubtype1_NATO                                         727.409850
claimed_yes                                               608.426487
suicide_yes                                               333.222005
year_2000s                                                301.588060
weapsubtype1_Suicide (carried bodily by human being)      199.751380
Name: Taliban, dtype: float64
----------------------------------------------
country_Peru                 6258.137222
year_1980s                   1082.273053
year_1990s                    293.682636
targsubtype1_Electricity      151.565355
weapsubtype1_Dynamite/TNT      85.140424
targtype1_Utilities            79.578826
weapsubtype1_Handgun           60.527536
Name: Shining Path (SL), dtype: float64
----------------------------------------------
country_Iraq                                            6286.53859

## Model Using tf-idf Type Weights Derived Above
NB: weights above calculated on all data. Should be only on training data.

In [9]:
# subtract neg_scale*weight for features not present in event
# but present in group
neg_scale = 0.25 

# matrix multiply test events by group weights for each feature
tfidf_events = pd.DataFrame(np.inner(X_dum_test,w_tfidf),index=X_dum_test.index,columns=w_tfidf.index)

# subtract contribution of negative cases
tfidf_events = tfidf_events - neg_scale*pd.DataFrame(np.inner(X_dum_test.replace({0:1,1:0}),w_tfidf),index=X_dum_test.index,columns=w_tfidf.index)

tfidf_pred = pd.DataFrame({'gname':y_test,'pred':tfidf_events.idxmax(axis=1)})
tfidf_pred['true'] = tfidf_pred.gname == tfidf_pred.pred

# stats on accuracy of model overall and per group
print('overall accuracy',sum(tfidf_pred.true)/len(tfidf_pred))

# calculate metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

print('precision',precision_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('recall',recall_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('accuracy',accuracy_score(tfidf_pred.gname, tfidf_pred.pred))
print('confusion matrix (top 20 grps):')
cmatrix = confusion_matrix(tfidf_pred.gname, tfidf_pred.pred,labels=top_grps)        
display(pd.DataFrame(cmatrix).iloc[:20,:20])

print('grp13: ',top_grps[13])
print('grp6: ',top_grps[6])
print('----------------------------')
print('grp14: ',top_grps[14])
print('grp11: ',top_grps[11])
print('----------------------------')
print('grp19: ',top_grps[19])
print('grp17: ',top_grps[17])
print('----------------------------')

overall accuracy 0.000719543919854
precision 0.000719543919854
recall 0.000719543919854
accuracy 0.000719543919854
confusion matrix (top 20 grps):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1821,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1300,0,0
2,0,4,4,0,0,0,0,0,0,0,127,1,0,0,0,1,0,1020,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,988,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,729,0,0
5,0,0,0,0,0,0,1,0,780,0,0,0,0,0,0,0,0,20,0,0
6,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,676,2,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,268,0,467,0,0
8,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,600,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,594,0,0


grp13:  National Liberation Army of Colombia (ELN)
grp6:  Revolutionary Armed Forces of Colombia (FARC)
----------------------------
grp14:  Maoists
grp11:  Communist Party of India - Maoist (CPI-Maoist)
----------------------------
grp19:  Houthi extremists (Ansar Allah)
grp17:  Al-Qaida in the Arabian Peninsula (AQAP)
----------------------------


In [10]:
print('==========================')
print('Frequent groups with incorrect predictions')
print('==========================')
display(tfidf_pred[(~tfidf_pred.true)].gname.value_counts().head(10))

print('==========================')
print('Incorrect predictions')
print('==========================')
display(tfidf_pred[(~tfidf_pred.true)].pred.value_counts().head(10))


Frequent groups with incorrect predictions


Taliban                                             2018
Shining Path (SL)                                   1358
Islamic State of Iraq and the Levant (ISIL)         1328
Farabundo Marti National Liberation Front (FMLN)    1001
Irish Republican Army (IRA)                          806
Al-Shabaab                                           795
Revolutionary Armed Forces of Colombia (FARC)        737
New People's Army (NPA)                              735
Basque Fatherland and Freedom (ETA)                  647
Boko Haram                                           642
Name: gname, dtype: int64

Incorrect predictions


Al-Qaida in the Arabian Peninsula (AQAP)          13976
Kurdistan Workers' Party (PKK)                      897
United Liberation Front of Assam (ULFA)             778
Al-Shabaab                                          617
Irish Republican Army (IRA)                         527
Tehrik-i-Taliban Pakistan (TTP)                     434
Communist Party of India - Maoist (CPI-Maoist)      224
Death Squad                                         179
Basque Fatherland and Freedom (ETA)                 146
Bangsamoro Islamic Freedom Movement (BIFM)           76
Name: pred, dtype: int64

## Fit a Classifier to the Data

In [11]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score, accuracy_score

model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_dum_train.fillna(0), y_dum_train.fillna(0))

y_svc_pred = model.predict(X_dum_test.fillna(0))

print('Accuracy score:',accuracy_score(y_dum_test,y_svc_pred))
print('Precision score:',average_precision_score(y_dum_test,y_svc_pred))

Accuracy score: 0.857751701998
Precision score: 0.681730034938


## Investigate the Results of the Classifier

In [12]:
# extract group for each event in test data
#y_test = y_test.stack()
#y_test = y_test[y_test>0].index.get_level_values(1)

# extract prediction for each event in test data
labels_svc_pred = y_dum_test.columns[y_svc_pred.argmax(axis=1)]

# create a data frame of labels and predictions
labels_svc = pd.DataFrame({'true':y_test.values, 'pred':labels_svc_pred.values})

# was the predcition correct?
labels_svc['correct'] = labels_svc['pred']==labels_svc['true']

# labelled correctly / total events
frac_true_svc = (labels_svc.loc[labels_svc.correct,'true'].value_counts()/labels_svc['true'].value_counts()).sort_values(ascending=False)

# predicted correctly / predicted total
frac_pred_svc = (labels_svc.loc[labels_svc.correct,'pred'].value_counts()/labels_svc['pred'].value_counts()).sort_values(ascending=False)


print('-------------------------------------------')
print('Groups commonly labelled incorrectly:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Common incorrect predictions:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Groups commonly labelled correctly:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.head(5))
print('-------------------------------------------')
print('Common correct predictions:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.head(5))

-------------------------------------------
Groups commonly labelled incorrectly:
-------------------------------------------


National Liberation Army of Colombia (ELN)       281
Revolutionary Armed Forces of Colombia (FARC)    228
Tupac Amaru Revolutionary Movement (MRTA)        157
Maoists                                          135
Al-Qaida in Iraq                                 123
Name: true, dtype: int64

Tupac Amaru Revolutionary Movement (MRTA)     0.122905
Muslim extremists                             0.253247
Al-Qaida in Iraq                              0.305085
National Liberation Army of Colombia (ELN)    0.352535
United Liberation Front of Assam (ULFA)       0.363636
Name: true, dtype: float64

-------------------------------------------
Common incorrect predictions:
-------------------------------------------


Abu Sayyaf Group (ASG)                            831
Shining Path (SL)                                 165
Islamic State of Iraq and the Levant (ISIL)       153
Revolutionary Armed Forces of Colombia (FARC)     142
Communist Party of India - Maoist (CPI-Maoist)    138
Name: pred, dtype: int64

Abu Sayyaf Group (ASG)                              0.096739
Fulani extremists                                   0.582090
Movement of the Revolutionary Left (MIR) (Chile)    0.608108
Tupac Amaru Revolutionary Movement (MRTA)           0.611111
M-19 (Movement of April 19)                         0.638298
Name: pred, dtype: float64

-------------------------------------------
Groups commonly labelled correctly:
-------------------------------------------


Taliban                                             2002
Shining Path (SL)                                   1338
Islamic State of Iraq and the Levant (ISIL)         1278
Farabundo Marti National Liberation Front (FMLN)     992
Al-Shabaab                                           795
Name: true, dtype: int64

Donetsk People's Republic                                      1.000000
Al-Shabaab                                                     1.000000
Nicaraguan Democratic Force (FDN)                              0.996350
Taliban                                                        0.992071
National Union for the Total Independence of Angola (UNITA)    0.991667
Name: true, dtype: float64

-------------------------------------------
Common correct predictions:
-------------------------------------------


Taliban                                             2002
Shining Path (SL)                                   1338
Islamic State of Iraq and the Levant (ISIL)         1278
Farabundo Marti National Liberation Front (FMLN)     992
Al-Shabaab                                           795
Name: pred, dtype: int64

Algerian Islamic Extremists                 1.000000
Donetsk People's Republic                   1.000000
African National Congress (South Africa)    1.000000
Taliban                                     0.998504
Nicaraguan Democratic Force (FDN)           0.996350
Name: pred, dtype: float64

## Abu Sayyaf Group (ASG): Frequently Predicted Wrongly

A look at some of the features of ASG events, and the groups that are often mistaken for ASG

In [13]:
print('=============================')
print('Abu Sayyaf Group (ASG)')
print('=============================')
asg=df[df.gname=='Abu Sayyaf Group (ASG)']
display(asg['country'].value_counts())

print('=============================')
print('Philippines')
print('=============================')
print(df[df.country == 'Philippines']['gname'].value_counts())

print('=============================')
print('Malaysia')
print('=============================')
print(df[df.country == 'Malaysia']['gname'].value_counts())

print('================================================')
print('Regularly Mistaken for Abu Sayyaf Group (ASG)')
print('================================================')
print(labels_svc.loc[(~labels_svc.correct) & (labels_svc.pred=='Abu Sayyaf Group (ASG)')].true.value_counts().head(10))

asg_w = w_tfidf.loc['Abu Sayyaf Group (ASG)']
farc_w = w_tfidf.loc['Revolutionary Armed Forces of Colombia (FARC)']
asgfarc = pd.DataFrame({'Abu Sayyaf Group (ASG)':asg_w, 'Revolutionary Armed Forces of Colombia (FARC)':farc_w})
asgfarc['diff'] = asgfarc['Abu Sayyaf Group (ASG)']-asgfarc['Revolutionary Armed Forces of Colombia (FARC)']

print('================================================')
print('Features More Common in ASG than FARC')
print('================================================')
display(asgfarc.sort_values('diff').tail(5).sort_values(by='diff',ascending=False))

print('=========================================================================')
print('Features More Common FARC than ASG')
print('=========================================================================')
display(asgfarc.sort_values('diff').head(5))

print('=========================================================================')
print('Features Similar in Both')
print('=========================================================================')
inboth = (asgfarc['Abu Sayyaf Group (ASG)']>0.005) & (asgfarc['Revolutionary Armed Forces of Colombia (FARC)']>0.005)
display(abs(asgfarc.loc[inboth]).sort_values('diff'))


Abu Sayyaf Group (ASG)


Philippines    451
Malaysia        19
Name: country, dtype: int64

Philippines
New People's Army (NPA)                        2412
Abu Sayyaf Group (ASG)                          451
Moro Islamic Liberation Front (MILF)            363
Bangsamoro Islamic Freedom Movement (BIFM)      320
Muslim extremists                                 8
Islamic State of Iraq and the Levant (ISIL)       8
Death Squad                                       2
Name: gname, dtype: int64
Malaysia
Abu Sayyaf Group (ASG)                         19
Islamic State of Iraq and the Levant (ISIL)     1
Name: gname, dtype: int64
Regularly Mistaken for Abu Sayyaf Group (ASG)
Revolutionary Armed Forces of Colombia (FARC)     137
National Liberation Army of Colombia (ELN)        135
M-19 (Movement of April 19)                        63
New People's Army (NPA)                            49
Narco-Terrorists                                   44
Bangsamoro Islamic Freedom Movement (BIFM)         34
Communist Party of India - Maoist (CPI-Maoist)     33
Muslim extremists                      

Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
country_Philippines,609.494985,0.0,609.494985
country_Malaysia,48.283137,0.0,48.283137
targsubtype1_Commercial Maritime,21.990978,0.0,21.990978
targtype1_Maritime,14.74751,2.602502,12.145008
ransom_yes,20.440111,13.498187,6.941925


Features More Common FARC than ASG


Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
country_Colombia,0.0,3174.031095,-3174.031095
year_1980s,0.0,175.439139,-175.439139
year_1990s,9.526618,145.198798,-135.67218
year_2000s,32.781311,154.264992,-121.483681
year_2010s,75.258413,163.000449,-87.742036


Features Similar in Both


Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
targsubtype1_Memorial/Cemetery/Monument,1.139434,1.139434,0.000000
targsubtype1_Affiliated Institution,1.021651,1.021651,0.000000
targsubtype1_Bus Station/Stop,0.544727,0.544727,0.000000
targsubtype1_Student,0.967584,0.967584,0.000000
targtype1_NGO,3.080931,3.080931,0.000000
targsubtype1_Tourist,4.581454,4.581454,0.000000
weapsubtype1_Pressure Trigger,1.427116,1.427116,0.000000
targsubtype1_Port,1.347074,1.347074,0.000000
targtype1_Religious Figures/Institutions,1.475047,1.053605,0.421442
targtype1_Educational Institution,0.734796,0.285754,0.449042


## Countries of ASG Predicted Events

ASG often predicted for events in countries they were never active in. Suggests country should be weighted much more heavily? Try much simpler model using only year, latitude, longitude?

In [14]:
# merge the predicted labels columns to the test_events df
test_events_svc = pd.merge(df_test, 
                       pd.DataFrame({'pred':labels_svc_pred},index=df_test.index),
                       left_index=True,right_index=True)

test_events_svc['true'] = test_events_svc['gname']==test_events_svc['pred']

print('======================================================================')
print('Countries of Events Incorrectly Predicted as Abu Sayyaf Group (ASG)')
print('======================================================================')
print(test_events_svc[(test_events_svc.pred=='Abu Sayyaf Group (ASG)') & (~test_events_svc.true)]['country'].unique())
print('======================================================================')
print('Countries Where Abu Sayyaf Group (ASG) Carried Out Attacks')
print('======================================================================')
print(df[df.gname=='Abu Sayyaf Group (ASG)']['country'].unique())

Countries of Events Incorrectly Predicted as Abu Sayyaf Group (ASG)
['Colombia' 'Panama' 'Philippines' 'Yemen' 'France' 'India' 'Algeria'
 'Libya' 'Israel' 'United Kingdom' 'Turkey' 'Russia' 'Peru' 'Ecuador'
 'Sri Lanka' 'Central African Republic' 'West Bank and Gaza Strip' 'Greece'
 'Brazil' 'Egypt' 'El Salvador' 'Zambia' 'Saudi Arabia' 'Honduras'
 'Nigeria' 'Bangladesh' 'Italy' 'Austria' 'Iraq' 'Kuwait' 'Belgium'
 'South Sudan' 'Guatemala' 'Canada' 'Denmark' 'Guadeloupe' 'Netherlands'
 'Lebanon' 'Bulgaria' 'Tunisia' 'Iran' 'Pakistan' 'Mali' 'Bolivia' 'Chile'
 'Georgia' 'Syria' 'Costa Rica' 'Kosovo' 'South Yemen' 'Chad' 'Botswana']
Countries Where Abu Sayyaf Group (ASG) Carried Out Attacks
['Philippines' 'Malaysia']


## Naive Bayes - train

In [15]:
# probability each feature value
p_x = X_dum_train.sum()/len(X_dum_train)
display(p_x.sort_values(ascending=False).head())

# probability each group y
grp_incs_train = y_train.value_counts()
p_y = grp_incs_train/len(X_dum_train)
display(p_y.head())

# probability each feature value, given group y
p_xgy = df_dum_train.groupby('gname').sum()
p_xgy = p_xgy.divide(grp_incs_train, axis=0)
display(p_xgy.head())

suicide_no      0.952579
success_yes     0.916568
ishostkid_no    0.905964
claimed_no      0.839422
nwound_0        0.564359
dtype: float64

Taliban                                             0.108104
Shining Path (SL)                                   0.075746
Islamic State of Iraq and the Levant (ISIL)         0.070100
Farabundo Marti National Liberation Front (FMLN)    0.055748
Al-Shabaab                                          0.044788
Name: gname, dtype: float64

Unnamed: 0,year_1970s,year_1980s,year_1990s,year_2000s,year_2010s,country_Afghanistan,country_Algeria,country_Angola,country_Argentina,country_Austria,...,nhostkid_2to10,nhostkid_11,ransom_no,ransom_yes,hostkidoutcome_Attempted Rescue,hostkidoutcome_Combination,hostkidoutcome_Hostage(s) escaped (not during rescue attempt),hostkidoutcome_Hostage(s) killed (not during rescue attempt),hostkidoutcome_Hostage(s) released by perpetrators,hostkidoutcome_Successful Rescue
Abu Sayyaf Group (ASG),0.0,0.0,0.089231,0.261538,0.649231,0.0,0.0,0.0,0.0,0.0,...,0.181538,0.033846,0.16,0.163077,0.0,0.076923,0.012308,0.021538,0.156923,0.030769
African National Congress (South Africa),0.050228,0.90411,0.045662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Al-Qaida in Iraq,0.0,0.0,0.0,0.291939,0.708061,0.0,0.0,0.0,0.0,0.0,...,0.019608,0.006536,0.026144,0.002179,0.0,0.002179,0.0,0.010893,0.006536,0.0
Al-Qaida in the Arabian Peninsula (AQAP),0.0,0.0,0.0,0.007163,0.992837,0.0,0.0,0.0,0.0,0.0,...,0.024355,0.017192,0.067335,0.004298,0.002865,0.007163,0.0,0.017192,0.020057,0.002865
Al-Shabaab,0.0,0.0,0.0,0.029131,0.970869,0.0,0.0,0.0,0.0,0.0,...,0.048199,0.019068,0.081568,0.003178,0.0,0.006886,0.00053,0.034958,0.019597,0.003708


## Naive Bayes - Test

In [16]:
# series to store results in
y_bayes = pd.Series(index=X_dum_test.index)

for event in X_dum_test.index:
    # likelihood for each feature given each group
    probs = X_dum_test.loc[event]*p_xgy
    
    # multiply likelihoods for each feature
    probs=probs.T
    probs=probs[probs.sum(axis=1)>0]
    probs=probs.product().multiply(p_y)
    
    # normalise
    #probs = probs/probs.sum()
    
    # store group with max likelihood
    y_bayes[event]=probs.idxmax(axis=1)
    
df_bayes = pd.DataFrame({'gname':y_test,'pred':y_bayes})
df_bayes['true'] = df_bayes.gname == df_bayes.pred

print('overall accuracy',sum(df_bayes.true)/len(df_bayes))


overall accuracy 0.82454198262


In [17]:
print('precision',precision_score(df_bayes.gname, df_bayes.pred, labels=top_grps,average='micro'))
print('recall',recall_score(df_bayes.gname, df_bayes.pred, labels=top_grps,average='micro'))
print('accuracy',accuracy_score(df_bayes.gname, df_bayes.pred))
print('confusion matrix (top 20 grps):')
cmatrix = confusion_matrix(df_bayes.gname, df_bayes.pred,labels=top_grps)        
display(pd.DataFrame(cmatrix).iloc[:20,:20])

print('grp13: ',top_grps[13])
print('grp6: ',top_grps[6])
print('----------------------------')
print('grp14: ',top_grps[14])
print('grp11: ',top_grps[11])
print('----------------------------')
print('grp19: ',top_grps[19])
print('grp17: ',top_grps[17])
print('----------------------------')

precision 0.82454198262
recall 0.82454198262
accuracy 0.82454198262
confusion matrix (top 20 grps):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0
1,0,1046,0,0,0,0,7,0,0,0,0,0,0,0,11,0,0,0,0,0
2,0,0,1127,0,1,0,0,1,21,0,0,0,0,0,0,0,0,2,0,0
3,0,0,0,965,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0
4,0,0,0,0,784,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,715,0,0,0,0,0,0,0,0,0,0,7,0,0,0
6,0,4,0,0,0,0,368,0,0,0,0,0,0,189,0,0,0,0,0,0
7,0,0,0,0,0,0,0,596,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,23,0,0,2,0,0,588,0,2,0,0,0,0,0,5,0,0,0
9,0,0,0,0,0,0,0,0,0,521,0,0,0,0,0,0,0,0,0,0


grp13:  National Liberation Army of Colombia (ELN)
grp6:  Revolutionary Armed Forces of Colombia (FARC)
----------------------------
grp14:  Maoists
grp11:  Communist Party of India - Maoist (CPI-Maoist)
----------------------------
grp19:  Houthi extremists (Ansar Allah)
grp17:  Al-Qaida in the Arabian Peninsula (AQAP)
----------------------------


## sklearn Naive Bayes

In [45]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_dum_train.fillna(0), y_train)

y_NBgaus = model.predict(X_dum_test.fillna(0))

print('score:',model.score(X_dum_test.fillna(0),y_dum_test))

score: 0.857751701998


In [43]:
labels_NBgaus_pred = y_dum_test.columns[y_NBgaus.argmax(axis=1)]

# create a data frame of labels and predictions
labels_NBgaus = pd.DataFrame({'true':y_test.values, 'pred':labels_NBgaus_pred.values},index=y_test.index)

# was the predcition correct?
labels_NBgaus['correct'] = labels_NBgaus['pred']==labels_NBgaus['true']

# labelled correctly / total events
frac_true_NBgaus = (labels_NBgaus.loc[labels_NBgaus.correct,'true'].value_counts()/labels_NBgaus['true'].value_counts()).sort_values(ascending=False)

# predicted correctly / predicted total
frac_pred_NBgaus = (labels_NBgaus.loc[labels_NBgaus.correct,'pred'].value_counts()/labels_NBgaus['pred'].value_counts()).sort_values(ascending=False)


print('-------------------------------------------')
print('Groups commonly labelled incorrectly:')
print('-------------------------------------------')
display(labels_NBgaus.loc[~labels_NBgaus.correct,'true'].value_counts().head(5))
display(frac_true_NBgaus.tail(5).sort_values())
print('-------------------------------------------')
print('Common incorrect predictions:')
print('-------------------------------------------')
display(labels_NBgaus.loc[~labels_NBgaus.correct,'pred'].value_counts().head(5))
display(frac_pred_NBgaus.tail(5).sort_values())
print('-------------------------------------------')
print('Groups commonly labelled correctly:')
print('-------------------------------------------')
display(labels_NBgaus.loc[labels_NBgaus.correct,'true'].value_counts().head(5))
display(frac_true_NBgaus.head(5))
print('-------------------------------------------')
print('Common correct predictions:')
print('-------------------------------------------')
display(labels_NBgaus.loc[labels_NBgaus.correct,'pred'].value_counts().head(5))
display(frac_pred_NBgaus.head(5))


-------------------------------------------
Groups commonly labelled incorrectly:
-------------------------------------------


National Liberation Army of Colombia (ELN)       281
Revolutionary Armed Forces of Colombia (FARC)    228
Tupac Amaru Revolutionary Movement (MRTA)        157
Maoists                                          135
Al-Qaida in Iraq                                 123
Name: true, dtype: int64

Tupac Amaru Revolutionary Movement (MRTA)     0.122905
Muslim extremists                             0.253247
Al-Qaida in Iraq                              0.305085
National Liberation Army of Colombia (ELN)    0.352535
United Liberation Front of Assam (ULFA)       0.363636
Name: true, dtype: float64

-------------------------------------------
Common incorrect predictions:
-------------------------------------------


Abu Sayyaf Group (ASG)                            831
Shining Path (SL)                                 165
Islamic State of Iraq and the Levant (ISIL)       153
Revolutionary Armed Forces of Colombia (FARC)     142
Communist Party of India - Maoist (CPI-Maoist)    138
Name: pred, dtype: int64

Abu Sayyaf Group (ASG)                              0.096739
Fulani extremists                                   0.582090
Movement of the Revolutionary Left (MIR) (Chile)    0.608108
Tupac Amaru Revolutionary Movement (MRTA)           0.611111
M-19 (Movement of April 19)                         0.638298
Name: pred, dtype: float64

-------------------------------------------
Groups commonly labelled correctly:
-------------------------------------------


Taliban                                             2002
Shining Path (SL)                                   1338
Islamic State of Iraq and the Levant (ISIL)         1278
Farabundo Marti National Liberation Front (FMLN)     992
Al-Shabaab                                           795
Name: true, dtype: int64

Donetsk People's Republic                                      1.000000
Al-Shabaab                                                     1.000000
Nicaraguan Democratic Force (FDN)                              0.996350
Taliban                                                        0.992071
National Union for the Total Independence of Angola (UNITA)    0.991667
Name: true, dtype: float64

-------------------------------------------
Common correct predictions:
-------------------------------------------


Taliban                                             2002
Shining Path (SL)                                   1338
Islamic State of Iraq and the Levant (ISIL)         1278
Farabundo Marti National Liberation Front (FMLN)     992
Al-Shabaab                                           795
Name: pred, dtype: int64

Algerian Islamic Extremists                 1.000000
Donetsk People's Republic                   1.000000
African National Congress (South Africa)    1.000000
Taliban                                     0.998504
Nicaraguan Democratic Force (FDN)           0.996350
Name: pred, dtype: float64