# Identifying Features Associated with Groups and First Model Attempts

## Load Data

In [1]:
# Imports
import pandas as pd
import numpy as np

# columns of interest
cols_to_extract = ['eventid', 'iyear',
 'country_txt','gname','attacktype1_txt',
 'success','suicide',
 'weaptype1_txt','weapsubtype1_txt',
 'targtype1_txt','targsubtype1_txt',
 'individual','nperps','claimed',
 'nkill','nwound',
 'property','propextent_txt',
 'ishostkid','nhostkid','hostkidoutcome_txt','ransom']

# Load data
df = pd.read_excel('GTD_0617dist/globalterrorismdb_0617dist.xlsx',
                   index='eventid',
                   usecols=cols_to_extract,
                   na_values = ['Unknown','-99','-9','Not Applicable'])

# setting index in read statement doesn't seem to work, so do it here
df.set_index('eventid',inplace=True) 

# replace unwanted _txt suffix from column names
df.columns = df.columns.str.replace('_txt','')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170350 entries, 197000000001 to 201701270001
Data columns (total 21 columns):
iyear             170350 non-null int64
country           170350 non-null object
success           170350 non-null int64
suicide           170350 non-null int64
attacktype1       163925 non-null object
targtype1         165477 non-null object
targsubtype1      161005 non-null object
gname             92044 non-null object
individual        170350 non-null int64
nperps            26173 non-null float64
claimed           102742 non-null float64
weaptype1         156498 non-null object
weapsubtype1      150924 non-null object
nkill             160668 non-null float64
nwound            155025 non-null float64
property          170350 non-null int64
propextent        41479 non-null object
ishostkid         169903 non-null float64
nhostkid          11358 non-null float64
ransom            74955 non-null float64
hostkidoutcome    6651 non-null object
dtypes: float64(

## Extract events associated with groups of interest

In [2]:
print('Total no. events: ', len(df))

# remove events with no group affiliation
no_grp = df.gname.isnull() | df.individual
with_grp = ~no_grp
print('Number of events affiliated with a group: ',sum(with_grp))
df = df[with_grp]

# don't need the 'individual' column any more
df.drop('individual',axis=1,inplace=True)

# only keep the top n groups with the most incidents
n_groups = df['gname'].nunique()#50 #for all groups: df['gname'].nunique()
top_grps = df['gname'].value_counts().head(n_groups)
print('Number of events affiliated with top {} groups: {}'.format(n_groups,sum(top_grps)))

# only keep groups with at least 10 incidents
top_grps = top_grps[top_grps>10].index

# extract events with group of interest
df = df[df.gname.isin(top_grps)]
print('Number of events after excluding groups with less than 10 incidents: ', len(df))
print('Final no. groups: ',df['gname'].nunique())

Total no. events:  170350
Number of events affiliated with a group:  91730
Number of events affiliated with top 3434 groups: 91730
Number of events after excluding groups with less than 10 incidents:  85225
Final no. groups:  519


## Modify some unwanted columns and values

In [3]:
# replace some values not correctly dealt with by pandas import
df.replace(-9,np.nan,inplace=True)
df.replace(-99,np.nan,inplace=True)

# rename year column
df.rename(columns={'iyear':'year'}, inplace=True)

# If no claimed info - treat as not claimed
df['claimed'].fillna(0,inplace=True)

# remove some values that don't give useful information
df['weaptype1'].replace('Other',np.nan,inplace=True)

df['weapsubtype1'].replace(['Unknown Gun Type', 'Unknown Explosive Type',
                                'Other Explosive Type', 'Unknown Weapon Type',
                                'Other Gun Type'], np.nan, inplace=True)

df['targtype1'].replace('Other',np.nan,inplace=True)

df['targsubtype1'].replace(['Other Personnel', 'Other (including online news agencies)', 'Other Facility'],
                               np.nan,inplace=True)


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85225 entries, 197000000002 to 201701270001
Data columns (total 20 columns):
year              85225 non-null int64
country           85225 non-null object
success           85225 non-null int64
suicide           85225 non-null int64
attacktype1       80602 non-null object
targtype1         83479 non-null object
targsubtype1      81416 non-null object
gname             85225 non-null object
nperps            14709 non-null float64
claimed           85225 non-null float64
weaptype1         76148 non-null object
weapsubtype1      37993 non-null object
nkill             79081 non-null float64
nwound            74808 non-null float64
property          76903 non-null float64
propextent        19925 non-null object
ishostkid         84922 non-null float64
nhostkid          7126 non-null float64
ransom            45336 non-null float64
hostkidoutcome    4391 non-null object
dtypes: float64(8), int64(3), object(9)
memory usage: 13.7+ MB


## Categorise some columns to reduce no. features

In [4]:
# Numeric columns - convert to values for 0, 1, 2-10, and more than 10 
conv_numeric = ['nkill','nwound','nperps','nhostkid']

for col in conv_numeric:
    df[col] = pd.cut(df[col],
                        [-0.1,0.9,1.9,10.9,max(df[col])+0.1],
                        labels=['0','1','2to10','11+'])

# boolean columns - convert to yes/no to help identification in dummy variables later
conv_bool = ['success','suicide','claimed','property','ishostkid','ransom']

for col in conv_bool:
    df[col].replace({0:'no',1:'yes'},inplace=True)

# bin year in to decades
df['year'] = pd.cut(df['year'],
                        [1969.9,1979.9,1989.9,1999.9,2009.9,2019.9],
                        labels=['1970s','1980s','1990s','2000s','2010s'])  

display(df.head())

# warning message below r.e. empty bins

Unnamed: 0_level_0,year,country,success,suicide,attacktype1,targtype1,targsubtype1,gname,nperps,claimed,weaptype1,weapsubtype1,nkill,nwound,property,propextent,ishostkid,nhostkid,ransom,hostkidoutcome
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
197000000002,1970s,Mexico,yes,no,Hostage Taking (Kidnapping),Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",23rd of September Communist League,2to10,no,,,0,0,no,,yes,1.0,yes,
197001010002,1970s,United States,yes,no,Armed Assault,Police,"Police Building (headquarters, station, school)",Black Nationalists,,no,Firearms,,0,0,yes,Minor (likely < $1 million),no,,no,
197001020001,1970s,Uruguay,no,no,Assassination,Police,Police Security Forces/Officers,Tupamaros (Uruguay),2to10,no,Firearms,Automatic Weapon,0,0,no,,no,,no,
197001050001,1970s,United States,no,no,Bombing/Explosion,Military,Military Barracks/Base/Headquarters/Checkpost,"Weather Underground, Weathermen",,no,Explosives/Bombs/Dynamite,,0,0,no,Minor (likely < $1 million),no,,no,
197001060001,1970s,United States,yes,no,Facility/Infrastructure Attack,Military,Military Recruiting Station/Academy,Left-Wing Militants,,no,Incendiary,Molotov Cocktail/Petrol Bomb,0,0,yes,Minor (likely < $1 million),no,,no,


## Create training and testing datasets

In [5]:
from sklearn.model_selection import train_test_split

# labels
y = df['gname']
display(y.head())

# features
X = df.drop('gname',axis=1)
display(X.head())

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=42)
print('Train: X shape =',X_train.shape,', Y shape=',y_train.shape)
print('Test: X shape =',X_test.shape,', Y shape=',y_test.shape)

# make some combined data frames with both labels and features. Useful later.
df_train = X_train.copy()
df_train['gname'] =  y_train

df_test = X_test.copy()
df_test['gname'] = y_test

eventid
197000000002    23rd of September Communist League
197001010002                    Black Nationalists
197001020001                   Tupamaros (Uruguay)
197001050001       Weather Underground, Weathermen
197001060001                   Left-Wing Militants
Name: gname, dtype: object

Unnamed: 0_level_0,year,country,success,suicide,attacktype1,targtype1,targsubtype1,nperps,claimed,weaptype1,weapsubtype1,nkill,nwound,property,propextent,ishostkid,nhostkid,ransom,hostkidoutcome
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
197000000002,1970s,Mexico,yes,no,Hostage Taking (Kidnapping),Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",2to10,no,,,0,0,no,,yes,1.0,yes,
197001010002,1970s,United States,yes,no,Armed Assault,Police,"Police Building (headquarters, station, school)",,no,Firearms,,0,0,yes,Minor (likely < $1 million),no,,no,
197001020001,1970s,Uruguay,no,no,Assassination,Police,Police Security Forces/Officers,2to10,no,Firearms,Automatic Weapon,0,0,no,,no,,no,
197001050001,1970s,United States,no,no,Bombing/Explosion,Military,Military Barracks/Base/Headquarters/Checkpost,,no,Explosives/Bombs/Dynamite,,0,0,no,Minor (likely < $1 million),no,,no,
197001060001,1970s,United States,yes,no,Facility/Infrastructure Attack,Military,Military Recruiting Station/Academy,,no,Incendiary,Molotov Cocktail/Petrol Bomb,0,0,yes,Minor (likely < $1 million),no,,no,


Train: X shape = (59657, 19) , Y shape= (59657,)
Test: X shape = (25568, 19) , Y shape= (25568,)


## Create Dummy variables for Each Category Value

Gives a bool column for each unique value.

In [6]:
# labels
# do on full dataset first to make sure same columns in test and train
y_dum = pd.get_dummies(y)
y_dum_train = y_dum.loc[y_train.index]
y_dum_test = y_dum.loc[y_test.index]

display(y_dum_train.head())

# features
# do on full dataset first to make sure same columns in test and train
X_dum = pd.get_dummies(X)
X_dum_train = X_dum.loc[X_train.index]
X_dum_test = X_dum.loc[X_test.index]

# NaN values in test data can destroy predictions so remove them
X_dum_test.fillna(0,inplace=True) 

display(X_dum_train.head())

Unnamed: 0_level_0,16 January Organization for the Liberation of Tripoli,20 December Movement (M-20),23rd of September Communist League,2nd of June Movement,31 January People's Front (FP-31),9 February,Abdullah Azzam Brigades,Abu Hafs al-Masri Brigades,Abu Nidal Organization (ANO),Abu Sayyaf Group (ASG),...,"Weather Underground, Weathermen",White Wolves,White extremists,Workers' Self-Defense Movement (MAO),Young Communist League,Youths,Zapatista National Liberation Army,Zebra killers,Zimbabwe African Nationalist Union (ZANU),Zimbabwe African People's Union
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201611290005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199210300002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198902070002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199907190002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200711190003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,year_1970s,year_1980s,year_1990s,year_2000s,year_2010s,country_Afghanistan,country_Algeria,country_Andorra,country_Angola,country_Argentina,...,nhostkid_2to10,nhostkid_11+,ransom_no,ransom_yes,hostkidoutcome_Attempted Rescue,hostkidoutcome_Combination,hostkidoutcome_Hostage(s) escaped (not during rescue attempt),hostkidoutcome_Hostage(s) killed (not during rescue attempt),hostkidoutcome_Hostage(s) released by perpetrators,hostkidoutcome_Successful Rescue
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201611290005,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199210300002,0,0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
198902070002,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199907190002,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200711190003,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Calculate Characterising Values for each Group
i.e. the feature values that are most characteristic of an attack by each group

In [7]:
# no. incidents associated with each group
grp_incs = y.value_counts()

# add group name column to the data frame of dummies
df_dum_train = X_dum_train.copy()
df_dum_train['gname'] =  y_train

# loop over all columns excluding gname
col_tfidf = dict()

for col in df_dum_train.drop('gname',axis=1).columns:
    # for each group, how many times this unique value appears
    grp_cnts = df_dum_train.groupby('gname')[col].sum()
    
    # count how many groups have an incident including this unique value
    # convert this for tf-idf weight using log(n_groups/count)
    if (grp_cnts>0).sum()==0:
        print('Warning: No instances of',col)
        
    w_col = np.log(n_groups/((grp_cnts>0).sum()))
    
    # multiply w_col by no. occurences each column value to get weight for each group
    col_tfidf[col] = (grp_cnts*w_col)
    
# merge unique values for each column in to one large data frame
w_tfidf = pd.DataFrame(col_tfidf,
                       index=top_grps,
                       columns=df_dum_train.drop('gname',axis=1).columns) #ensures columns in same order

# get rid of NaN weights
w_tfidf.fillna(0,inplace=True)

w_tfidf.info()





<class 'pandas.core.frame.DataFrame'>
Index: 519 entries, Taliban to Orange Volunteers (OV)
Columns: 367 entries, year_1970s to hostkidoutcome_Successful Rescue
dtypes: float64(367)
memory usage: 1.5+ MB


## Print Top n Features for Top m Groups

In [8]:
n_print_grp=10
n_print_feat=5

for grp in grp_incs.head(n_print_grp).index:    
    print(w_tfidf.loc[grp].sort_values(ascending=False).head(n_print_feat))
    print('----------------------------------------------')

country_Afghanistan    23727.047569
year_2010s             10172.579676
suicide_no              7953.230259
success_yes             7936.758333
ishostkid_no            7793.003893
Name: Taliban, dtype: float64
----------------------------------------------
country_Peru    16598.756445
ransom_no        6298.127511
year_1980s       6152.028534
claimed_no       6117.600064
suicide_no       5993.738746
Name: Shining Path (SL), dtype: float64
----------------------------------------------
country_Iraq    12904.910749
year_2010s       8035.305061
success_yes      5061.669519
ishostkid_no     5052.211990
suicide_no       4391.377316
Name: Islamic State of Iraq and the Levant (ISIL), dtype: float64
----------------------------------------------
country_El Salvador    11943.282890
year_1980s              5006.646461
ransom_no               4683.893106
claimed_no              4507.501848
suicide_no              4406.493933
Name: Farabundo Marti National Liberation Front (FMLN), dtype: float64
--

## Model Using tf-idf Type Weights Derived Above

In [9]:
# subtract neg_scale*weight for features not present in event
# but present in group
neg_scale = 0.25 

# matrix multiply test events by group weights for each feature
tfidf_events = pd.DataFrame(np.inner(X_dum_test,w_tfidf),index=X_dum_test.index,columns=w_tfidf.index)

# subtract contribution of negative cases
tfidf_events = tfidf_events - neg_scale*pd.DataFrame(np.inner(X_dum_test.replace({0:1,1:0}),w_tfidf),index=X_dum_test.index,columns=w_tfidf.index)

tfidf_pred = pd.DataFrame({'gname':y_test,'pred':tfidf_events.idxmax(axis=1)})
tfidf_pred['true'] = tfidf_pred.gname == tfidf_pred.pred

# stats on accuracy of model overall and per group
print('overall accuracy',sum(tfidf_pred.true)/len(tfidf_pred))

# calculate metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

print('precision',precision_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('recall',recall_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('accuracy',accuracy_score(tfidf_pred.gname, tfidf_pred.pred))
print('confusion matrix (top 20 grps):')
cmatrix = confusion_matrix(tfidf_pred.gname, tfidf_pred.pred,labels=top_grps)        
display(pd.DataFrame(cmatrix).iloc[:20,:20])

print('grp13: ',top_grps[13])
print('grp6: ',top_grps[6])
print('----------------------------')
print('grp14: ',top_grps[14])
print('grp11: ',top_grps[11])
print('----------------------------')
print('grp19: ',top_grps[19])
print('grp17: ',top_grps[17])
print('----------------------------')

overall accuracy 0.245697747184
precision 0.245697747184
recall 0.245697747184
accuracy 0.245697747184
confusion matrix (top 20 grps):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1929,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1377,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,156,2,1095,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,10,0,1009,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,582,3,13,0,216,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,2,408,0,0,0,361,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,271,353,1,2,0,0,144,0,0,0,0,0,0,0,0,0,0,0,0,0
7,367,263,0,0,0,0,0,70,0,0,0,0,0,0,0,0,0,0,0,0
8,308,334,7,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0
9,555,4,59,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0


grp13:  National Liberation Army of Colombia (ELN)
grp6:  Revolutionary Armed Forces of Colombia (FARC)
----------------------------
grp14:  Maoists
grp11:  Communist Party of India - Maoist (CPI-Maoist)
----------------------------
grp19:  Houthi extremists (Ansar Allah)
grp17:  Al-Qaida in the Arabian Peninsula (AQAP)
----------------------------


In [10]:
print('==========================')
print('Frequent groups with incorrect predictions')
print('==========================')
display(tfidf_pred[(~tfidf_pred.true)].gname.value_counts().head(10))

print('==========================')
print('Incorrect predictions')
print('==========================')
display(tfidf_pred[(~tfidf_pred.true)].pred.value_counts().head(10))


Frequent groups with incorrect predictions


Kurdistan Workers' Party (PKK)                    649
New People's Army (NPA)                           630
Revolutionary Armed Forces of Colombia (FARC)     627
Boko Haram                                        618
Basque Fatherland and Freedom (ETA)               614
Al-Shabaab                                        598
Communist Party of India - Maoist (CPI-Maoist)    507
Liberation Tigers of Tamil Eelam (LTTE)           473
National Liberation Army of Colombia (ELN)        451
Irish Republican Army (IRA)                       410
Name: gname, dtype: int64

Incorrect predictions


Shining Path (SL)                                   9690
Taliban                                             8300
Islamic State of Iraq and the Levant (ISIL)          610
Irish Republican Army (IRA)                          307
Farabundo Marti National Liberation Front (FMLN)     171
Revolutionary Armed Forces of Colombia (FARC)         95
New People's Army (NPA)                               73
Communist Party of India - Maoist (CPI-Maoist)        16
Basque Fatherland and Freedom (ETA)                    9
Al-Shabaab                                             7
Name: pred, dtype: int64

## Fit a Classifier to the Data

In [11]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score, accuracy_score

model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_dum_train.fillna(0), y_dum_train.fillna(0))

y_svc_pred = model.predict(X_dum_test.fillna(0))

print('Accuracy score:',accuracy_score(y_dum_test,y_svc_pred))
print('Precision score:',average_precision_score(y_dum_test,y_svc_pred))

Accuracy score: 0.627816020025


  recall = tps / tps[-1]


Precision score: nan


## Investigate the Results of the Classifier

In [12]:
# extract group for each event in test data
#y_test = y_test.stack()
#y_test = y_test[y_test>0].index.get_level_values(1)

# extract prediction for each event in test data
labels_svc_pred = y_dum_test.columns[y_svc_pred.argmax(axis=1)]

# create a data frame of labels and predictions
labels_svc = pd.DataFrame({'true':y_test.values, 'pred':labels_svc_pred.values})

# was the predcition correct?
labels_svc['correct'] = labels_svc['pred']==labels_svc['true']

# labelled correctly / total events
frac_true_svc = (labels_svc.loc[labels_svc.correct,'true'].value_counts()/labels_svc['true'].value_counts()).sort_values(ascending=False)

# predicted correctly / predicted total
frac_pred_svc = (labels_svc.loc[labels_svc.correct,'pred'].value_counts()/labels_svc['pred'].value_counts()).sort_values(ascending=False)


print('-------------------------------------------')
print('Groups commonly labelled incorrectly:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Common incorrect predictions:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Groups commonly labelled correctly:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.head(5))
print('-------------------------------------------')
print('Common correct predictions:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.head(5))

-------------------------------------------
Groups commonly labelled incorrectly:
-------------------------------------------


Revolutionary Armed Forces of Colombia (FARC)     348
National Liberation Army of Colombia (ELN)        345
Maoists                                           244
Communist Party of India - Maoist (CPI-Maoist)    219
Al-Qaida in Iraq                                  176
Name: true, dtype: int64

United Popular Action Front (FAPU)   NaN
Weather Underground, Weathermen      NaN
White Wolves                         NaN
Youths                               NaN
Zimbabwe African People's Union      NaN
Name: true, dtype: float64

-------------------------------------------
Common incorrect predictions:
-------------------------------------------


16 January Organization for the Liberation of Tripoli    5552
Islamic State of Iraq and the Levant (ISIL)               215
Revolutionary Armed Forces of Colombia (FARC)             162
Communist Party of India - Maoist (CPI-Maoist)            162
Shining Path (SL)                                         161
Name: pred, dtype: int64

Sunni Muslim extremists                                  NaN
Supreme Council for Islamic Revolution in Iraq (SCIRI)   NaN
Tripura National Volunteers (TNV)                        NaN
United Democratic Liberation Army (UDLA)                 NaN
Zimbabwe African People's Union                          NaN
Name: pred, dtype: float64

-------------------------------------------
Groups commonly labelled correctly:
-------------------------------------------


Taliban                                             1907
Shining Path (SL)                                   1363
Islamic State of Iraq and the Levant (ISIL)         1111
Farabundo Marti National Liberation Front (FMLN)     995
Al-Shabaab                                           803
Name: true, dtype: int64

16 January Organization for the Liberation of Tripoli    1.0
Paraguayan People's Army (EPP)                           1.0
Maximiliano Gomez Revolutionary Brigade                  1.0
Tupamaros (Uruguay)                                      1.0
Tupac Katari Guerrilla Army (EGTK)                       1.0
Name: true, dtype: float64

-------------------------------------------
Common correct predictions:
-------------------------------------------


Taliban                                             1907
Shining Path (SL)                                   1363
Islamic State of Iraq and the Levant (ISIL)         1111
Farabundo Marti National Liberation Front (FMLN)     995
Al-Shabaab                                           803
Name: pred, dtype: int64

Brunswijk Jungle Commando    1.0
Sandinistas                  1.0
Forqan Group                 1.0
Dishmish Regiment            1.0
Alfaro Vive                  1.0
Name: pred, dtype: float64

## Abu Sayyaf Group (ASG): Frequently Predicted Wrongly

A look at some of the features of ASG events, and the groups that are often mistaken for ASG

In [13]:
print('=============================')
print('Abu Sayyaf Group (ASG)')
print('=============================')
asg=df[df.gname=='Abu Sayyaf Group (ASG)']
display(asg['country'].value_counts())

print('=============================')
print('Philippines')
print('=============================')
print(df[df.country == 'Philippines']['gname'].value_counts())

print('=============================')
print('Malaysia')
print('=============================')
print(df[df.country == 'Malaysia']['gname'].value_counts())

print('================================================')
print('Regularly Mistaken for Abu Sayyaf Group (ASG)')
print('================================================')
print(labels_svc.loc[(~labels_svc.correct) & (labels_svc.pred=='Abu Sayyaf Group (ASG)')].true.value_counts().head(10))

asg_w = w_tfidf.loc['Abu Sayyaf Group (ASG)']
farc_w = w_tfidf.loc['Revolutionary Armed Forces of Colombia (FARC)']
asgfarc = pd.DataFrame({'Abu Sayyaf Group (ASG)':asg_w, 'Revolutionary Armed Forces of Colombia (FARC)':farc_w})
asgfarc['diff'] = asgfarc['Abu Sayyaf Group (ASG)']-asgfarc['Revolutionary Armed Forces of Colombia (FARC)']

print('================================================')
print('Features More Common in ASG than FARC')
print('================================================')
display(asgfarc.sort_values('diff').tail(5).sort_values(by='diff',ascending=False))

print('=========================================================================')
print('Features More Common FARC than ASG')
print('=========================================================================')
display(asgfarc.sort_values('diff').head(5))

print('=========================================================================')
print('Features Similar in Both')
print('=========================================================================')
inboth = (asgfarc['Abu Sayyaf Group (ASG)']>0.005) & (asgfarc['Revolutionary Armed Forces of Colombia (FARC)']>0.005)
display(abs(asgfarc.loc[inboth]).sort_values('diff'))


Abu Sayyaf Group (ASG)


Philippines    451
Malaysia        19
Name: country, dtype: int64

Philippines
New People's Army (NPA)                        2412
Abu Sayyaf Group (ASG)                          451
Moro Islamic Liberation Front (MILF)            363
Bangsamoro Islamic Freedom Movement (BIFM)      320
Moro National Liberation Front (MNLF)           196
Communists                                       33
April 6th Liberation Movement                    31
Muslim Rebels                                    20
Muslims                                          18
Jemaah Islamiya (JI)                             18
Maute Group                                      15
Alex Boncayao Brigade (ABB)                      11
Muslim Separatists                               10
Muslim extremists                                 8
Muslim Militants                                  8
Islamic State of Iraq and the Levant (ISIL)       8
Muslim Guerrillas                                 6
Armed People                                      5
Gunmen                                            4


Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
country_Philippines,1469.487935,0.0,1469.487935
country_Malaysia,122.122216,0.0,122.122216
targsubtype1_Commercial Maritime,62.993374,0.0,62.993374
targtype1_Maritime,71.132873,11.855479,59.277394
ransom_yes,162.345198,119.274023,43.071175


Features More Common FARC than ASG


Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
country_Colombia,0.0,7730.303727,-7730.303727
claimed_no,574.068632,3227.927996,-2653.859364
suicide_no,612.222999,3227.397786,-2615.174787
success_yes,546.645176,2958.31507,-2411.669894
ishostkid_no,400.999229,2727.551362,-2326.552133


Features Similar in Both


Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
targsubtype1_Religion Identified,3.652845,3.652845,0.000000
nperps_0,4.963427,4.963427,0.000000
targsubtype1_Television Journalist/Staff/Facility,4.134148,4.134148,0.000000
targsubtype1_Port,5.576532,5.576532,0.000000
targsubtype1_Marketplace/Plaza/Square,16.853982,16.853982,0.000000
weapsubtype1_Pressure Trigger,5.656574,5.656574,0.000000
targsubtype1_Affiliated Institution,4.427909,4.427909,0.000000
targsubtype1_Student,4.357291,4.357291,0.000000
targsubtype1_Military Checkpoint,7.785972,7.785972,0.000000
targtype1_Religious Figures/Institutions,40.596276,37.696542,2.899734


## Countries of ASG Predicted Events

ASG often predicted for events in countries they were never active in. Suggests country should be weighted much more heavily? Try much simpler model using only year, latitude, longitude?

In [14]:
# merge the predicted labels columns to the test_events df
test_events_svc = pd.merge(df_test, 
                       pd.DataFrame({'pred':labels_svc_pred},index=df_test.index),
                       left_index=True,right_index=True)

test_events_svc['true'] = test_events_svc['gname']==test_events_svc['pred']

print('======================================================================')
print('Countries of Events Incorrectly Predicted as Abu Sayyaf Group (ASG)')
print('======================================================================')
print(test_events_svc[(test_events_svc.pred=='Abu Sayyaf Group (ASG)') & (~test_events_svc.true)]['country'].unique())
print('======================================================================')
print('Countries Where Abu Sayyaf Group (ASG) Carried Out Attacks')
print('======================================================================')
print(df[df.gname=='Abu Sayyaf Group (ASG)']['country'].unique())

Countries of Events Incorrectly Predicted as Abu Sayyaf Group (ASG)
['Philippines' 'Malaysia']
Countries Where Abu Sayyaf Group (ASG) Carried Out Attacks
['Philippines' 'Malaysia']


## Naive Bayes - train

In [15]:
# probability each feature value
p_x = X_dum_train.sum()/len(X_dum_train)
display(p_x.sort_values(ascending=False).head())

# probability each group y
grp_incs_train = y_train.value_counts()
p_y = grp_incs_train/len(X_dum_train)
display(p_y.head())

# probability each feature value, given group y
p_xgy = df_dum_train.groupby('gname').sum()
p_xgy = p_xgy.divide(grp_incs_train, axis=0)
display(p_xgy.head())

suicide_no      0.959971
success_yes     0.916372
ishostkid_no    0.903616
claimed_no      0.830598
nwound_0        0.577317
dtype: float64

Taliban                                             0.077845
Shining Path (SL)                                   0.053171
Islamic State of Iraq and the Levant (ISIL)         0.050857
Farabundo Marti National Liberation Front (FMLN)    0.039090
Irish Republican Army (IRA)                         0.031815
Name: gname, dtype: float64

Unnamed: 0,year_1970s,year_1980s,year_1990s,year_2000s,year_2010s,country_Afghanistan,country_Algeria,country_Andorra,country_Angola,country_Argentina,...,nhostkid_2to10,nhostkid_11+,ransom_no,ransom_yes,hostkidoutcome_Attempted Rescue,hostkidoutcome_Combination,hostkidoutcome_Hostage(s) escaped (not during rescue attempt),hostkidoutcome_Hostage(s) killed (not during rescue attempt),hostkidoutcome_Hostage(s) released by perpetrators,hostkidoutcome_Successful Rescue
16 January Organization for the Liberation of Tripoli,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20 December Movement (M-20),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23rd of September Communist League,0.971429,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.771429,0.228571,0.0,0.0,0.0,0.028571,0.114286,0.0
2nd of June Movement,0.833333,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0
31 January People's Front (FP-31),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Naive Bayes - Test

In [16]:
# series to store results in
y_bayes = pd.Series(index=X_dum_test.index)

for event in X_dum_test.index:
    # likelihood for each feature given each group
    probs = X_dum_test.loc[event]*p_xgy
    
    # multiply likelihoods for each feature
    probs=probs.T
    probs=probs[probs.sum(axis=1)>0]
    probs=probs.product().multiply(p_y)
    
    # normalise
    #probs = probs/probs.sum()
    
    # store group with max likelihood
    y_bayes[event]=probs.idxmax(axis=1)
    
df_bayes = pd.DataFrame({'gname':y_test,'pred':y_bayes})
df_bayes['true'] = df_bayes.gname == df_bayes.pred

print('overall accuracy',sum(df_bayes.true)/len(df_bayes))


overall accuracy 0.609433667084


In [17]:
print('precision',precision_score(df_bayes.gname, df_bayes.pred, labels=top_grps,average='micro'))
print('recall',recall_score(df_bayes.gname, df_bayes.pred, labels=top_grps,average='micro'))
print('accuracy',accuracy_score(df_bayes.gname, df_bayes.pred))
print('confusion matrix (top 20 grps):')
cmatrix = confusion_matrix(df_bayes.gname, df_bayes.pred,labels=top_grps)        
display(pd.DataFrame(cmatrix).iloc[:20,:20])

print('grp13: ',top_grps[13])
print('grp6: ',top_grps[6])
print('----------------------------')
print('grp14: ',top_grps[14])
print('grp11: ',top_grps[11])
print('----------------------------')
print('grp19: ',top_grps[19])
print('grp17: ',top_grps[17])
print('----------------------------')

precision 0.609433667084
recall 0.609433667084
accuracy 0.609433667084
confusion matrix (top 20 grps):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1689,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0
1,0,1052,0,0,0,0,8,0,0,0,0,0,0,0,5,0,0,0,0,0
2,0,0,931,0,0,0,0,2,14,0,0,0,0,0,0,0,0,2,0,0
3,0,0,0,939,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0
4,0,0,0,0,777,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,529,0,0,0,0,1,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,346,0,0,0,0,0,0,154,0,0,0,0,0,0
7,0,0,2,0,0,0,0,496,0,0,0,0,0,0,0,1,0,0,0,0
8,0,0,16,0,0,0,0,0,514,0,0,0,0,0,0,0,2,0,0,0
9,0,0,0,0,0,0,0,0,0,505,0,0,0,0,0,0,0,0,0,0


grp13:  National Liberation Army of Colombia (ELN)
grp6:  Revolutionary Armed Forces of Colombia (FARC)
----------------------------
grp14:  Maoists
grp11:  Communist Party of India - Maoist (CPI-Maoist)
----------------------------
grp19:  Houthi extremists (Ansar Allah)
grp17:  Al-Qaida in the Arabian Peninsula (AQAP)
----------------------------


## sklearn Naive Bayes

In [18]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_dum_train.fillna(0), y_train)

y_NBgaus = clf.predict(X_dum_test.fillna(0))

print('score:',accuracy_score(y_test,y_NBgaus))

score: 0.611975907384


In [19]:
# create a data frame of labels and predictions
labels_NBgaus = pd.DataFrame({'true':y_test.values, 'pred':y_NBgaus},index=y_test.index)

# was the predcition correct?
labels_NBgaus['correct'] = labels_NBgaus['pred']==labels_NBgaus['true']

# labelled correctly / total events
frac_true_NBgaus = (labels_NBgaus.loc[labels_NBgaus.correct,'true'].value_counts()/labels_NBgaus['true'].value_counts()).sort_values(ascending=False)

# predicted correctly / predicted total
frac_pred_NBgaus = (labels_NBgaus.loc[labels_NBgaus.correct,'pred'].value_counts()/labels_NBgaus['pred'].value_counts()).sort_values(ascending=False)


print('-------------------------------------------')
print('Groups commonly labelled incorrectly:')
print('-------------------------------------------')
display(labels_NBgaus.loc[~labels_NBgaus.correct,'true'].value_counts().head(5))
display(frac_true_NBgaus.tail(5).sort_values())
print('-------------------------------------------')
print('Common incorrect predictions:')
print('-------------------------------------------')
display(labels_NBgaus.loc[~labels_NBgaus.correct,'pred'].value_counts().head(5))
display(frac_pred_NBgaus.tail(5).sort_values())
print('-------------------------------------------')
print('Groups commonly labelled correctly:')
print('-------------------------------------------')
display(labels_NBgaus.loc[labels_NBgaus.correct,'true'].value_counts().head(5))
display(frac_true_NBgaus.head(5))
print('-------------------------------------------')
print('Common correct predictions:')
print('-------------------------------------------')
display(labels_NBgaus.loc[labels_NBgaus.correct,'pred'].value_counts().head(5))
display(frac_pred_NBgaus.head(5))


-------------------------------------------
Groups commonly labelled incorrectly:
-------------------------------------------


Shining Path (SL)                                372
Revolutionary Armed Forces of Colombia (FARC)    365
Islamic State of Iraq and the Levant (ISIL)      309
National Liberation Army of Colombia (ELN)       268
New People's Army (NPA)                          237
Name: true, dtype: int64

Young Communist League                      NaN
Youths                                      NaN
Zapatista National Liberation Army          NaN
Zimbabwe African Nationalist Union (ZANU)   NaN
Zimbabwe African People's Union             NaN
Name: true, dtype: float64

-------------------------------------------
Common incorrect predictions:
-------------------------------------------


Death Squad                                       514
Gunmen                                            320
Lord's Resistance Army (LRA)                      297
Communist Party of India - Maoist (CPI-Maoist)    291
National Liberation Army of Colombia (ELN)        273
Name: pred, dtype: int64

Omega-7                              NaN
People's Liberation Front of India   NaN
Prima Linea                          NaN
Resistenza                           NaN
Terrorists                           NaN
Name: pred, dtype: float64

-------------------------------------------
Groups commonly labelled correctly:
-------------------------------------------


Taliban                                             1741
Shining Path (SL)                                   1007
Farabundo Marti National Liberation Front (FMLN)     963
Islamic State of Iraq and the Levant (ISIL)          944
Al-Shabaab                                           764
Name: true, dtype: int64

Pro Hartal Activists                                0.966667
Al-Gama'at al-Islamiyya (IG)                        0.948718
African National Congress (South Africa)            0.948454
Farabundo Marti National Liberation Front (FMLN)    0.945044
Al-Shabaab                                          0.938575
Name: true, dtype: float64

-------------------------------------------
Common correct predictions:
-------------------------------------------


Taliban                                             1741
Shining Path (SL)                                   1007
Farabundo Marti National Liberation Front (FMLN)     963
Islamic State of Iraq and the Levant (ISIL)          944
Al-Shabaab                                           764
Name: pred, dtype: int64

Zebra killers                                 1.0
Tawhid and Jihad                              1.0
National Liberation Army (NLA) (Macedonia)    1.0
Popular Forces of April 25                    1.0
Popular Revolutionary Bloc (BPR)              1.0
Name: pred, dtype: float64