# Identifying Features Associated with Groups and First Model Attempts

## Load Data

In [1]:
# Imports
import pandas as pd
import numpy as np

# columns of interest
cols_to_extract = ['eventid', 'iyear',
 'country_txt','gname','attacktype1_txt',
 'success','suicide',
 'weaptype1_txt','weapsubtype1_txt',
 'targtype1_txt','targsubtype1_txt',
 'individual','nperps','claimed',
 'nkill','nwound',
 'property','propextent_txt',
 'ishostkid','nhostkid','hostkidoutcome_txt','ransom']

# Load data
df = pd.read_excel('GTD_0617dist/globalterrorismdb_0617dist.xlsx',
                   index='eventid',
                   usecols=cols_to_extract,
                   na_values = ['Unknown','-99','-9','Not Applicable'])

# setting index in read statement doesn't seem to work, so do it here
df.set_index('eventid',inplace=True) 

# replace unwanted _txt suffix from column names
df.columns = df.columns.str.replace('_txt','')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170350 entries, 197000000001 to 201701270001
Data columns (total 21 columns):
iyear             170350 non-null int64
country           170350 non-null object
success           170350 non-null int64
suicide           170350 non-null int64
attacktype1       163925 non-null object
targtype1         165477 non-null object
targsubtype1      161005 non-null object
gname             92044 non-null object
individual        170350 non-null int64
nperps            26173 non-null float64
claimed           102742 non-null float64
weaptype1         156498 non-null object
weapsubtype1      150924 non-null object
nkill             160668 non-null float64
nwound            155025 non-null float64
property          170350 non-null int64
propextent        41479 non-null object
ishostkid         169903 non-null float64
nhostkid          11358 non-null float64
ransom            74955 non-null float64
hostkidoutcome    6651 non-null object
dtypes: float64(

## Extract events associated with groups of interest

In [2]:
# remove events with no group affiliation
no_grp = df.gname.isnull() | df.individual
with_grp = ~no_grp
df = df[with_grp]

# don't need the 'individual' column any more
df.drop('individual',axis=1,inplace=True)

# only keep the top n groups with the most incidents
n_groups = 50 #for all groups: df['gname'].nunique()

top_grps = df['gname'].value_counts().head(n_groups).index
df = df[df.gname.isin(top_grps)]

print('Number of events affiliated with individuals or unknown group: ',sum(no_grp))
print('Number of events affiliated with a group: ',sum(with_grp))
print('Number of events affiliated with top {} groups: {}'.format(n_groups,len(df)))

Number of events affiliated with individuals or unknown group:  78620
Number of events affiliated with a group:  91730
Number of events affiliated with top 50 groups: 60221


## Modify some unwanted columns and values

In [3]:
# replace some values not correctly dealt with by pandas import
df.replace(-9,np.nan,inplace=True)
df.replace(-99,np.nan,inplace=True)

# rename year column
df.rename(columns={'iyear':'year'}, inplace=True)

# If no claimed info - treat as not claimed
df['claimed'].fillna(0,inplace=True)

# remove some values that don't give useful information
df['weaptype1'].replace('Other',np.nan,inplace=True)

df['weapsubtype1'].replace(['Unknown Gun Type', 'Unknown Explosive Type',
                                'Other Explosive Type', 'Unknown Weapon Type',
                                'Other Gun Type'], np.nan, inplace=True)

df['targtype1'].replace('Other',np.nan,inplace=True)

df['targsubtype1'].replace(['Other Personnel', 'Other (including online news agencies)', 'Other Facility'],
                               np.nan,inplace=True)


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60221 entries, 197001310001 to 201612310043
Data columns (total 20 columns):
year              60221 non-null int64
country           60221 non-null object
success           60221 non-null int64
suicide           60221 non-null int64
attacktype1       56890 non-null object
targtype1         59024 non-null object
targsubtype1      57737 non-null object
gname             60221 non-null object
nperps            10123 non-null float64
claimed           60221 non-null float64
weaptype1         53851 non-null object
weapsubtype1      27327 non-null object
nkill             55491 non-null float64
nwound            52350 non-null float64
property          53930 non-null float64
propextent        14450 non-null object
ishostkid         60070 non-null float64
nhostkid          4991 non-null float64
ransom            29995 non-null float64
hostkidoutcome    3166 non-null object
dtypes: float64(8), int64(3), object(9)
memory usage: 9.6+ MB


## Categorise some columns to reduce no. features

In [4]:
# Numeric columns - convert to values for 0, 1, 2-10, and more than 10 
conv_numeric = ['nkill','nwound','nperps','nhostkid']

for col in conv_numeric:
    df[col] = pd.cut(df[col],
                        [-0.1,0.9,1.9,10.9,max(df[col])+0.1],
                        labels=['0','1','2to10','11'])

# bin year in to decades
df['year'] = pd.cut(df['year'],
                        [1969.9,1979.9,1989.9,1999.9,2009.9,2019.9],
                        labels=['1970s','1980s','1990s','2000s','2010s'])  

display(df.head())

# warning message below r.e. empty bins

  if (np.diff(bins) < 0).any():


Unnamed: 0_level_0,year,country,success,suicide,attacktype1,targtype1,targsubtype1,gname,nperps,claimed,weaptype1,weapsubtype1,nkill,nwound,property,propextent,ishostkid,nhostkid,ransom,hostkidoutcome
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
197001310001,1970s,Philippines,0,0,,Military,"Military Personnel (soldiers, troops, officers...",New People's Army (NPA),,0.0,,,0,1.0,0.0,,0.0,,0.0,
197004020001,1970s,Philippines,1,0,,Military,"Military Personnel (soldiers, troops, officers...",New People's Army (NPA),,0.0,,,2to10,0.0,0.0,,0.0,,0.0,
197004250001,1970s,Turkey,1,0,Bombing/Explosion,Airports & Aircraft,Airline Officer/Personnel,Palestinians,,0.0,Explosives/Bombs/Dynamite,,0,0.0,1.0,,0.0,,0.0,
197005040001,1970s,Paraguay,1,0,Assassination,Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",Palestinians,2to10,0.0,Firearms,Automatic Weapon,1,1.0,0.0,,0.0,,0.0,
197006260001,1970s,United Kingdom,0,0,Armed Assault,Terrorists/Non-State Militia,Terrorist,Irish Republican Army (IRA),,0.0,Incendiary,,2to10,,1.0,,0.0,,0.0,


## Create training and testing datasets

In [5]:
from sklearn.model_selection import train_test_split

# labels
y = df['gname']
display(y.head())

# features
X = df.drop('gname',axis=1)
display(X.head())

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=42)
print('Train: X shape =',X_train.shape,', Y shape=',y_train.shape)
print('Test: X shape =',X_test.shape,', Y shape=',y_test.shape)

# make some combined data frames with both labels and features. Useful later.
df_train = X_train.copy()
df_train['gname'] =  y_train

df_test = X_test.copy()
df_test['gname'] = y_test

eventid
197001310001        New People's Army (NPA)
197004020001        New People's Army (NPA)
197004250001                   Palestinians
197005040001                   Palestinians
197006260001    Irish Republican Army (IRA)
Name: gname, dtype: object

Unnamed: 0_level_0,year,country,success,suicide,attacktype1,targtype1,targsubtype1,nperps,claimed,weaptype1,weapsubtype1,nkill,nwound,property,propextent,ishostkid,nhostkid,ransom,hostkidoutcome
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
197001310001,1970s,Philippines,0,0,,Military,"Military Personnel (soldiers, troops, officers...",,0.0,,,0,1.0,0.0,,0.0,,0.0,
197004020001,1970s,Philippines,1,0,,Military,"Military Personnel (soldiers, troops, officers...",,0.0,,,2to10,0.0,0.0,,0.0,,0.0,
197004250001,1970s,Turkey,1,0,Bombing/Explosion,Airports & Aircraft,Airline Officer/Personnel,,0.0,Explosives/Bombs/Dynamite,,0,0.0,1.0,,0.0,,0.0,
197005040001,1970s,Paraguay,1,0,Assassination,Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",2to10,0.0,Firearms,Automatic Weapon,1,1.0,0.0,,0.0,,0.0,
197006260001,1970s,United Kingdom,0,0,Armed Assault,Terrorists/Non-State Militia,Terrorist,,0.0,Incendiary,,2to10,,1.0,,0.0,,0.0,


Train: X shape = (42154, 19) , Y shape= (42154,)
Test: X shape = (18067, 19) , Y shape= (18067,)


## Create Dummy variables for Each Category Value

Gives a bool column for each unique value.

In [6]:
# labels
# do on full dataset first to make sure same columns in test and train
y_dum = pd.get_dummies(y)
y_dum_train = y_dum.loc[y_train.index]
y_dum_test = y_dum.loc[y_test.index]

display(y_dum_train.head())

# features
# do on full dataset first to make sure same columns in test and train
X_dum = pd.get_dummies(X)
X_dum_train = X_dum.loc[X_train.index]
X_dum_test = X_dum.loc[X_test.index]

# NaN values in test data can destroy predictions so remove them
X_dum_test.fillna(0,inplace=True) 

display(X_dum_train.head())

Unnamed: 0_level_0,Abu Sayyaf Group (ASG),African National Congress (South Africa),Al-Qaida in Iraq,Al-Qaida in the Arabian Peninsula (AQAP),Al-Shabaab,Algerian Islamic Extremists,Bangsamoro Islamic Freedom Movement (BIFM),Basque Fatherland and Freedom (ETA),Boko Haram,Chechen Rebels,...,Revolutionary Armed Forces of Colombia (FARC),Separatists,Shining Path (SL),Sikh Extremists,Sinai Province of the Islamic State,Taliban,Tehrik-i-Taliban Pakistan (TTP),Tripoli Province of the Islamic State,Tupac Amaru Revolutionary Movement (MRTA),United Liberation Front of Assam (ULFA)
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201606300009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
197601050002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199201160007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198508160030,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199608220002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,success,suicide,claimed,property,ishostkid,ransom,year_1970s,year_1980s,year_1990s,year_2000s,...,nhostkid_0,nhostkid_1,nhostkid_2to10,nhostkid_11,hostkidoutcome_Attempted Rescue,hostkidoutcome_Combination,hostkidoutcome_Hostage(s) escaped (not during rescue attempt),hostkidoutcome_Hostage(s) killed (not during rescue attempt),hostkidoutcome_Hostage(s) released by perpetrators,hostkidoutcome_Successful Rescue
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201606300009,1,0,1.0,1.0,0.0,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197601050002,1,0,0.0,0.0,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199201160007,1,0,0.0,1.0,0.0,0.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
198508160030,1,0,0.0,1.0,0.0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
199608220002,1,0,0.0,1.0,0.0,0.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Calculate Characterising Values for each Group
i.e. the feature values that are most characteristic of an attack by each group

In [7]:
# no. incidents associated with each group
grp_incs = y.value_counts()

# loop over all columns excluding gname
grp_weights = dict()

# add group name column to the data frame of dummies
df_dum_train = X_dum_train.copy()
df_dum_train['gname'] =  y_train

for col in df_dum_train.drop('gname',axis=1).columns:
    # for each group, how many times this unique value appears
    grp_cnts = df_dum_train.groupby('gname')[col].sum()
    
    # count how many groups have an incident including this unique value
    # convert this for tf-idf weight using log(n_groups/count)
    if (grp_cnts>0).sum()==0:
        print('Warn: No instances of',col)
        
    w_col = np.log(n_groups/((grp_cnts>0).sum()))
    
    # multiply w_col by no. occurences each column value to get weight for each group
    grp_weights[col] = (grp_cnts*w_col)
    
# merge unique values for each column in to one large data frame
grp_aw = pd.DataFrame(index=top_grps)
for key, w_col in grp_weights.items():
    grp_aw[key] = w_col
    

# get rid of NaN weights
grp_aw.fillna(0,inplace=True)

grp_aw.info()

Warn: No instances of country_Bangladesh
Warn: No instances of country_Botswana
Warn: No instances of country_Bulgaria
Warn: No instances of country_Finland




Warn: No instances of country_Guadeloupe
Warn: No instances of country_South Yemen
<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, Taliban to Lord's Resistance Army (LRA)
Columns: 306 entries, success to hostkidoutcome_Successful Rescue
dtypes: float64(306)
memory usage: 119.9+ KB


## Print Top n Features for Top m Groups

In [8]:
n_print_grp=10
n_print_feat=5

for grp in grp_incs.head(n_print_grp).index:    
    print(grp_aw.loc[grp].sort_values(ascending=False).head(n_print_feat))
    print('----------------------------------------------')

country_Afghanistan    12719.429850
year_2010s              1345.377889
targsubtype1_NATO        727.409850
claimed                  608.426487
suicide                  333.222005
Name: Taliban, dtype: float64
----------------------------------------------
country_Peru                 6258.137222
year_1980s                   1082.273053
year_1990s                    293.682636
targsubtype1_Electricity      151.565355
weapsubtype1_Dynamite/TNT      85.140424
Name: Shining Path (SL), dtype: float64
----------------------------------------------
country_Iraq     6286.538596
year_2010s       1053.974459
country_Syria     792.089272
suicide           500.566977
claimed           324.384352
Name: Islamic State of Iraq and the Levant (ISIL), dtype: float64
----------------------------------------------
country_El Salvador         7519.293927
year_1980s                   894.883019
targsubtype1_Electricity     253.380250
year_1990s                   156.696440
targtype1_Utilities          130.

## Model Using tf-idf Type Weights Derived Above
NB: weights above calculated on all data. Should be only on training data.

In [9]:
# subtract neg_scale*weight for features not present in event
# but present in group
neg_scale = 0.25 

# matrix multiply test events by group weights for each feature
tfidf_events = pd.DataFrame(np.inner(X_dum_test,grp_aw),index=X_dum_test.index,columns=grp_aw.index)

# subtract contribution of negative cases
tfidf_events = tfidf_events - neg_scale*pd.DataFrame(np.inner(X_dum_test.replace({0:1,1:0}),grp_aw),index=X_dum_test.index,columns=grp_aw.index)

tfidf_pred = pd.DataFrame({'gname':y_test,'pred':tfidf_events.idxmax(axis=1)})
tfidf_pred['true'] = tfidf_pred.gname == tfidf_pred.pred

# stats on accuracy of model overall and per group
print('overall accuracy',sum(tfidf_pred.true)/len(tfidf_pred))

# calculate metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

print('precision',precision_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('recall',recall_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('accuracy',accuracy_score(tfidf_pred.gname, tfidf_pred.pred))
print('confusion matrix (top 20 grps):')
cmatrix = confusion_matrix(tfidf_pred.gname, tfidf_pred.pred,labels=top_grps)        
display(pd.DataFrame(cmatrix).iloc[:20,:20])

print('grp13: ',top_grps[13])
print('grp6: ',top_grps[6])
print('----------------------------')
print('grp14: ',top_grps[14])
print('grp11: ',top_grps[11])
print('----------------------------')
print('grp19: ',top_grps[19])
print('grp17: ',top_grps[17])
print('----------------------------')

overall accuracy 0.800077489345
precision 0.800077489345
recall 0.800077489345
accuracy 0.800077489345
confusion matrix (top 20 grps):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,2002,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,0,0
1,0,1354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1223,0,1,0,0,2,25,0,0,0,0,0,0,0,1,0,0,4
3,0,0,0,994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,728,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,780,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,730,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,735,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,3,0,0,1,0,0,580,0,0,0,0,0,0,0,0,0,0,0
9,0,0,2,0,0,0,0,0,0,606,0,0,0,0,0,0,0,0,0,0


grp13:  National Liberation Army of Colombia (ELN)
grp6:  Revolutionary Armed Forces of Colombia (FARC)
----------------------------
grp14:  Maoists
grp11:  Communist Party of India - Maoist (CPI-Maoist)
----------------------------
grp19:  Houthi extremists (Ansar Allah)
grp17:  Al-Qaida in the Arabian Peninsula (AQAP)
----------------------------


In [10]:
'''
print('==========================')
print('event 17041')
print('==========================')
print('group predictions:')
display(tfidf_events.loc[17041].sort_values(ascending=False).head(10))
print('real group:',y_test.loc[17041].gname)

tmp=X_test.loc[17041]
tmp.index=grp_aw.loc['Revolutionary Armed Forces of Colombia (FARC)'].index

ev17041 = pd.DataFrame({'event':tmp,
              'FARC':grp_aw.loc['Revolutionary Armed Forces of Colombia (FARC)'],
              'ELN':grp_aw.loc['National Liberation Army of Colombia (ELN)']})

ev17041['diff'] = ev17041.ELN - ev17041.FARC
ev17041.loc[ev17041.event==0, 'diff'] = neg_scale*ev17041.loc[ev17041.event==0, 'diff']

display(ev17041[(ev17041.FARC>0) | (ev17041.ELN>0) | (ev17041.event>0)].sort_values('diff',ascending=False).head(10))
display(ev17041[(ev17041.FARC>0) | (ev17041.ELN>0) | (ev17041.event>0)].sort_values('diff',ascending=False).tail(10))
'''

print('==========================')
print('Frequent groups with incorrect predictions')
print('==========================')
display(tfidf_pred[(~tfidf_pred.true)].gname.value_counts().head(10))

print('==========================')
print('Incorrect predictions')
print('==========================')
display(tfidf_pred[(~tfidf_pred.true)].pred.value_counts().head(10))


Frequent groups with incorrect predictions


National Liberation Army of Colombia (ELN)    434
Maoists                                       333
Houthi extremists (Ansar Allah)               247
Sikh Extremists                               219
Tupac Amaru Revolutionary Movement (MRTA)     179
Al-Qaida in Iraq                              177
M-19 (Movement of April 19)                   148
Muslim extremists                             142
Abu Sayyaf Group (ASG)                        141
People's Liberation Front (JVP)               119
Name: gname, dtype: int64

Incorrect predictions


Revolutionary Armed Forces of Colombia (FARC)     708
Communist Party of India - Maoist (CPI-Maoist)    697
New People's Army (NPA)                           338
Al-Qaida in the Arabian Peninsula (AQAP)          256
Islamic State of Iraq and the Levant (ISIL)       191
Shining Path (SL)                                 188
Bangsamoro Islamic Freedom Movement (BIFM)        184
Liberation Tigers of Tamil Eelam (LTTE)           134
Irish Republican Army (IRA)                       118
Boko Haram                                        114
Name: pred, dtype: int64

## Fit a Classifier to the Data

In [11]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score, accuracy_score

model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_dum_train, y_dum_train)

y_svc_pred = model.predict(X_dum_test)

print('Accuracy score:',accuracy_score(y_dum_test,y_svc_pred))
print('Precision score:',average_precision_score(y_dum_test,y_svc_pred))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Investigate the Results of the Classifier

In [None]:
# extract group for each event in test data
#y_test = y_test.stack()
#y_test = y_test[y_test>0].index.get_level_values(1)

# extract prediction for each event in test data
labels_svc_pred = y_dum_test.columns[y_svc_pred.argmax(axis=1)]

# create a data frame of labels and predictions
labels_svc = pd.DataFrame({'true':y_test.values, 'pred':labels_svc_pred.values})

# was the predcition correct?
labels_svc['correct'] = labels_svc['pred']==labels_svc['true']

# labelled correctly / total events
frac_true_svc = (labels_svc.loc[labels_svc.correct,'true'].value_counts()/labels_svc['true'].value_counts()).sort_values(ascending=False)

# predicted correctly / predicted total
frac_pred_svc = (labels_svc.loc[labels_svc.correct,'pred'].value_counts()/labels_svc['pred'].value_counts()).sort_values(ascending=False)


print('-------------------------------------------')
print('Groups commonly labelled incorrectly:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Common incorrect predictions:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Groups commonly labelled correctly:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.head(5))
print('-------------------------------------------')
print('Common correct predictions:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.head(5))

## Abu Sayyaf Group (ASG): Frequently Predicted Wrongly

A look at some of the features of ASG events, and the groups that are often mistaken for ASG

In [None]:
print('=============================')
print('Abu Sayyaf Group (ASG)')
print('=============================')
asg=df[df.gname=='Abu Sayyaf Group (ASG)']
display(asg['country_txt'].value_counts())

print('=============================')
print('Philippines')
print('=============================')
print(df[df.country_txt == 'Philippines']['gname'].value_counts())

print('=============================')
print('Malaysia')
print('=============================')
print(df[df.country_txt == 'Malaysia']['gname'].value_counts())

print('================================================')
print('Regularly Mistaken for Abu Sayyaf Group (ASG)')
print('================================================')
print(labels_svc.loc[(~labels_svc.correct) & (labels_svc.pred=='Abu Sayyaf Group (ASG)')].true.value_counts().head(10))

asg_w = grp_aw.loc['Abu Sayyaf Group (ASG)']
farc_w = grp_aw.loc['Revolutionary Armed Forces of Colombia (FARC)']
asgfarc = pd.DataFrame({'Abu Sayyaf Group (ASG)':asg_w, 'Revolutionary Armed Forces of Colombia (FARC)':farc_w})
asgfarc['diff'] = asgfarc['Abu Sayyaf Group (ASG)']-asgfarc['Revolutionary Armed Forces of Colombia (FARC)']

print('================================================')
print('Features More Common in ASG than FARC')
print('================================================')
display(asgfarc.sort_values('diff').tail(5).sort_values(by='diff',ascending=False))

print('=========================================================================')
print('Features More Common FARC than ASG')
print('=========================================================================')
display(asgfarc.sort_values('diff').head(5))

print('=========================================================================')
print('Features Similar in Both')
print('=========================================================================')
inboth = (asgfarc['Abu Sayyaf Group (ASG)']>0.005) & (asgfarc['Revolutionary Armed Forces of Colombia (FARC)']>0.005)
display(abs(asgfarc.loc[inboth]).sort_values('diff'))


## Countries of ASG Predicted Events

ASG often predicted for events in countries they were never active in. Suggests country should be weighted much more heavily? Try much simpler model using only year, latitude, longitude?

In [None]:
# merge the predicted labels columns to the test_events df
test_events_svc = pd.merge(df_test, 
                       pd.DataFrame({'pred':labels_svc_pred},index=df_test.index),
                       left_index=True,right_index=True)

test_events_svc['true'] = test_events_svc['gname']==test_events_svc['pred']

print('======================================================================')
print('Countries of Events Incorrectly Predicted as Abu Sayyaf Group (ASG)')
print('======================================================================')
print(test_events_svc[(test_events_svc.pred=='Abu Sayyaf Group (ASG)') & (~test_events_svc.true)]['country_txt'].unique())
print('======================================================================')
print('Countries Where Abu Sayyaf Group (ASG) Carried Out Attacks')
print('======================================================================')
print(df[df.gname=='Abu Sayyaf Group (ASG)']['country_txt'].unique())