# Identifying Features Associated with Groups and First Model Attempts

## Load Data

In [11]:
# Imports
import pandas as pd
import numpy as np

# columns of interest
cols_to_extract = ['eventid', 'iyear',
 'country_txt','gname','attacktype1_txt',
 'success','suicide',
 'weaptype1_txt','weapsubtype1_txt',
 'targtype1_txt','targsubtype1_txt',
 'individual','nperps','claimed',
 'nkill','nwound',
 'property','propextent_txt',
 'ishostkid','nhostkid','hostkidoutcome_txt','ransom']

# Load data
df = pd.read_excel('GTD_0617dist/globalterrorismdb_0617dist.xlsx',
                   index='eventid',
                   usecols=cols_to_extract,
                   na_values = ['Unknown','-99','-9','Not Applicable'])

df.set_index('eventid',inplace=True)

df.info()

AttributeError: 'DataFrame' object has no attribute 'setindex'

## Extract events associated with groups of interest

In [15]:
# remove events with no group affiliation
no_grp = df.gname.isnull() | df.individual
with_grp = ~no_grp
df = df[with_grp]

# don't need the 'individual' column any more
df.drop('individual',axis=1,inplace=True)

# only keep the top n groups with the most incidents
n_groups = 50 #for all groups: df['gname'].nunique()

top_grps = df['gname'].value_counts().head(n_groups).index
df = df[df.gname.isin(top_grps)]

print('Number of events affiliated with individuals or unknown group: ',sum(no_grp))
print('Number of events affiliated with a group: ',sum(with_grp))
print('Number of events affiliated with top {} groups: {}'.format(n_groups,len(df)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Number of events affiliated with individuals or unknown group:  78620
Number of events affiliated with a group:  91730
Number of events affiliated with top 50 groups: 60221


## Modify some unwanted columns and values

In [16]:
# replace some values not correctly dealt with by pandas import
df.replace(-9,np.nan,inplace=True)
df.replace(-99,np.nan,inplace=True)

# rename year column
df.rename(columns={'iyear':'year'}, inplace=True)

# If no claimed info - treat as not claimed
df['claimed'].fillna(0,inplace=True)

# remove some values that don't give useful information
df['weaptype1_txt'].replace('Other',np.nan,inplace=True)

df['weapsubtype1_txt'].replace(['Unknown Gun Type', 'Unknown Explosive Type',
                                'Other Explosive Type', 'Unknown Weapon Type',
                                'Other Gun Type'], np.nan, inplace=True)

df['targtype1_txt'].replace('Other',np.nan,inplace=True)

df['targsubtype1_txt'].replace(['Other Personnel', 'Other (including online news agencies)', 'Other Facility'],
                               np.nan,inplace=True)


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60221 entries, 197001310001 to 201612310043
Data columns (total 20 columns):
year                  60221 non-null int64
country_txt           60221 non-null object
success               60221 non-null int64
suicide               60221 non-null int64
attacktype1_txt       56890 non-null object
targtype1_txt         59024 non-null object
targsubtype1_txt      57737 non-null object
gname                 60221 non-null object
nperps                10123 non-null float64
claimed               60221 non-null float64
weaptype1_txt         53851 non-null object
weapsubtype1_txt      27327 non-null object
nkill                 55491 non-null float64
nwound                52350 non-null float64
property              53930 non-null float64
propextent_txt        14450 non-null object
ishostkid             60070 non-null float64
nhostkid              4991 non-null float64
ransom                29995 non-null float64
hostkidoutcome_txt    3166 non-nu

## Categorise some columns to reduce no. features

In [17]:
# Numeric columns - convert to values for 0, 1, 2-10, and more than 10 
conv_numeric = ['nkill','nwound','nperps','nhostkid']

for col in conv_numeric:
    df[col] = pd.cut(df[col],
                        [-0.1,0.9,1.9,10.9,max(df[col])+0.1],
                        labels=['0_'+col,'1_'+col,'2to10_'+col,'11+_'+col])

# boolean columns - add column suffix
conv_bool = ['success','suicide','claimed','ishostkid','ransom','property']

for col in conv_bool:
    df[col].replace({0:('0_'+col),1:('1_'+col)},inplace=True)

# bin year in to decades
df['year'] = pd.cut(df['year'],
                        [1969.9,1979.9,1989.9,1999.9,2009.9,2019.9],
                        labels=['1970s_yr','1980s_yr','1990s_yr','2000s_yr','2010s_yr'])  

# add suffixes to some column values to help understanding later
add_suffix = {'targtype1_txt':'_target',
              'attacktype1_txt':'_attack', 
              'targsubtype1_txt':'_targetsub',
              'weaptype1_txt':'_weapon',
              'weapsubtype1_txt':'_weaponsub',
              'propextent_txt':'_property'}

for col,suffix in add_suffix.items():
    rows = df[col].notnull()
    df.loc[rows,col] = df.loc[rows,col] + suffix

display(df.head())

# warning message below r.e. empty bins

  if (np.diff(bins) < 0).any():


Unnamed: 0_level_0,year,country_txt,success,suicide,attacktype1_txt,targtype1_txt,targsubtype1_txt,gname,nperps,claimed,weaptype1_txt,weapsubtype1_txt,nkill,nwound,property,propextent_txt,ishostkid,nhostkid,ransom,hostkidoutcome_txt
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
197001310001,1970s_yr,Philippines,0_success,0_suicide,,Military_target,"Military Personnel (soldiers, troops, officers...",New People's Army (NPA),,0_claimed,,,0_nkill,1_nwound,0_property,,0_ishostkid,,0_ransom,
197004020001,1970s_yr,Philippines,1_success,0_suicide,,Military_target,"Military Personnel (soldiers, troops, officers...",New People's Army (NPA),,0_claimed,,,2to10_nkill,0_nwound,0_property,,0_ishostkid,,0_ransom,
197004250001,1970s_yr,Turkey,1_success,0_suicide,Bombing/Explosion_attack,Airports & Aircraft_target,Airline Officer/Personnel_targetsub,Palestinians,,0_claimed,Explosives/Bombs/Dynamite_weapon,,0_nkill,0_nwound,1_property,,0_ishostkid,,0_ransom,
197005040001,1970s_yr,Paraguay,1_success,0_suicide,Assassination_attack,Government (Diplomatic)_target,"Diplomatic Personnel (outside of embassy, cons...",Palestinians,2to10_nperps,0_claimed,Firearms_weapon,Automatic Weapon_weaponsub,1_nkill,1_nwound,0_property,,0_ishostkid,,0_ransom,
197006260001,1970s_yr,United Kingdom,0_success,0_suicide,Armed Assault_attack,Terrorists/Non-State Militia_target,Terrorist_targetsub,Irish Republican Army (IRA),,0_claimed,Incendiary_weapon,,2to10_nkill,,1_property,,0_ishostkid,,0_ransom,


## Create training and testing datasets

In [18]:
from sklearn.model_selection import train_test_split

# labels
y = df['gname']
display(y.head())

# features
X = df.drop('gname',axis=1)
display(X.head())

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=42)
print('Train: X shape =',X_train.shape,', Y shape=',y_train.shape)
print('Test: X shape =',X_test.shape,', Y shape=',y_test.shape)

# make some combined data frames with both labels and features. Useful later.
df_train = X_train.copy()
df_train['gname'] =  y_train

df_test = X_test.copy()
df_test['gname'] = y_test

eventid
197001310001        New People's Army (NPA)
197004020001        New People's Army (NPA)
197004250001                   Palestinians
197005040001                   Palestinians
197006260001    Irish Republican Army (IRA)
Name: gname, dtype: object

Unnamed: 0_level_0,year,country_txt,success,suicide,attacktype1_txt,targtype1_txt,targsubtype1_txt,nperps,claimed,weaptype1_txt,weapsubtype1_txt,nkill,nwound,property,propextent_txt,ishostkid,nhostkid,ransom,hostkidoutcome_txt
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
197001310001,1970s_yr,Philippines,0_success,0_suicide,,Military_target,"Military Personnel (soldiers, troops, officers...",,0_claimed,,,0_nkill,1_nwound,0_property,,0_ishostkid,,0_ransom,
197004020001,1970s_yr,Philippines,1_success,0_suicide,,Military_target,"Military Personnel (soldiers, troops, officers...",,0_claimed,,,2to10_nkill,0_nwound,0_property,,0_ishostkid,,0_ransom,
197004250001,1970s_yr,Turkey,1_success,0_suicide,Bombing/Explosion_attack,Airports & Aircraft_target,Airline Officer/Personnel_targetsub,,0_claimed,Explosives/Bombs/Dynamite_weapon,,0_nkill,0_nwound,1_property,,0_ishostkid,,0_ransom,
197005040001,1970s_yr,Paraguay,1_success,0_suicide,Assassination_attack,Government (Diplomatic)_target,"Diplomatic Personnel (outside of embassy, cons...",2to10_nperps,0_claimed,Firearms_weapon,Automatic Weapon_weaponsub,1_nkill,1_nwound,0_property,,0_ishostkid,,0_ransom,
197006260001,1970s_yr,United Kingdom,0_success,0_suicide,Armed Assault_attack,Terrorists/Non-State Militia_target,Terrorist_targetsub,,0_claimed,Incendiary_weapon,,2to10_nkill,,1_property,,0_ishostkid,,0_ransom,


Train: X shape = (42154, 19) , Y shape= (42154,)
Test: X shape = (18067, 19) , Y shape= (18067,)


## Calculate Characterising Values for each Group
i.e. the feature values that are most characteristic of an attack by each group

In [19]:
# no. incidents associated with each group
grp_incs = df['gname'].value_counts()

# loop over all columns excluding gname
grp_weights = dict()

for col in df.drop('gname',axis=1).columns:
    # for each group, how many times each unique value appears in this column
    grp_cnts = df.groupby('gname')[col].value_counts().unstack(col,fill_value=0)
    
    # for each unique value in this column, count how many groups have an incident including it
    # convert this for tf-idf weight using log(n_groups/count)
    w_col = np.log(n_groups/(grp_cnts>0).sum())
    
    # multiply w_col by no. occurences each column value to get weight for each group
    grp_weights[col] = (grp_cnts*w_col)#.div(grp_incs,axis=0) ### removed: normalise by no. incidents for that group, so groups can be compared more easily
    
# merge unique values for each column in to one large data frame
grp_aw = pd.DataFrame(index=top_grps)
for key, w_col in grp_weights.items():
    grp_aw = pd.merge(grp_aw, w_col, left_index=True, right_index=True,how='outer')
    
grp_aw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, Abu Sayyaf Group (ASG) to United Liberation Front of Assam (ULFA)
Columns: 312 entries, 1970s_yr to Successful Rescue
dtypes: float64(312)
memory usage: 122.3+ KB


## Print Top n Features for Top m Groups

In [20]:
n_print_grp=10
n_print_feat=5

for grp in grp_incs.head(n_print_grp).index:    
    print(grp_aw.loc[grp].sort_values(ascending=False).head(n_print_feat))
    print('----------------------------------------------')

Afghanistan       18351.878105
2010s_yr           1787.062124
NATO_targetsub      894.751212
1_claimed           878.746780
1_suicide           442.921048
Name: Taliban, dtype: float64
----------------------------------------------
Peru                      8314.422101
1980s_yr                  1551.704210
1990s_yr                   411.615596
Electricity_targetsub      137.896054
Dynamite/TNT_weaponsub     116.448726
Name: Shining Path (SL), dtype: float64
----------------------------------------------
Iraq         9181.023622
2010s_yr     1408.296935
Syria         998.644126
1_suicide     691.067739
1_claimed     472.305811
Name: Islamic State of Iraq and the Levant (ISIL), dtype: float64
----------------------------------------------
El Salvador              10718.856497
1980s_yr                  1275.399517
Electricity_targetsub      228.335989
1990s_yr                   224.039774
Utilities_target           117.990202
Name: Farabundo Marti National Liberation Front (FMLN), dtype: 

## Create Dummy variables for Each Category Value

In [26]:
# labels
# do on full dataset first to make sure same columns in test and train
y_dum = pd.get_dummies(y)
y_dum_train = y_dum.loc[y_train.index]
y_dum_test = y_dum.loc[y_test.index]

display(y_dum_train.head())

# features
# do on full dataset first to make sure same columns in test and train
X_dum = pd.get_dummies(X)
X_dum_train = X_dum.loc[X_train.index]
X_dum_test = X_dum.loc[X_test.index]

display(X_dum_train.head())

Unnamed: 0_level_0,Abu Sayyaf Group (ASG),African National Congress (South Africa),Al-Qaida in Iraq,Al-Qaida in the Arabian Peninsula (AQAP),Al-Shabaab,Algerian Islamic Extremists,Bangsamoro Islamic Freedom Movement (BIFM),Basque Fatherland and Freedom (ETA),Boko Haram,Chechen Rebels,...,Revolutionary Armed Forces of Colombia (FARC),Separatists,Shining Path (SL),Sikh Extremists,Sinai Province of the Islamic State,Taliban,Tehrik-i-Taliban Pakistan (TTP),Tripoli Province of the Islamic State,Tupac Amaru Revolutionary Movement (MRTA),United Liberation Front of Assam (ULFA)
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201606300009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
197601050002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199201160007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198508160030,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199608220002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,year_1970s_yr,year_1980s_yr,year_1990s_yr,year_2000s_yr,year_2010s_yr,country_txt_Afghanistan,country_txt_Algeria,country_txt_Angola,country_txt_Argentina,country_txt_Austria,...,nhostkid_2to10_nhostkid,nhostkid_11+_nhostkid,ransom_0_ransom,ransom_1_ransom,hostkidoutcome_txt_Attempted Rescue,hostkidoutcome_txt_Combination,hostkidoutcome_txt_Hostage(s) escaped (not during rescue attempt),hostkidoutcome_txt_Hostage(s) killed (not during rescue attempt),hostkidoutcome_txt_Hostage(s) released by perpetrators,hostkidoutcome_txt_Successful Rescue
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201606300009,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197601050002,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199201160007,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
198508160030,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199608220002,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## Fit a Classifier to the Data

In [27]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score, accuracy_score

model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_dum_train, y_dum_train)

y_svc_pred = model.predict(X_dum_test)

print('Accuracy score:',accuracy_score(y_dum_test,y_svc_pred))
print('Precision score:',average_precision_score(y_dum_test,y_svc_pred))

Accuracy score: 0.857751701998
Precision score: 0.681730034938


## Investigate the Results of the Classifier

In [29]:
# extract group for each event in test data
#y_test = y_test.stack()
#y_test = y_test[y_test>0].index.get_level_values(1)

# extract prediction for each event in test data
labels_svc_pred = y_dum_test.columns[y_svc_pred.argmax(axis=1)]

# create a data frame of labels and predictions
labels_svc = pd.DataFrame({'true':y_test.values, 'pred':labels_svc_pred.values})

# was the predcition correct?
labels_svc['correct'] = labels_svc['pred']==labels_svc['true']

# labelled correctly / total events
frac_true_svc = (labels_svc.loc[labels_svc.correct,'true'].value_counts()/labels_svc['true'].value_counts()).sort_values(ascending=False)

# predicted correctly / predicted total
frac_pred_svc = (labels_svc.loc[labels_svc.correct,'pred'].value_counts()/labels_svc['pred'].value_counts()).sort_values(ascending=False)


print('-------------------------------------------')
print('Groups commonly labelled incorrectly:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Common incorrect predictions:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Groups commonly labelled correctly:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.head(5))
print('-------------------------------------------')
print('Common correct predictions:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.head(5))

-------------------------------------------
Groups commonly labelled incorrectly:
-------------------------------------------


National Liberation Army of Colombia (ELN)       281
Revolutionary Armed Forces of Colombia (FARC)    228
Tupac Amaru Revolutionary Movement (MRTA)        157
Maoists                                          135
Al-Qaida in Iraq                                 123
Name: true, dtype: int64

Tupac Amaru Revolutionary Movement (MRTA)     0.122905
Muslim extremists                             0.253247
Al-Qaida in Iraq                              0.305085
National Liberation Army of Colombia (ELN)    0.352535
United Liberation Front of Assam (ULFA)       0.363636
Name: true, dtype: float64

-------------------------------------------
Common incorrect predictions:
-------------------------------------------


Abu Sayyaf Group (ASG)                            831
Shining Path (SL)                                 165
Islamic State of Iraq and the Levant (ISIL)       153
Revolutionary Armed Forces of Colombia (FARC)     142
Communist Party of India - Maoist (CPI-Maoist)    138
Name: pred, dtype: int64

Abu Sayyaf Group (ASG)                              0.096739
Fulani extremists                                   0.582090
Movement of the Revolutionary Left (MIR) (Chile)    0.608108
Tupac Amaru Revolutionary Movement (MRTA)           0.611111
M-19 (Movement of April 19)                         0.638298
Name: pred, dtype: float64

-------------------------------------------
Groups commonly labelled correctly:
-------------------------------------------


Taliban                                             2002
Shining Path (SL)                                   1338
Islamic State of Iraq and the Levant (ISIL)         1278
Farabundo Marti National Liberation Front (FMLN)     992
Al-Shabaab                                           795
Name: true, dtype: int64

Donetsk People's Republic                                      1.000000
Al-Shabaab                                                     1.000000
Nicaraguan Democratic Force (FDN)                              0.996350
Taliban                                                        0.992071
National Union for the Total Independence of Angola (UNITA)    0.991667
Name: true, dtype: float64

-------------------------------------------
Common correct predictions:
-------------------------------------------


Taliban                                             2002
Shining Path (SL)                                   1338
Islamic State of Iraq and the Levant (ISIL)         1278
Farabundo Marti National Liberation Front (FMLN)     992
Al-Shabaab                                           795
Name: pred, dtype: int64

Algerian Islamic Extremists                 1.000000
Donetsk People's Republic                   1.000000
African National Congress (South Africa)    1.000000
Taliban                                     0.998504
Nicaraguan Democratic Force (FDN)           0.996350
Name: pred, dtype: float64

## Abu Sayyaf Group (ASG): Frequently Predicted Wrongly

A look at some of the features of ASG events, and the groups that are often mistaken for ASG

In [30]:
print('=============================')
print('Abu Sayyaf Group (ASG)')
print('=============================')
asg=df[df.gname=='Abu Sayyaf Group (ASG)']
display(asg['country_txt'].value_counts())

print('=============================')
print('Philippines')
print('=============================')
print(df[df.country_txt == 'Philippines']['gname'].value_counts())

print('=============================')
print('Malaysia')
print('=============================')
print(df[df.country_txt == 'Malaysia']['gname'].value_counts())

print('================================================')
print('Regularly Mistaken for Abu Sayyaf Group (ASG)')
print('================================================')
print(labels_svc.loc[(~labels_svc.correct) & (labels_svc.pred=='Abu Sayyaf Group (ASG)')].true.value_counts().head(10))

asg_w = grp_aw.loc['Abu Sayyaf Group (ASG)']
farc_w = grp_aw.loc['Revolutionary Armed Forces of Colombia (FARC)']
asgfarc = pd.DataFrame({'Abu Sayyaf Group (ASG)':asg_w, 'Revolutionary Armed Forces of Colombia (FARC)':farc_w})
asgfarc['diff'] = asgfarc['Abu Sayyaf Group (ASG)']-asgfarc['Revolutionary Armed Forces of Colombia (FARC)']

print('================================================')
print('Features More Common in ASG than FARC')
print('================================================')
display(asgfarc.sort_values('diff').tail(5).sort_values(by='diff',ascending=False))

print('=========================================================================')
print('Features More Common FARC than ASG')
print('=========================================================================')
display(asgfarc.sort_values('diff').head(5))

print('=========================================================================')
print('Features Similar in Both')
print('=========================================================================')
inboth = (asgfarc['Abu Sayyaf Group (ASG)']>0.005) & (asgfarc['Revolutionary Armed Forces of Colombia (FARC)']>0.005)
display(abs(asgfarc.loc[inboth]).sort_values('diff'))


Abu Sayyaf Group (ASG)


Philippines    451
Malaysia        19
Name: country_txt, dtype: int64

Philippines
New People's Army (NPA)                        2412
Abu Sayyaf Group (ASG)                          451
Moro Islamic Liberation Front (MILF)            363
Bangsamoro Islamic Freedom Movement (BIFM)      320
Islamic State of Iraq and the Levant (ISIL)       8
Muslim extremists                                 8
Death Squad                                       2
Name: gname, dtype: int64
Malaysia
Abu Sayyaf Group (ASG)                         19
Islamic State of Iraq and the Levant (ISIL)     1
Name: gname, dtype: int64
Regularly Mistaken for Abu Sayyaf Group (ASG)
Revolutionary Armed Forces of Colombia (FARC)     137
National Liberation Army of Colombia (ELN)        135
M-19 (Movement of April 19)                        63
New People's Army (NPA)                            49
Narco-Terrorists                                   44
Bangsamoro Islamic Freedom Movement (BIFM)         34
Muslim extremists                                  33
Communist Party of India - Maoist (CPI-

Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
Philippines,886.716898,0.0,886.716898
Malaysia,61.158641,0.0,61.158641
Commercial Maritime_targetsub,27.360445,0.0,27.360445
Maritime_target,16.881291,2.935877,13.945414
1_ransom,23.323789,17.410716,5.913073


Features More Common FARC than ASG


Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
Colombia,0.0,4511.815564,-4511.815564
1980s_yr,0.0,245.232366,-245.232366
1990s_yr,14.782683,206.957562,-192.174879
2000s_yr,46.66516,228.697851,-182.032691
2010s_yr,99.865236,209.914099,-110.048862


Features Similar in Both


Unnamed: 0,Abu Sayyaf Group (ASG),Revolutionary Armed Forces of Colombia (FARC),diff
Affiliated Institution_targetsub,0.967584,0.967584,0.000000
Bus Station/Stop_targetsub,0.446287,0.446287,0.000000
Port_targetsub,1.272966,1.272966,0.000000
Memorial/Cemetery/Monument_targetsub,0.867501,0.867501,0.000000
Religious Figures/Institutions_target,0.424257,0.383851,0.040405
Named Civilian_targetsub,3.579334,3.962835,0.383500
"Procession/Gathering (funeral, wedding, birthday, religious)_targetsub",0.415515,0.831031,0.415515
School/University/Educational Building_targetsub,1.508229,1.055760,0.452469
International NGO_targetsub,2.464745,1.848558,0.616186
1_suicide,2.079442,1.386294,0.693147


## Countries of ASG Predicted Events

ASG often predicted for events in countries they were never active in. Suggests country should be weighted much more heavily? Try much simpler model using only year, latitude, longitude?

In [31]:
# merge the predicted labels columns to the test_events df
test_events_svc = pd.merge(df_test, 
                       pd.DataFrame({'pred':labels_svc_pred},index=df_test.index),
                       left_index=True,right_index=True)

test_events_svc['true'] = test_events_svc['gname']==test_events_svc['pred']

print('======================================================================')
print('Countries of Events Incorrectly Predicted as Abu Sayyaf Group (ASG)')
print('======================================================================')
print(test_events_svc[(test_events_svc.pred=='Abu Sayyaf Group (ASG)') & (~test_events_svc.true)]['country_txt'].unique())
print('======================================================================')
print('Countries Where Abu Sayyaf Group (ASG) Carried Out Attacks')
print('======================================================================')
print(df[df.gname=='Abu Sayyaf Group (ASG)']['country_txt'].unique())

Countries of Events Incorrectly Predicted as Abu Sayyaf Group (ASG)
['Colombia' 'Panama' 'Philippines' 'Yemen' 'France' 'India' 'Algeria'
 'Libya' 'Israel' 'United Kingdom' 'Turkey' 'Russia' 'Peru' 'Ecuador'
 'Sri Lanka' 'Central African Republic' 'West Bank and Gaza Strip' 'Greece'
 'Brazil' 'Egypt' 'El Salvador' 'Zambia' 'Saudi Arabia' 'Honduras'
 'Nigeria' 'Bangladesh' 'Italy' 'Austria' 'Iraq' 'Kuwait' 'Belgium'
 'South Sudan' 'Guatemala' 'Canada' 'Denmark' 'Guadeloupe' 'Netherlands'
 'Lebanon' 'Bulgaria' 'Tunisia' 'Iran' 'Pakistan' 'Mali' 'Bolivia' 'Chile'
 'Georgia' 'Syria' 'Costa Rica' 'Kosovo' 'South Yemen' 'Chad' 'Botswana']
Countries Where Abu Sayyaf Group (ASG) Carried Out Attacks
['Philippines' 'Malaysia']


## Model Using tf-idf Type Weights Derived Above
NB: weights above calculated on all data. Should be only on training data.

In [None]:
# get rid of NaN weights
grp_aw.fillna(0,inplace=True)

# subtract neg_scale*weight for features not present in event
# but present in group
neg_scale = 0.25 

# matrix multiply test events by group weights for each feature
tfidf_events = pd.DataFrame(np.inner(X_test,grp_aw),index=X_test.index,columns=grp_aw.index)

# subtract contribution of negative cases
tfidf_events = tfidf_events - neg_scale*pd.DataFrame(np.inner(X_test.replace({0:1,1:0}),grp_aw),index=X_test.index,columns=grp_aw.index)

tfidf_pred = pd.DataFrame({'gname':y_test,'pred':tfidf_events.idxmax(axis=1)})
tfidf_pred['true'] = tfidf_pred.gname == tfidf_pred.pred

# stats on accuracy of model overall and per group
print('overall accuracy',sum(tfidf_pred.true)/len(tfidf_pred))

# calculate metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

print('precision',precision_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('recall',recall_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('accuracy',accuracy_score(tfidf_pred.gname, tfidf_pred.pred))
print('confusion matrix (top 20 grps):')
cmatrix = confusion_matrix(tfidf_pred.gname, tfidf_pred.pred,labels=top_grps)        
display(pd.DataFrame(cmatrix).iloc[:20,:20])

print('grp13: ',top_grps[13])
print('grp6: ',top_grps[6])
print('----------------------------')
print('grp14: ',top_grps[14])
print('grp11: ',top_grps[11])
print('----------------------------')
print('grp19: ',top_grps[19])
print('grp17: ',top_grps[17])
print('----------------------------')

In [None]:
'''
print('==========================')
print('event 17041')
print('==========================')
print('group predictions:')
display(tfidf_events.loc[17041].sort_values(ascending=False).head(10))
print('real group:',y_test.loc[17041].gname)

tmp=X_test.loc[17041]
tmp.index=grp_aw.loc['Revolutionary Armed Forces of Colombia (FARC)'].index

ev17041 = pd.DataFrame({'event':tmp,
              'FARC':grp_aw.loc['Revolutionary Armed Forces of Colombia (FARC)'],
              'ELN':grp_aw.loc['National Liberation Army of Colombia (ELN)']})

ev17041['diff'] = ev17041.ELN - ev17041.FARC
ev17041.loc[ev17041.event==0, 'diff'] = neg_scale*ev17041.loc[ev17041.event==0, 'diff']

display(ev17041[(ev17041.FARC>0) | (ev17041.ELN>0) | (ev17041.event>0)].sort_values('diff',ascending=False).head(10))
display(ev17041[(ev17041.FARC>0) | (ev17041.ELN>0) | (ev17041.event>0)].sort_values('diff',ascending=False).tail(10))

print('==========================')
print('Frequent groups with incorrect predictions')
print('==========================')
display(tfidf_pred[(~tfidf_pred.true)].gname.value_counts().head(10))

print('==========================')
print('Incorrect predictions')
print('==========================')
display(tfidf_pred[(~tfidf_pred.true)].pred.value_counts().head(10))
'''