# Identifying Features Associated with Groups and First Model Attempts

## Load Data

In [None]:
# Imports
import pandas as pd
import numpy as np

# Columns of interest: event id, year, location, group name, attack /
# weapon / target descriptors, perpetrator info, casualties, property
# damage and hostage details.
cols_to_extract = ['eventid', 'iyear',
 'country_txt','gname','attacktype1_txt',
 'success','suicide',
 'weaptype1_txt','weapsubtype1_txt',
 'targtype1_txt','targsubtype1_txt',
 'individual','nperps','claimed',
 'nkill','nwound',
 'property','propextent_txt',
 'ishostkid','nhostkid','hostkidoutcome_txt','ransom']

# Load data.
# `read_excel` has no `index=` keyword (the index column option is
# `index_col`), so the index is set explicitly after loading instead.
# The listed placeholder strings are read as NaN.
df = pd.read_excel('GTD_0617dist/globalterrorismdb_0617dist.xlsx',
                   usecols=cols_to_extract,
                   na_values=['Unknown','-99','-9','Not Applicable'])

# use the event id as the row index
df = df.set_index('eventid')

# strip the '_txt' marker from column names (literal match, not a regex)
df.columns = df.columns.str.replace('_txt', '', regex=False)

df.info()

## Extract events associated with groups of interest

In [None]:
# Remove events with no group affiliation: either the group name is
# missing or the event is flagged as carried out by an individual.
# The flag is cast to bool explicitly (missing treated as "not individual")
# so the OR below is a pure boolean operation even if the column is float.
is_individual = df['individual'].fillna(0).astype(bool)
no_grp = df['gname'].isnull() | is_individual
with_grp = ~no_grp

# keep affiliated events; the 'individual' column is no longer needed
df = df.loc[with_grp].drop(columns='individual')

# only keep the top n groups with the most incidents
n_groups = 50 #for all groups: df['gname'].nunique()

top_grps = df['gname'].value_counts().head(n_groups).index
df = df[df['gname'].isin(top_grps)]

print('Number of events affiliated with individuals or unknown group: ',no_grp.sum())
print('Number of events affiliated with a group: ',with_grp.sum())
print('Number of events affiliated with top {} groups: {}'.format(n_groups,len(df)))

## Modify some unwanted columns and values

In [None]:
# Replace numeric "unknown" codes that the import's na_values did not catch,
# and rename the year column. Done as a chain that returns a new frame so
# the later column assignments never act on a slice of an older frame.
df = (df.replace([-9, -99], np.nan)
        .rename(columns={'iyear': 'year'}))

# If no claimed info - treat as not claimed
df['claimed'] = df['claimed'].fillna(0)

# Remove some values that don't give useful information.
# Assign the result back instead of `df[col].replace(..., inplace=True)`:
# in-place edits on a selected column are not guaranteed to reach `df`.
uninformative = {
    'weaptype1': ['Other'],
    'weapsubtype1': ['Unknown Gun Type', 'Unknown Explosive Type',
                     'Other Explosive Type', 'Unknown Weapon Type',
                     'Other Gun Type'],
    'targtype1': ['Other'],
    'targsubtype1': ['Other Personnel',
                     'Other (including online news agencies)',
                     'Other Facility'],
}

for col, values in uninformative.items():
    df[col] = df[col].replace(values, np.nan)


df.info()

## Categorise some columns to reduce no. features

In [None]:
# Numeric columns - convert to values for 0, 1, 2-10, and more than 10
# (the last bin keeps the original label '11').
conv_numeric = ['nkill','nwound','nperps','nhostkid']

# The top edge is open-ended (inf) rather than max(column)+0.1: the builtin
# max() is unreliable when the column contains NaN, and a column whose
# maximum is below 10.9 would produce non-increasing bin edges.
count_bins = [-0.1, 0.9, 1.9, 10.9, np.inf]

for col in conv_numeric:
    df[col] = pd.cut(df[col], count_bins, labels=['0','1','2to10','11'])

# boolean columns - convert to yes/no to help identification in dummy variables later
conv_bool = ['success','suicide','claimed','property','ishostkid','ransom']

# assign back rather than replacing in place on a selected column
for col in conv_bool:
    df[col] = df[col].replace({0:'no',1:'yes'})

# bin year in to decades
df['year'] = pd.cut(df['year'],
                    [1969.9,1979.9,1989.9,1999.9,2009.9,2019.9],
                    labels=['1970s','1980s','1990s','2000s','2010s'])

display(df.head())

## Create training and testing datasets

In [None]:
from sklearn.model_selection import train_test_split

# target: the group name of every event
y = df['gname']
display(y.head())

# predictors: every remaining column
X = df.drop(columns='gname')
display(X.head())

# 70/30 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

for split_name, feats, labs in (('Train', X_train, y_train),
                                ('Test', X_test, y_test)):
    print('{}: X shape ='.format(split_name), feats.shape, ', Y shape=', labs.shape)

# Combined frames holding features and label side by side (used further down).
df_train = X_train.assign(gname=y_train)
df_test = X_test.assign(gname=y_test)

## Create Dummy variables for Each Category Value

Gives a 0/1 indicator column for each unique value.

In [None]:
# Labels.
# Encode on the full dataset first so train and test share the same columns.
# dtype=int pins the indicators to 0/1 integers: the default indicator dtype
# differs between pandas versions, and the integer-based replace
# ({0:1, 1:0}) applied to these frames later only matches integer values.
y_dum = pd.get_dummies(y, dtype=int)
y_dum_train = y_dum.loc[y_train.index]
y_dum_test = y_dum.loc[y_test.index]

display(y_dum_train.head())

# Features.
# Same approach: encode once, then select the rows of each split.
X_dum = pd.get_dummies(X, dtype=int)
X_dum_train = X_dum.loc[X_train.index]

# NaN values in test data can destroy predictions so remove them
# (returns a new frame instead of mutating the selection in place)
X_dum_test = X_dum.loc[X_test.index].fillna(0)

display(X_dum_train.head())

## Calculate Characterising Values for each Group
i.e. the feature values that are most characteristic of an attack by each group

In [None]:
# no. incidents associated with each group
grp_incs = y.value_counts()

# add group name column to the data frame of dummies
df_dum_train = X_dum_train.copy()
df_dum_train['gname'] =  y_train

# For each group (rows) and each dummy feature (columns): how many training
# events of that group have the feature. One groupby covers every column.
grp_cnts = df_dum_train.groupby('gname').sum()

# For each feature: how many groups have at least one event with it.
n_grps_with = (grp_cnts > 0).sum()

# Features that never occur in the training split get no weight.
for col in n_grps_with.index[n_grps_with == 0]:
    print('Warning: No instances of',col)

# tf-idf style weight log(n_groups/count); zero counts are masked to NaN
# first so they do not divide by zero (they end up as 0 weight below).
w_col = np.log(n_groups / n_grps_with.where(n_grps_with > 0))

# Multiply each feature's weight by its number of occurrences per group,
# line the rows up with the top groups and get rid of NaN weights.
w_tfidf = (grp_cnts.mul(w_col, axis='columns')
                   .reindex(top_grps)
                   .fillna(0))

w_tfidf.info()

## Print Top n Features for Top m Groups

In [None]:
# how many groups to report, and how many features per group
n_print_grp = 10
n_print_feat = 5

# walk the groups with the most incidents and show their heaviest weights
for grp in grp_incs.index[:n_print_grp]:
    grp_weights = w_tfidf.loc[grp]
    ranked = grp_weights.sort_values(ascending=False)
    print(ranked.head(n_print_feat))
    print('----------------------------------------------')

## Model Using tf-idf Type Weights Derived Above
NB: the weights above are calculated from the training split only (`df_dum_train`); the group incident counts (`grp_incs`) are still taken from the full dataset.

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

# subtract neg_scale*weight for features not present in event
# but present in group
neg_scale = 0.25

# Indicator matrices for "feature present" and "feature absent".
# The cast to float makes the complement well defined whatever dtype the
# dummies have (an integer replace of 0<->1 does not match boolean values).
X_present = X_dum_test.astype(float)
X_absent = 1.0 - X_present

# matrix multiply test events by group weights for each feature
pos_score = np.inner(X_present, w_tfidf)

# contribution of the features missing from each event
neg_score = np.inner(X_absent, w_tfidf)

tfidf_events = pd.DataFrame(pos_score - neg_scale*neg_score,
                            index=X_dum_test.index,
                            columns=w_tfidf.index)

# predicted group = highest score; 'true' flags a correct prediction
tfidf_pred = pd.DataFrame({'gname':y_test,'pred':tfidf_events.idxmax(axis=1)})
tfidf_pred['true'] = tfidf_pred.gname == tfidf_pred.pred

# stats on accuracy of model overall and per group
print('overall accuracy',tfidf_pred.true.mean())

# calculate metrics
print('precision',precision_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('recall',recall_score(tfidf_pred.gname, tfidf_pred.pred, labels=top_grps,average='micro'))
print('accuracy',accuracy_score(tfidf_pred.gname, tfidf_pred.pred))
print('confusion matrix (top 20 grps):')
cmatrix = confusion_matrix(tfidf_pred.gname, tfidf_pred.pred,labels=top_grps)
display(pd.DataFrame(cmatrix).iloc[:20,:20])

# names behind selected row/column numbers of the confusion matrix
for first, second in ((13, 6), (14, 11), (19, 17)):
    print('grp{}: '.format(first),top_grps[first])
    print('grp{}: '.format(second),top_grps[second])
    print('----------------------------')

In [None]:
# rows of the tf-idf results where the predicted group was wrong
tfidf_wrong = tfidf_pred[~tfidf_pred.true]

# first table: which true groups are missed most often;
# second table: which groups are wrongly predicted most often
for heading, column in (('Frequent groups with incorrect predictions', 'gname'),
                        ('Incorrect predictions', 'pred')):
    print('==========================')
    print(heading)
    print('==========================')
    display(tfidf_wrong[column].value_counts().head(10))


## Fit a Classifier to the Data

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score, accuracy_score

# feature matrices with any missing indicator treated as "absent"
X_fit = X_dum_train.fillna(0)
X_eval = X_dum_test.fillna(0)

# one binary LinearSVC per group column of the label indicator matrix
model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_fit, y_dum_train.fillna(0))

# hard 0/1 predictions (one column per group) and the underlying scores
y_svc_pred = model.predict(X_eval)
y_svc_score = model.decision_function(X_eval)

print('Accuracy score:',accuracy_score(y_dum_test,y_svc_pred))

# average precision ranks events by confidence, so it is given the
# continuous decision scores rather than the thresholded predictions
print('Precision score:',average_precision_score(y_dum_test,y_svc_score))

## Investigate the Results of the Classifier

In [None]:
# Extract one predicted group for each event in the test data.
# The hard predictions are a 0/1 matrix with one column per group, and a row
# can contain no 1 at all; argmax over such a row returns 0, which would
# silently assign the event to the first group column. Taking the argmax of
# the decision scores always picks the group the classifier favours most.
svc_scores = model.decision_function(X_dum_test.fillna(0))
labels_svc_pred = y_dum_test.columns[svc_scores.argmax(axis=1)]

# create a data frame of labels and predictions
labels_svc = pd.DataFrame({'true':y_test.values, 'pred':labels_svc_pred.values})

# was the prediction correct?
labels_svc['correct'] = labels_svc['pred']==labels_svc['true']

# labelled correctly / total events, per true group
# (the mean of the boolean flag gives 0 rather than NaN for a group with no hits)
frac_true_svc = labels_svc.groupby('true')['correct'].mean().sort_values(ascending=False)

# predicted correctly / predicted total, per predicted group
frac_pred_svc = labels_svc.groupby('pred')['correct'].mean().sort_values(ascending=False)


print('-------------------------------------------')
print('Groups commonly labelled incorrectly:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Common incorrect predictions:')
print('-------------------------------------------')
display(labels_svc.loc[~labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.tail(5).sort_values())
print('-------------------------------------------')
print('Groups commonly labelled correctly:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'true'].value_counts().head(5))
display(frac_true_svc.head(5))
print('-------------------------------------------')
print('Common correct predictions:')
print('-------------------------------------------')
display(labels_svc.loc[labels_svc.correct,'pred'].value_counts().head(5))
display(frac_pred_svc.head(5))

## Abu Sayyaf Group (ASG): Frequently Predicted Wrongly

A look at some of the features of ASG events, and the groups that are often mistaken for ASG

In [None]:
def _banner(title, rule):
    """Print `title` framed by a line of `rule` above and below."""
    print(rule)
    print(title)
    print(rule)


# the three rule widths used for the section banners in this cell
rule_short = '============================='
rule_mid = '================================================'
rule_long = '========================================================================='

# countries of the events attributed to ASG
_banner('Abu Sayyaf Group (ASG)', rule_short)
asg = df[df.gname == 'Abu Sayyaf Group (ASG)']
display(asg['country'].value_counts())

# groups recorded in each of two countries
for country_name in ('Philippines', 'Malaysia'):
    _banner(country_name, rule_short)
    print(df[df.country == country_name]['gname'].value_counts())

# true groups of the events that the classifier wrongly labelled as ASG
_banner('Regularly Mistaken for Abu Sayyaf Group (ASG)', rule_mid)
wrongly_asg = (~labels_svc.correct) & (labels_svc.pred == 'Abu Sayyaf Group (ASG)')
print(labels_svc.loc[wrongly_asg].true.value_counts().head(10))

# side-by-side tf-idf weights of ASG and FARC, plus their difference
asg_w = w_tfidf.loc['Abu Sayyaf Group (ASG)']
farc_w = w_tfidf.loc['Revolutionary Armed Forces of Colombia (FARC)']
asgfarc = pd.DataFrame({'Abu Sayyaf Group (ASG)': asg_w,
                        'Revolutionary Armed Forces of Colombia (FARC)': farc_w})
asgfarc['diff'] = (asgfarc['Abu Sayyaf Group (ASG)']
                   - asgfarc['Revolutionary Armed Forces of Colombia (FARC)'])

# ascending by difference: FARC-heavy features first, ASG-heavy last
by_diff = asgfarc.sort_values('diff')

_banner('Features More Common in ASG than FARC', rule_mid)
display(by_diff.tail(5).sort_values(by='diff', ascending=False))

_banner('Features More Common FARC than ASG', rule_long)
display(by_diff.head(5))

# features carrying a weight above 0.005 for both groups
_banner('Features Similar in Both', rule_long)
inboth = ((asgfarc['Abu Sayyaf Group (ASG)'] > 0.005)
          & (asgfarc['Revolutionary Armed Forces of Colombia (FARC)'] > 0.005))
display(asgfarc.loc[inboth].abs().sort_values('diff'))


## Countries of ASG Predicted Events

ASG is often predicted for events in countries where the group was never active. This suggests that country should be weighted much more heavily. Would a much simpler model using only year, latitude and longitude work better?

In [None]:
# attach the predicted label to every test event (rows matched on the index)
pred_frame = pd.DataFrame({'pred': labels_svc_pred}, index=df_test.index)
test_events_svc = df_test.merge(pred_frame, left_index=True, right_index=True)

# flag the events whose prediction matches the recorded group
test_events_svc['true'] = test_events_svc['gname'] == test_events_svc['pred']

# test events predicted as ASG although the recorded group is different
asg_false_hits = test_events_svc[(test_events_svc.pred == 'Abu Sayyaf Group (ASG)')
                                 & (~test_events_svc.true)]

# every event in the data recorded as ASG
asg_events = df[df.gname == 'Abu Sayyaf Group (ASG)']

print('======================================================================')
print('Countries of Events Incorrectly Predicted as Abu Sayyaf Group (ASG)')
print('======================================================================')
print(asg_false_hits['country'].unique())
print('======================================================================')
print('Countries Where Abu Sayyaf Group (ASG) Carried Out Attacks')
print('======================================================================')
print(asg_events['country'].unique())

## Naive Bayes - train

In [None]:
# number of training events (denominator for the marginal probabilities)
n_train_events = len(X_dum_train)

# P(x): share of training events in which each feature value occurs
p_x = X_dum_train.sum().div(n_train_events)
display(p_x.sort_values(ascending=False).head())

# P(y): share of training events belonging to each group
grp_incs_train = y_train.value_counts()
p_y = grp_incs_train.div(n_train_events)
display(p_y.head())

# P(x|y): per-group feature counts divided by that group's event count
feature_counts_by_grp = df_dum_train.groupby('gname').sum()
p_xgy = feature_counts_by_grp.divide(grp_incs_train, axis=0)
display(p_xgy.head())

## Naive Bayes - Test

In [None]:
# Series to store the predicted group name of each test event.
# dtype=object because the stored values are strings; without it the empty
# Series defaults to a float dtype that cannot hold them.
y_baies = pd.Series(index=X_dum_test.index, dtype=object)

# numeric copy of the indicators so the products below are float arithmetic
X_nb_test = X_dum_test.astype(float)

for event in X_nb_test.index:
    # likelihood for each feature given each group:
    # P(x|y) where the event has the feature, 0 where it does not
    probs = (X_nb_test.loc[event]*p_xgy).T

    # keep only the features with a non-zero likelihood for at least one group
    probs = probs[probs.sum(axis=1)>0]

    # multiply likelihoods over the remaining features, then by the prior P(y)
    # (left unnormalised: only the arg-max is needed)
    probs = probs.product().multiply(p_y)

    # store group with max likelihood; `probs` is one-dimensional here,
    # so idxmax takes no axis argument
    y_baies.loc[event] = probs.idxmax()

df_baies = pd.DataFrame({'gname':y_test,'pred':y_baies})
df_baies['true'] = df_baies.gname == df_baies.pred

print('overall accuracy',df_baies.true.mean())


In [None]:
# recorded and predicted group of every test event
nb_actual = df_baies.gname
nb_predicted = df_baies.pred

# micro-averaged scores restricted to the top groups
micro_opts = dict(labels=top_grps, average='micro')

print('precision', precision_score(nb_actual, nb_predicted, **micro_opts))
print('recall', recall_score(nb_actual, nb_predicted, **micro_opts))
print('accuracy', accuracy_score(nb_actual, nb_predicted))

# confusion matrix ordered like top_grps; only the first 20 rows/columns shown
print('confusion matrix (top 20 grps):')
cmatrix = confusion_matrix(nb_actual, nb_predicted, labels=top_grps)
display(pd.DataFrame(cmatrix).iloc[:20, :20])

# names behind selected row/column numbers of the matrix above
for grp_a, grp_b in ((13, 6), (14, 11), (19, 17)):
    print('grp{}: '.format(grp_a), top_grps[grp_a])
    print('grp{}: '.format(grp_b), top_grps[grp_b])
    print('----------------------------')