In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Columns of interest - only these are read from the spreadsheet
cols_to_extract = ['imonth', 'iyear','iday',
 'country_txt','gname','attacktype1_txt',
 'success','suicide',
 'weaptype1_txt','weapsubtype1_txt',
 'targtype1_txt','targsubtype1_txt',
 'individual','nperps','claimed',
 'nkill','nwound',
 'property','propextent_txt',
 'ishostkid','nhostkid','hostkidoutcome_txt','ransom']

# Load data; the listed strings are read as missing values
df = pd.read_excel('GTD_0617dist/globalterrorismdb_0617dist.xlsx',
                   usecols=cols_to_extract,
                   na_values = ['Unknown','-99','-9','Not Applicable'])

# Numeric -9 / -99 codes are not caught by the string na_values above,
# so convert them to NaN explicitly
df = df.replace([-9, -99], np.nan)

# Entries with month or day recorded as 0: a missing month becomes January
# and a missing day becomes the 1st of the month.
# Assign the result back rather than calling replace(..., inplace=True) on
# df[col]: the chained in-place form acts on a column selection and does not
# update df under pandas Copy-on-Write.
df['imonth'] = df['imonth'].replace(0, 1)
df['iday'] = df['iday'].replace(0, 1)

# Create a date column from the year/month/day parts
df['date'] = pd.to_datetime(dict(year=df.iyear, month=df.imonth, day=df.iday))

# Keep the year (renamed), drop month and day, and index the frame by date
df = (df.rename(columns={'iyear': 'year'})
        .drop(['imonth', 'iday'], axis=1)
        .set_index('date'))

# If no claimed info - treat as not claimed
df['claimed'] = df['claimed'].fillna(0)

display(df.head(5))
df.info()

Unnamed: 0_level_0,year,country_txt,success,suicide,attacktype1_txt,targtype1_txt,targsubtype1_txt,gname,individual,nperps,...,weaptype1_txt,weapsubtype1_txt,nkill,nwound,property,propextent_txt,ishostkid,nhostkid,ransom,hostkidoutcome_txt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-07-02,1970,Dominican Republic,1,0,Assassination,Private Citizens & Property,Named Civilian,MANO-D,0,,...,,,1.0,0.0,0.0,,0.0,,0.0,
1970-01-01,1970,Mexico,1,0,Hostage Taking (Kidnapping),Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",23rd of September Communist League,0,7.0,...,,,0.0,0.0,0.0,,1.0,1.0,1.0,
1970-01-01,1970,Philippines,1,0,Assassination,Journalists & Media,Radio Journalist/Staff/Facility,,0,,...,,,1.0,0.0,0.0,,0.0,,0.0,
1970-01-01,1970,Greece,1,0,Bombing/Explosion,Government (Diplomatic),Embassy/Consulate,,0,,...,Explosives/Bombs/Dynamite,Unknown Explosive Type,,,1.0,,0.0,,0.0,
1970-01-01,1970,Japan,1,0,Facility/Infrastructure Attack,Government (Diplomatic),Embassy/Consulate,,0,,...,Incendiary,,,,1.0,,0.0,,0.0,


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 170350 entries, 1970-07-02 to 2016-12-30
Data columns (total 21 columns):
year                  170350 non-null int64
country_txt           170350 non-null object
success               170350 non-null int64
suicide               170350 non-null int64
attacktype1_txt       163925 non-null object
targtype1_txt         165477 non-null object
targsubtype1_txt      161005 non-null object
gname                 92044 non-null object
individual            170350 non-null int64
nperps                26173 non-null float64
claimed               170350 non-null float64
weaptype1_txt         156498 non-null object
weapsubtype1_txt      150924 non-null object
nkill                 160668 non-null float64
nwound                155025 non-null float64
property              150771 non-null float64
propextent_txt        41479 non-null object
ishostkid             169903 non-null float64
nhostkid              11358 non-null float64
ransom             

In [2]:
# Remove events with no group affiliation: the group name is missing, or the
# event is flagged in the 'individual' column.
# 'individual' is an integer 0/1 flag, so cast it to bool to keep the mask
# (and its negation below) strictly boolean.
no_grp = df.gname.isnull() | df.individual.astype(bool)
with_grp = ~no_grp

# Filter and drop the no-longer-needed 'individual' column in one chain, so
# df is a new frame instead of a filtered slice that is then mutated in place
df = df[with_grp].drop('individual', axis=1)

# Only keep the top n groups with the most incidents.
# n_groups is currently the number of distinct groups (i.e. keep them all);
# set it to e.g. 100 to restrict the analysis to the 100 most active groups.
n_groups = df['gname'].nunique()
top_grps = df['gname'].value_counts().head(n_groups).index
# .copy() so that later column assignments act on an independent frame
df = df[df.gname.isin(top_grps)].copy()

# Series.sum() counts the True values without a Python-level loop
print('Number of events affiliated with individuals or unknown group: ',no_grp.sum())
print('Number of events affiliated with a group: ',with_grp.sum())
# Report the actual n_groups rather than a hard-coded "100"
print('Number of events affiliated with top {} groups: '.format(n_groups),len(df))
print('')

df.info()

Number of events affiliated with individuals or unknown group:  78620
Number of events affiliated with a group:  91730
Number of events affiliated with top 100 groups:  91730

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 91730 entries, 1970-07-02 to 2016-12-30
Data columns (total 20 columns):
year                  91730 non-null int64
country_txt           91730 non-null object
success               91730 non-null int64
suicide               91730 non-null int64
attacktype1_txt       86743 non-null object
targtype1_txt         90183 non-null object
targsubtype1_txt      87871 non-null object
gname                 91730 non-null object
nperps                16032 non-null float64
claimed               91730 non-null float64
weaptype1_txt         81854 non-null object
weapsubtype1_txt      78526 non-null object
nkill                 85163 non-null float64
nwound                80723 non-null float64
property              82962 non-null float64
propextent_txt        21367 non-null

In [3]:
# Categorise some columns to help feature extraction later.
# Every category label carries the column name as a suffix, so values from
# different columns stay distinguishable.

# Numeric columns - convert to categories for 0, 1, 2-10, and more than 10.
conv_numeric = ['nkill','nwound','nperps','nhostkid']

# Use an open-ended upper edge instead of max(df[col])+0.1: Python's builtin
# max() returns NaN when the first element of the column is NaN, which
# produces a NaN bin edge, and a column whose largest value is below 10.9
# would produce non-increasing bins.
count_bin_edges = [-0.1, 0.9, 1.9, 10.9, np.inf]

for col in conv_numeric:
    df[col] = pd.cut(df[col],
                     count_bin_edges,
                     labels=['0_'+col,'1_'+col,'2to10_'+col,'11+_'+col])

# Boolean columns - replace 0/1 by labels with the column name as suffix
conv_bool = ['success','suicide','claimed','ishostkid','ransom','property']

for col in conv_bool:
    # assign back rather than replace(..., inplace=True) on df[col], which
    # acts on a column selection and does not update df under Copy-on-Write
    df[col] = df[col].replace({0:('0_'+col),1:('1_'+col)})

# Bin year in to decades
df['year'] = pd.cut(df['year'],
                    [1969.9,1979.9,1989.9,1999.9,2009.9,2019.9],
                    labels=['1970s_yr','1980s_yr','1990s_yr','2000s_yr','2010s_yr'])

df.head()

  if (np.diff(bins) < 0).any():


Unnamed: 0_level_0,year,country_txt,success,suicide,attacktype1_txt,targtype1_txt,targsubtype1_txt,gname,nperps,claimed,weaptype1_txt,weapsubtype1_txt,nkill,nwound,property,propextent_txt,ishostkid,nhostkid,ransom,hostkidoutcome_txt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1970-07-02,1970s_yr,Dominican Republic,1_success,0_suicide,Assassination,Private Citizens & Property,Named Civilian,MANO-D,,0_claimed,,,1_nkill,0_nwound,0_property,,0_ishostkid,,0_ransom,
1970-01-01,1970s_yr,Mexico,1_success,0_suicide,Hostage Taking (Kidnapping),Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",23rd of September Communist League,2to10_nperps,0_claimed,,,0_nkill,0_nwound,0_property,,1_ishostkid,1_nhostkid,1_ransom,
1970-01-01,1970s_yr,United States,1_success,0_suicide,Armed Assault,Police,"Police Building (headquarters, station, school)",Black Nationalists,,0_claimed,Firearms,Unknown Gun Type,0_nkill,0_nwound,1_property,Minor (likely < $1 million),0_ishostkid,,0_ransom,
1970-01-02,1970s_yr,Uruguay,0_success,0_suicide,Assassination,Police,Police Security Forces/Officers,Tupamaros (Uruguay),2to10_nperps,0_claimed,Firearms,Automatic Weapon,0_nkill,0_nwound,0_property,,0_ishostkid,,0_ransom,
1970-01-02,1970s_yr,United States,1_success,0_suicide,Facility/Infrastructure Attack,Military,Military Recruiting Station/Academy,New Year's Gang,1_nperps,1_claimed,Incendiary,Molotov Cocktail/Petrol Bomb,0_nkill,0_nwound,1_property,Minor (likely < $1 million),0_ishostkid,,0_ransom,


In [4]:
# Compute tf-idf style weights for every feature value, per group.
# A high weight marks a value that is frequent in a group's incidents
# while occurring for few groups overall.

# number of incidents per group, used below to normalise the weights
grp_incs = df['gname'].value_counts()
grp_weights = {}

# every column other than gname is treated as a feature
for col in df.columns.drop('gname'):
    # table of counts: one row per group, one column per unique value of col
    grp_cnts = (df.groupby('gname')[col]
                  .value_counts()
                  .unstack(col, fill_value=0))

    # number of groups having at least one incident with each value ...
    n_grps_with_value = (grp_cnts > 0).sum()
    # ... turned in to an inverse-frequency weight log(n_groups/count)
    w_col = np.log(n_groups / n_grps_with_value)

    # weight the counts value-by-value, then divide each group's row by its
    # number of incidents so groups of different size can be compared
    weighted_cnts = grp_cnts * w_col
    grp_weights[col] = weighted_cnts.div(grp_incs, axis=0)

# join the per-column weight tables side by side in to one wide data frame
grp_aw = pd.DataFrame(index=top_grps)
for w_col in grp_weights.values():
    grp_aw = grp_aw.merge(w_col, left_index=True, right_index=True, how='outer')

In [7]:
# Print the highest-weighted features of the groups with the most incidents
n_print_grp=20    # how many groups to report
n_print_feat=10   # how many features to list per group

# grp_incs comes from value_counts(), so its first entries are the groups
# with the most incidents
busiest_grps = grp_incs.head(n_print_grp).index

for grp in busiest_grps:
    ranked_feats = grp_aw.loc[grp].sort_values(ascending=False)
    print(ranked_feats.head(n_print_feat))
    print('----------------------------------------------')

Afghanistan         4.322831
2010s_yr            1.130291
1_claimed           0.584537
2to10_nkill         0.533899
Police              0.491902
Unknown Gun Type    0.429259
2to10_nwound        0.384559
0_property          0.319380
NATO                0.311006
1_suicide           0.299226
Name: Taliban, dtype: float64
----------------------------------------------
Peru                      4.438904
1980s_yr                  0.890011
Automatic Weapon          0.458145
Unknown Explosive Type    0.390900
Electricity               0.356235
Firearms                  0.328385
Utilities                 0.318227
0_ransom                  0.308849
Bombing/Explosion         0.297954
1990s_yr                  0.296919
Name: Shining Path (SL), dtype: float64
----------------------------------------------
Iraq                           3.060015
2010s_yr                       1.366115
1_suicide                      0.716037
Vehicle                        0.654772
1_claimed                      0.481