# Imports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

from fastai.column_data import *
# Keep printed numpy arrays short: per the numpy docs, arrays with more than
# `threshold` elements are summarised, showing `edgeitems` items at each end.
np.set_printoptions(threshold=50, edgeitems=20)

# Initial load of the data from the original Excel file

In [7]:
def display_all(df):
    """Render `df` with pandas' row and column display limits raised to 1000.

    The raised limits apply only while `df` is being displayed; they are set
    through a pandas option context rather than changed globally.
    """
    limit = 1000
    wide_view = pd.option_context('display.max_rows', limit,
                                  'display.max_columns', limit)
    with wide_view:
        display(df)

In [6]:
# Directory holding the GTD data files; note the value ends with a slash.
PATH = "data/gtd/"

# List the directory contents via the shell ({PATH} is interpolated by IPython).
!ls {PATH}

answer1				 gtd_13to16_0617dist.xlsx  gtd_full.csv
answer-rf.csv			 gtd1993_0617dist.xlsx	   gtd-raw-allcats-dt
Codebook.pdf			 gtd_70to94_0617dist.xlsx  TermsofUse.pdf
globalterrorismdb_0617dist.xlsx  gtd_95to12_0617dist.xlsx


In [None]:
% time df_raw = pd.read_excel(f"{PATH}/globalterrorismdb_0617dist.xlsx")

In [None]:
# Show the last rows transposed so that every field is visible as a row.
# Uses df_raw, the frame loaded above (`xl` is not defined anywhere in this notebook).
display_all(df_raw.tail().T)

In [None]:
# Summary statistics for every column (numeric and non-numeric), transposed for readability.
# Uses df_raw, the frame loaded above (`xl` is not defined anywhere in this notebook).
display_all(df_raw.describe(include='all').T)

In [None]:
% time df_raw.to_feather('tmp/gtd-raw')

# Load gtd_raw_sum

In [8]:
# Reload a cached frame from feather and report its (rows, columns) shape.
# NOTE(review): no visible cell writes 'tmp/gtd_raw_sum' (the save cell before
# this one writes 'tmp/gtd-raw') — confirm where this file is produced.
%time df_raw = pd.read_feather('tmp/gtd_raw_sum')
df_raw.shape

CPU times: user 458 ms, sys: 261 ms, total: 719 ms
Wall time: 2.25 s


(170350, 105)

In [10]:
# For every column build a (distinct-value count, dtype, column name) tuple.
col_names = list(df_raw.columns)
unique_counts = [len(df_raw[name].unique()) for name in col_names]
a = list(zip(unique_counts, df_raw.dtypes, col_names))
len(a),a

(105,
 [(170350, dtype('int64'), 'eventid'),
  (46, dtype('int64'), 'iyear'),
  (13, dtype('int64'), 'imonth'),
  (32, dtype('int64'), 'iday'),
  (2, dtype('int64'), 'extended'),
  (1862, dtype('<M8[ns]'), 'resolution'),
  (205, dtype('O'), 'country_txt'),
  (12, dtype('O'), 'region_txt'),
  (2495, dtype('O'), 'provstate'),
  (33958, dtype('O'), 'city'),
  (61029, dtype('float64'), 'latitude'),
  (60603, dtype('float64'), 'longitude'),
  (6, dtype('float64'), 'specificity'),
  (3, dtype('int64'), 'vicinity'),
  (39817, dtype('O'), 'location'),
  (2, dtype('int64'), 'crit1'),
  (2, dtype('int64'), 'crit2'),
  (2, dtype('int64'), 'crit3'),
  (3, dtype('int64'), 'doubtterr'),
  (6, dtype('O'), 'alternative_txt'),
  (2, dtype('int64'), 'multiple'),
  (2, dtype('int64'), 'success'),
  (2, dtype('int64'), 'suicide'),
  (9, dtype('O'), 'attacktype1_txt'),
  (10, dtype('O'), 'attacktype2_txt'),
  (8, dtype('O'), 'attacktype3_txt'),
  (22, dtype('O'), 'targtype1_txt'),
  (111, dtype('O'), 'targ

In [12]:
# Keep only the column summaries whose distinct-value count exceeds n.
n = 10000
a0 = list(filter(lambda entry: entry[0] > n, a))
len(a0),a0

(15,
 [(170350, dtype('int64'), 'eventid'),
  (33958, dtype('O'), 'city'),
  (61029, dtype('float64'), 'latitude'),
  (60603, dtype('float64'), 'longitude'),
  (39817, dtype('O'), 'location'),
  (31300, dtype('O'), 'corp1'),
  (82975, dtype('O'), 'target1'),
  (13034, dtype('O'), 'motive'),
  (18247, dtype('O'), 'weapdetail'),
  (18345, dtype('O'), 'propcomment'),
  (14151, dtype('O'), 'addnotes'),
  (75888, dtype('O'), 'scite1'),
  (56563, dtype('O'), 'scite2'),
  (32563, dtype('O'), 'scite3'),
  (20830, dtype('O'), 'related')])

In [13]:
# Inspect the dtype name of one of the high-cardinality text columns.
df_raw['scite1'].dtype.name

'object'

In [15]:
# Frequency of each distinct value in one column, most frequent first;
# change `cname` to inspect a different column.
cname = 'scite2'
df_raw[cname].value_counts()

Christopher Hewitt, "Political Violence and Terrorism in Modern America: A Chronology," Praeger Security International, 2005.                                                                                                                                                                   134
"Iraq: Security Roundup 1900 GMT 27 September 2016," Summary, September 27, 2016.                                                                                                                                                                                                               100
"Attack on 80 electricity towers in Anbar leaves 2 casualties," Iraqi News, December 1, 2016.                                                                                                                                                                                                    63
"IS ignites sectarian tensions in multiple Iraq bombings," Middle East Eye, January 12, 2016.                               

# convert datetime

In [54]:
# Replace zero month values with mid-year (6) and zero day values with
# mid-month (15) so that every row can be parsed as a real calendar date.
df_raw['imonth'] = df_raw['imonth'].replace(0, 6)
df_raw['iday'] = df_raw['iday'].replace(0, 15)

In [55]:
# Sanity check: both selections should have 0 rows once no zero months or days remain.
df_raw[df_raw['imonth']==0].shape, df_raw[df_raw['iday']==0].shape

((0, 105), (0, 105))

In [56]:
# Join year, month and day as 'Y/M/D' text, parse that into a single datetime
# column, then discard the three source columns in one call.
year_txt, month_txt, day_txt = (df_raw[part].astype(str) for part in ('iyear', 'imonth', 'iday'))
df_raw['mydate'] = pd.to_datetime(year_txt + '/' + month_txt + '/' + day_txt, format='%Y/%m/%d')
df_raw.drop(['iyear', 'imonth', 'iday'], axis=1, inplace=True)

In [57]:
# 'mydate' was already parsed with pd.to_datetime when it was created, so a
# second conversion is redundant; just confirm the frame's shape.
df_raw.shape

(170350, 103)

In [58]:
# Expand 'mydate' into derived date-part columns using the fastai helper.
# The call mutates df_raw directly (nothing is assigned back), so re-running
# this cell on the same frame is not expected to work.
add_datepart(df_raw, 'mydate')
df_raw.shape

(170350, 115)

In [59]:
# Cache the frame with the date columns expanded; the split section reloads this file.
%time df_raw.to_feather('tmp/gtd-raw-allcats-dt')

CPU times: user 407 ms, sys: 149 ms, total: 556 ms
Wall time: 886 ms


# Split data into train, valid, test sets

In [3]:
# Start from the cached, date-expanded frame written by the datetime section.
df_raw = pd.read_feather('tmp/gtd-raw-allcats-dt')

In [4]:
# Rows whose group name is 'Unknown' become the set to predict; all other rows
# are used for training. Copies are taken so later edits do not touch df_raw.
unknown_mask = df_raw['gname'] == 'Unknown'
df_test = df_raw[unknown_mask].copy()
df_trn = df_raw[~unknown_mask].copy()
df_test.shape, df_trn.shape

((78306, 115), (92044, 115))

# Convert as many columns as possible to categoricals

In [5]:
# Columns to leave as they are; every other column is converted to a categorical below.
# NOTE(review): only three of these names still match a column in df_raw
# (115 columns - 112 cats = 3); 'iyear', 'imonth' and 'iday' were dropped when
# 'mydate' was built, so listing them here has no effect.
xcats = ['eventid', 'iyear', 'imonth', 'iday', 'latitude', 'longitude']

cats = [c for c in df_raw.columns if c not in xcats]
len(cats), len(xcats), df_raw.shape

(112, 6, (170350, 115))

In [6]:
# Turn each selected training column into an ordered pandas categorical.
for v in cats:
    df_trn[v] = df_trn[v].astype('category').cat.as_ordered()

In [7]:
# fastai helper, called for its effect on df_test (nothing is assigned back).
# Presumably it re-encodes df_test's columns with df_trn's categories so both
# frames share the same codes — confirm against the fastai apply_cats docs.
apply_cats(df_test, df_trn)

# myidxs, split_vals, proc_df

In [8]:
# split into training and valid indexes
seed = 42
def myidxs(n, cv_idx=0, val_pct=0.2, seed=seed):
    """Shuffle ``range(n)`` and split it into validation and training indexes.

    Args:
        n: number of rows to split.
        cv_idx: which validation fold to use; fold ``k`` covers positions
            ``k*n_val`` to ``(k+1)*n_val`` of the shuffled indexes.
        val_pct: fraction of the rows placed in the validation fold.
        seed: seed passed to ``np.random.seed`` (this reseeds numpy's global
            generator) so the shuffle is repeatable.

    Returns:
        ``(val_idxs, trn_idxs)``: two integer arrays that together contain
        every index in ``range(n)`` exactly once.
    """
    np.random.seed(seed)
    n_val = int(val_pct*n)
    idx_start = cv_idx*n_val
    idxs = np.random.permutation(n)
    val_idxs = idxs[idx_start:idx_start+n_val]
    # Training rows are everything outside the validation fold, on BOTH sides
    # of it; taking only the tail would silently drop rows when cv_idx > 0.
    trn_idxs = np.concatenate([idxs[:idx_start], idxs[idx_start+n_val:]])
    return val_idxs, trn_idxs

In [9]:
# Build validation/training row positions for the training frame using the
# defaults of myidxs (first fold, 20% validation).
val_idx, trn_idx = myidxs(df_trn.shape[0])
val_idx.shape, trn_idx.shape

((18408,), (73636,))

In [10]:
def split_vals(a,t,v):
    """Split `a` into training and validation parts by row position.

    Args:
        a: a pandas DataFrame/Series, or an array-like (e.g. a numpy array).
        t: integer positions of the training rows.
        v: integer positions of the validation rows.

    Returns:
        ``(trn, val)``: copies of the selected rows, of the same kind as `a`
        (array-likes are returned as numpy arrays).
    """
    # Check the type of `a` itself rather than comparing it with whatever the
    # globals `df` and `y` happen to hold at call time.
    if isinstance(a, (pd.DataFrame, pd.Series)):
        return a.iloc[t].copy(), a.iloc[v].copy()
    # Index with the position array directly: `a[[t]]` relies on deprecated
    # numpy behaviour and adds a leading axis on current numpy versions.
    arr = np.asarray(a)
    return arr[t].copy(), arr[v].copy()

In [12]:
# fastai helper: separates the 'gname' target (y) from the features (df).
# `nas` is passed to the test-set call so both frames are processed alike.
df, y, nas = proc_df(df_trn, 'gname')

In [13]:
# Process the test frame the same way, passing in the training `nas`.
# Note that `nas` is rebound to this call's return value.
dfts, yts, nas = proc_df(df_test, 'gname', na_dict=nas)

In [14]:
# Train and test feature frames should have the same number of columns.
df.shape, y.shape, dfts.shape, yts.shape

((92044, 116), (92044,), (78306, 116), (78306,))

In [15]:
# Split features and target with the same index arrays so rows stay aligned.
X_train, X_valid = split_vals(df,trn_idx,val_idx)
y_train, y_valid = split_vals(y,trn_idx,val_idx)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((73636, 116), (18408, 116), (73636,), (18408,))

# the Model

In [16]:
def print_score(m):
    """Print the model's training score, validation score and, if present, OOB score.

    Scores come from ``m.score`` on the notebook-level ``X_train``/``y_train``
    and ``X_valid``/``y_valid``. The out-of-bag score is appended only when the
    fitted model exposes ``oob_score_``, so models fitted without
    ``oob_score=True`` no longer raise ``AttributeError`` here.
    """
    scores = [m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    # Same space-separated, single-line output as printing the values directly.
    print(*scores)

In [17]:
m = RandomForestClassifier(n_estimators=40, max_features=0.5, n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 11min 19s, sys: 21.1 s, total: 11min 40s
Wall time: 1min 54s
0.999619751208648 0.8538135593220338 0.8431202129393232


In [18]:
m = RandomForestClassifier(n_estimators=100, max_features=0.5, n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 28min 32s, sys: 51.4 s, total: 29min 23s
Wall time: 4min 45s
0.9999728393720463 0.8576162538026945 0.8506437068825031


# Predict group names for the test set (rows where `gname` is 'Unknown')

In [19]:
% time ans = m.predict(dfts)

CPU times: user 2min 19s, sys: 51.7 s, total: 3min 11s
Wall time: 51.4 s


In [21]:
# Ordered list of the training frame's group-name categories; it is indexed by
# the predicted values below to turn them back into names.
anskey = df_trn.gname.cat.categories.tolist()

In [23]:
# Look up the group name for each predicted value, keeping the prediction order.
final_answer = list(map(anskey.__getitem__, ans))

In [25]:
# Attach the predicted names to the test frame.
# NOTE(review): this adds a column to the same frame that was passed to
# m.predict, so re-running the prediction cell after this one would feed the
# model an extra column.
dfts['final_answer'] = final_answer

In [26]:
# Preview the first rows; 'final_answer' is the last column.
dfts.head()

Unnamed: 0,eventid,extended,resolution,country_txt,region_txt,provstate,city,latitude,longitude,specificity,...,myIs_month_end,myIs_month_start,myIs_quarter_end,myIs_quarter_start,myIs_year_end,myIs_year_start,myElapsed,latitude_na,longitude_na,final_answer
2,197001000001,1,0,130,10,1532,22109,15.478598,120.599741,4,...,1,1,1,1,1,1,11,False,False,Muslims
3,197001000002,1,0,63,12,136,1591,37.983773,23.728157,1,...,1,1,1,1,1,1,11,False,False,Left-wing extremists
4,197001000003,1,0,83,4,0,7350,33.580412,130.396361,1,...,1,1,1,1,1,1,11,False,False,Airport Protesters
7,197001020002,1,0,177,7,307,15882,37.805065,-122.273024,1,...,1,1,1,1,1,1,2,False,False,Strikers
12,197001080001,1,0,80,12,863,18258,41.89052,12.494249,1,...,1,1,1,1,1,1,0,False,False,Popular Front for the Liberation of Palestine ...


In [27]:
# Write the processed test frame, including the predicted names, to CSV.
dfts.to_csv('data/gtd/gtd-finalanswer.csv')