In [199]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import seaborn as sns
import matplotlib.cm as cm
%matplotlib inline
# Notebook display settings: never truncate columns, and show up to 500 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 500

In [200]:
# Load PIRUS Data
# Reads the cleaned PIRUS CSV into the `pirus` DataFrame used by every cell below.
pirus = pd.read_csv('../data/clean_data/pirus_deep_clean_Final.csv')

# Load PIRUS Codebook
# The codebook is a JSON document that holds, per variable, a 'codes' mapping
# from code (as a string key) to a human-readable label.
# The encoding is pinned to UTF-8 (the JSON standard encoding): without it,
# open() falls back to the platform locale, which can mis-decode non-ASCII
# labels such as "Master’s degree" on systems whose default is not UTF-8.
with open('../data/clean_data/pirus_codebook.json', encoding='utf-8') as f:
    pirus_codebook = json.load(f)

# Sanity Checks and Housekeeping

In [176]:
# Show the label of every column present in the loaded PIRUS frame
available_columns = pirus.columns
available_columns

Index(['subject_id', 'loc_plot_state1', 'loc_plot_state1_us_dummy',
       'loc_plot_city1', 'date_exposure', 'plot_target1', 'attack_preparation',
       'op_security', 'anticp_fatals_targ', 'extent_plot', 'violent',
       'criminal_severity', 'criminal_charges', 'indict_arrest',
       'group_membership', 'terrorist_group_name1', 'actively_recruited',
       'recruiter1', 'role_group', 'length_group', 'clique',
       'clique_radicalize', 'clique_connect', 'internet_radicalization',
       'social_media', 'social_media_frequency', 'social_media_platform1',
       'social_media_activities1', 'radicalization_islamist',
       'radicalization_far_right', 'radicalization_far_left',
       'radicalization_single_issue', 'ideological_sub_category1',
       'loc_habitation_state1', 'loc_habitation_state1_us_dummy',
       'loc_habitation_city1', 'external_rad', 'radical_behaviors',
       'radical_beliefs', 'us_govt_leader', 'foreign_govt_leader',
       'event_influence1', 'radicalization

In [177]:
# Restrict the frame to the columns of interest (order is preserved as listed)
selected_columns = [
    'subject_id',
    'date_exposure',
    'plot_target1',
    'extent_plot',
    'violent',
    'role_group',
    'ideological_sub_category1',
    'radical_behaviors',
    'marital_status',
    'gender',
    'education',
    'employment_status',
    'work_history',
    'alcohol_drug',
    'loc_plot1_lat',
    'loc_plot1_long',
    'loc_habit1_lat',
    'loc_habit1_long',
    'year',
    'age',
    'broad_ethnicity',
]
pirus = pirus[selected_columns]

# Report how many values are missing in each retained column
missing_per_column = pirus.isna().sum()
print(missing_per_column)

subject_id                      0
date_exposure                   0
plot_target1                   20
extent_plot                    13
violent                         0
role_group                    126
ideological_sub_category1       0
radical_behaviors              76
marital_status                671
gender                          0
education                    1074
employment_status             948
work_history                 1017
alcohol_drug                    0
loc_plot1_lat                 226
loc_plot1_long                226
loc_habit1_lat                715
loc_habit1_long               715
year                            0
age                            53
broad_ethnicity               146
dtype: int64


In [188]:
# One-column frame of ideological sub-category codes: rows with a missing
# value are dropped and the remaining codes are cast to int, so that
# str(code) has no decimal part when it is used as a codebook key below.
ideologicalcategory = (
    pirus[['ideological_sub_category1']]
    .dropna(subset=['ideological_sub_category1'])
    .astype({'ideological_sub_category1': int})
)

# Codebook lookup: code (stringified) -> descriptive label
r_codes = pirus_codebook['ideological_sub_category']['codes']
target_fun = lambda x: r_codes[str(x)]

# Replace each code with its label, then tabulate how often each label occurs
ideologicalcategory = ideologicalcategory.assign(
    ideological_sub_category1=ideologicalcategory['ideological_sub_category1'].apply(target_fun)
)
ideologicalcategory['ideological_sub_category1'].value_counts()

White supremacist/KKK/Neo-Nazi                                    529
Islamist                                                          510
Anti-government/Sovereign Citizens movement                       196
Animal rights/Environmentalist                                    149
Anti-abortion                                                     128
Militia/gun rights                                                 78
Other                                                              69
Xenophobic/Anti-immigrant                                          60
Puerto Rican independence/Puerto Rican nationalist                 30
Anti-gay                                                           28
Black Nationalist/Black Separatist                                 25
Anarchist                                                          19
Jewish Defense League                                              13
Christian Identity                                                 12
Anti-capitalist/Comm

In [178]:
# Earliest and latest value found in the 'year' column
year_values = pirus['year']
Min, Max = year_values.min(), year_values.max()
print(f'Date range: {Min} to {Max}')

Date range: 1981 to 2018


In [179]:
#Gender
# Translate the gender codes in pirus['gender'] into their codebook labels
# (in place) and show how many subjects fall under each label.
e_codes = pirus_codebook['gender']['codes']

# This cell overwrites pirus['gender'], so on a second run the column already
# holds labels rather than codes and a plain codebook lookup would raise
# KeyError. Values that are already labels are therefore passed through
# unchanged; anything else must still be a known code (unknown codes keep
# raising KeyError, as before).
gender_labels = set(e_codes.values())
e_fun = lambda x: x if x in gender_labels else e_codes[str(x)]

pirus['gender'] = pirus['gender'].apply(e_fun)
pirus['gender'].value_counts()

Male      1692
Female     180
Name: gender, dtype: int64

In [190]:
# One-column frame of education codes: rows with a missing value are dropped
# and the remaining codes are cast to int, so that str(code) has no decimal
# part when it is used as a codebook key below.
educationcounts = (
    pirus[['education']]
    .dropna(subset=['education'])
    .astype({'education': int})
)

# Codebook lookup: code (stringified) -> education level label
e_codes = pirus_codebook['education']['codes']
e_fun = lambda x: e_codes[str(x)]

# Replace each code with its label, then tabulate how often each label occurs
educationcounts = educationcounts.assign(
    education=educationcounts['education'].apply(e_fun)
)
educationcounts['education'].value_counts()

High school diploma                  203
Some College                         196
College degree                       157
Some High school                     115
Master’s degree                       41
Doctoral/Professional degree          35
Vocational school degree              12
Some Doctoral/Professional degree     11
Some Master’s school                  10
Did not attempt high school           10
Some Vocational school                 8
Name: education, dtype: int64

In [180]:
# Summary statistics for subject age (count covers non-missing values only)
age_summary = pirus['age'].describe()
age_summary

count    1819.000000
mean       34.664101
std        13.536037
min        10.000000
25%        24.000000
50%        31.000000
75%        43.000000
max        88.000000
Name: age, dtype: float64

In [181]:
# Summary statistics for the habitation longitude; min/max act as a range check
habitation_longitude_summary = pirus['loc_habit1_long'].describe()
habitation_longitude_summary

count    1157.000000
mean      -92.136558
std        16.733116
min      -157.855676
25%      -103.660937
50%       -87.365314
75%       -78.833717
max       -65.957695
Name: loc_habit1_long, dtype: float64

In [182]:
# Summary statistics for the habitation latitude; min/max act as a range check
habitation_latitude_summary = pirus['loc_habit1_lat'].describe()
habitation_latitude_summary

count    1157.000000
mean       38.400000
std         5.537335
min        18.379441
25%        34.257038
50%        39.140448
75%        41.875562
max        64.837845
Name: loc_habit1_lat, dtype: float64

In [185]:
# Broad ethnicity (not a latitude): the column holds codebook category codes,
# which a later cell maps to labels. Count/min/max are useful as a range check;
# the mean, std and quartiles of category codes are not interpretable.
broad_ethnicity_summary = pirus['broad_ethnicity'].describe()
broad_ethnicity_summary

count    1726.000000
mean        3.057358
std         0.917249
min         1.000000
25%         3.000000
50%         3.000000
75%         3.000000
max         7.000000
Name: broad_ethnicity, dtype: float64

In [207]:
# One-column frame of broad-ethnicity codes: rows with a missing value are
# dropped and the remaining codes are cast to int, so that str(code) has no
# decimal part when it is used as a codebook key below.
b_ethnicity = (
    pirus[['broad_ethnicity']]
    .dropna(subset=['broad_ethnicity'])
    .astype({'broad_ethnicity': int})
)

# Codebook lookup: code (stringified) -> ethnicity label
b_codes = pirus_codebook['broad_ethnicity']['codes']
ethnicity_fun = lambda x: b_codes[str(x)]

# Replace each code with its label, then tabulate how often each label occurs
b_ethnicity = b_ethnicity.assign(
    broad_ethnicity=b_ethnicity['broad_ethnicity'].apply(ethnicity_fun)
)
b_ethnicity['broad_ethnicity'].value_counts()

White                                            1185
Black/African-American                            213
Middle Eastern/North African                      171
Asian (incl. Iran, Afghanistan, and Pakistan)      77
Hispanic/Latino                                    65
Native American                                    10
Other                                               5
Name: broad_ethnicity, dtype: int64