In [1]:
import sys
print(sys.version)
import numpy as np
print(np.__version__)
import pandas as pd
print(pd.__version__)
import matplotlib.pyplot as plt
import json

%matplotlib inline

3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
1.19.2
1.1.3


In [2]:
# Load Original PIRUS File
file = "../og_data/PIRUS_May2020/PIRUS_Public_May2020.xlsx"
pirus = pd.read_excel(file, parse_dates=['Date_Exposure', 'Convert_Date', 'Reawakening_Date'])

In [3]:
# Remove all spaces from column names and convert to lower case
pirus.rename(columns=lambda x: x.replace(' ','_').lower(), inplace=True)

In [4]:
#merge coordinates for plot locations
pirus_plot_coordinates = pd.read_csv('../clean_data/pirus_plot_coordinates.csv')
pirus = pd.merge(pirus,pirus_plot_coordinates,on='subject_id',how='outer')

In [5]:
#merge coordinates for habitation locations
pirus_habit_coordinates = pd.read_csv('../clean_data/pirus_habit_coordinates.csv')
pirus = pd.merge(pirus,pirus_habit_coordinates,on='subject_id',how='outer')

In [6]:
pirus

Unnamed: 0,subject_id,loc_plot_state1,loc_plot_city1,loc_plot_state2,loc_plot_city2,date_exposure,plot_target1,plot_target2,plot_target3,attack_preparation,...,gang_age_joined,trauma,other_ideologies,angry_us,group_grievance,standing,loc_plot1_lat,loc_plot1_long,loc_habit1_lat,loc_habit1_long
0,1000,Illinois,Chicago,Florida,Broward County,2002-05-08,14,,,1,...,3,0,0,0,0,0,41.875562,-87.624421,26.122308,-80.143379
1,1001,Afghanistan,-99,,,2001-11-25,-88,,,-88,...,-88,-99,0,1,-99,-99,,,37.779026,-122.419906
2,1002,California,Santa Ana,,,1997-05-01,-99,,,-99,...,-88,-99,0,1,1,-99,33.749495,-117.873221,33.749495,-117.873221
3,1005,New York,New York,,,1993-04-10,1,,,2,...,-88,0,0,1,2,0,40.712728,-74.006015,,
4,1006,New York,New York,,,1993-03-04,1,,,2,...,-88,-99,0,1,-99,-99,40.712728,-74.006015,40.728158,-74.077642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2221,7424,Nevada,Hoover Dam,Arizona,Mohave County,2018-06-15,-88,,,-88,...,-88,-99,0,1,0,-99,36.015721,-114.737468,36.030113,-114.982619
2222,7761,District of Columbia,"National Mall, D.C",,,2018-10-09,15,,,2,...,-88,0,0,1,0,0,38.885428,-77.041473,41.024974,-73.950612
2223,7161,South Carolina,Anderson,,,2018-03-13,14,,,2,...,-88,-99,0,1,0,0,34.506860,-82.650626,34.506860,-82.650626
2224,8341,Utah,St. George,,,2018-03-05,8,,,2,...,-88,-99,0,1,0,0,37.104153,-113.584131,37.104153,-113.584131


In [7]:
# Set index to subject_id
pirus.set_index(pirus['subject_id'], inplace = True)
pirus.index.name = 'subject_id'

In [8]:
# Drop unnecessary fields: 
unnecessary_fields = ['subject_id', 'age_child', 
                      'itinerant','group_competition', 
                      'current_status', 'changing_target', 
                      'internet_use_plot', 'loc_plot_state2',
                      'loc_plot_city2', 'plot_target2', 'plot_target3',
                      'terrorist_group_name2', 'terrorist_group_name3',
                      'recruiter2', 'recruiter3', 'actively_connect',
                      'media_radicalization', 'social_media_platform2',
                      'social_media_platform3', 'social_media_platform4',
                      'social_media_platform5', 'social_media_activities2',
                      'social_media_activities3', 'social_media_activities4',
                      'social_media_activities5', 'social_media_activities6',
                      'social_media_activities7', 'ideological_sub_category2',
                      'ideological_sub_category3', 'loc_habitation_state2',
                      'loc_habitation_city2', 'rad_duration', 'event_influence2',
                      'event_influence3', 'event_influence4', 'beliefs_trajectory',
                      'behaviors_trajectory', 'radicalization_place', 'religious_background',
                      'reawakening', 'reawakening_date','change_performance',
                      'social_stratum_childhood', 'aspirations', 'abuse_type2',
                      'abuse_type3', 'absent_parent','overseas_family',
                      'close_family', 'family_religiosity','family_ideology',
                      'family_ideological_level', 'prison_family_friend',
                      'crime_family_friend', 'radical_family', 'radical_signif_other',
                      'relationship_troubles', 'platonic_troubles', 'unstructured_time',
                      'friendship_source1', 'friendship_source2', 'friendship_source3',
                      'kicked_out', 'previous_criminal_activity_type2',
                      'previous_criminal_activity_type3', 'trauma', 'standing']
pirus.drop(unnecessary_fields, axis=1, inplace=True)

In [9]:
# Add year and month columns (derived from date_exposure)
pirus['year'] = pd.DatetimeIndex(pirus['date_exposure']).year
pirus['month'] = pd.DatetimeIndex(pirus['date_exposure']).month

In [10]:
# Remove years that are not between 1980 and 2019
mask = (pirus['year'] >= 1980) & (pirus['year'] < 2019)
pirus = pirus.loc[mask]

In [11]:
def special_nan_counter(ser, code):
    try:
        return ser.value_counts()[code]
    except:
        return 0

def build_nan_dict(df):
    # Loop through columns to built a column/NaN value dictionary
    d = {}
    for col in df.columns:
        count_nan = df[col].isna().sum()
        count_88 = special_nan_counter(df[col], '-88') + special_nan_counter(df[col], -88)
        count_99 = special_nan_counter(df[col], '-99') + special_nan_counter(df[col], -99)

        count_sum =  count_88 + count_99
        count_normal = df[col].size - count_sum
        typ = df[col].dtype
        
        d.update({col : {'column_name': col, 
                         'type' : typ, 
                         'Normal Value' : count_normal, 
                         'NaN' : count_nan, 
                         '-88' : count_88, 
                         '-99' : count_99, 
                         'sum' : count_sum}})
    return d

def nan_analysis(df):
    '''Convert nan_dict into a dataframe.'''
    df = pd.DataFrame(build_nan_dict(pirus)).T
    df.index = df.column_name
    df.drop('column_name', axis = 1, inplace=True)
    
    mask = df['sum'] == 0
    df = df.loc[~mask]
    
    df.drop('sum', axis=1, inplace=True)
    df.name = 'NaN Value Counts by Column'
    
    return df

df = nan_analysis(pirus)

In [12]:
# Remove Special NaN Values
for col in pirus.columns:
    pirus[col] = pirus[col].replace(to_replace=[-99,'-99'], value=np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pirus[col] = pirus[col].replace(to_replace=[-99,'-99'], value=np.nan)


In [13]:
nan_analysis(pirus)

Unnamed: 0_level_0,type,Normal Value,NaN,-88,-99
column_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
loc_plot_city1,object,1902,234,1,1
date_exposure,datetime64[ns],1902,0,1,1
plot_target1,float64,957,21,947,0
attack_preparation,float64,957,145,947,0
op_security,float64,957,320,947,0
anticp_fatals_targ,float64,957,221,947,0
criminal_severity,float64,1903,13,1,0
criminal_charges,float64,1738,64,166,0
indict_arrest,float64,1732,23,172,0
terrorist_group_name1,object,1093,39,811,0


In [14]:
# Add State Dummy Variables to State Variables (In US = 1, Not in US = 0)

# Load Real States
with open('../clean_data/state_codes.json') as f:
    real_states = json.load(f)
    real_states = list(real_states.values())
print(real_states)

['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Federated States Of Micronesia', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Marshall Islands', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon', 'Palau', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


In [15]:
# Add State Dummy Columns to PIRUS
def build_us_dummy(ser):
    dummy_col = []
    for s in pirus[v]:
        if s in real_states:
            dummy_col.append(1)
        else:
            dummy_col.append(0)
    return dummy_col

state_vs = ['loc_plot_state1', 'loc_habitation_state1']
for v in state_vs:
    loc = pirus.columns.get_loc(v) + 1
    dummy_col = build_us_dummy(pirus[v])
    name = v + '_us_dummy'
    pirus.insert(loc, name, dummy_col)

In [16]:
# US Dummy Sanity Check
habit = pd.DataFrame({'states' : pirus.loc_habitation_state1, 'dummy' : pirus.loc_habitation_state1_us_dummy})
plot = pd.DataFrame({'states' : pirus.loc_plot_state1, 'dummy' : pirus.loc_plot_state1_us_dummy})
states = pd.DataFrame.append(habit,plot)

states = states.groupby(['dummy', 'states'])
zeros = sorted(list({name[0][1] for name in states if name[0][0] == 0}))
ones = sorted(list({name[0][1] for name in states if name[0][0] == 1}))

print('US States\n', ones, '\n')
print('*'*100)
print('\nNon-US States\n', zeros)

US States
 ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'] 

****************************************************************************************************

Non-US States
 ['Afghanistan', 'Canada', 'Dominica', 'Egypt', 'Iraq', 'Kenya', 'Libya', 'Mexico', 'Pakistan', 'Somalia', 'Syria', 'Tunisia']


In [17]:
# Save Data to CSV
pirus.to_csv('../clean_data/pirus_deep_clean_Final.csv')