In [None]:
# Imports for the notebook. The interpreter, numpy and pandas versions are
# printed so the environment of this run is recorded in the cell output.
import sys
print(sys.version)
import numpy as np
print(np.__version__)
import pandas as pd
print(pd.__version__)
import matplotlib.pyplot as plt
import json

%matplotlib inline

In [None]:
# Read the original PIRUS workbook; the three date columns are parsed as datetimes
file = "../og_data/PIRUS_May2020/PIRUS_Public_May2020.xlsx"
date_columns = ['Date_Exposure', 'Convert_Date', 'Reawakening_Date']
pirus = pd.read_excel(file, parse_dates=date_columns)

In [None]:
# Normalise column names: spaces become underscores and everything is lower-cased
pirus.columns = [label.replace(' ', '_').lower() for label in pirus.columns]

In [None]:
# Attach the plot-location coordinates, matching rows on subject_id.
# how='outer' keeps subject_ids that appear in only one of the two tables.
pirus_plot_coordinates = pd.read_csv('../clean_data/pirus_plot_coordinates.csv')
pirus = pirus.merge(pirus_plot_coordinates, on='subject_id', how='outer')

In [None]:
# Attach the habitation-location coordinates, matching rows on subject_id.
# how='outer' keeps subject_ids that appear in only one of the two tables.
pirus_habit_coordinates = pd.read_csv('../clean_data/pirus_habit_coordinates.csv')
pirus = pirus.merge(pirus_habit_coordinates, on='subject_id', how='outer')

In [None]:
# Display the frame to inspect the result of the two coordinate merges
pirus

In [None]:
# Index the frame by subject_id. drop=False leaves the 'subject_id' column in
# the frame as well, and the index is named after the column.
pirus.set_index('subject_id', drop=False, inplace=True)

In [None]:
# Drop unnecessary fields.
# 'subject_id' is listed because the same values already serve as the index.
# DataFrame.drop raises KeyError if any listed column is not present.
unnecessary_fields = ['subject_id', 'age_child', 
                      'itinerant','group_competition', 
                      'current_status', 'changing_target', 
                      'internet_use_plot', 'loc_plot_state2',
                      'loc_plot_city2', 'plot_target2', 'plot_target3',
                      'terrorist_group_name2', 'terrorist_group_name3',
                      'recruiter2', 'recruiter3', 'actively_connect',
                      'media_radicalization', 'social_media_platform2',
                      'social_media_platform3', 'social_media_platform4',
                      'social_media_platform5', 'social_media_activities2',
                      'social_media_activities3', 'social_media_activities4',
                      'social_media_activities5', 'social_media_activities6',
                      'social_media_activities7', 'ideological_sub_category2',
                      'ideological_sub_category3', 'loc_habitation_state2',
                      'loc_habitation_city2', 'rad_duration', 'event_influence2',
                      'event_influence3', 'event_influence4', 'beliefs_trajectory',
                      'behaviors_trajectory', 'radicalization_place', 'religious_background',
                      'reawakening', 'reawakening_date','change_performance',
                      'social_stratum_childhood', 'aspirations', 'abuse_type2',
                      'abuse_type3', 'absent_parent','overseas_family',
                      'close_family', 'family_religiosity','family_ideology',
                      'family_ideological_level', 'prison_family_friend',
                      'crime_family_friend', 'radical_family', 'radical_signif_other',
                      'relationship_troubles', 'platonic_troubles', 'unstructured_time',
                      'friendship_source1', 'friendship_source2', 'friendship_source3',
                      'kicked_out', 'previous_criminal_activity_type2',
                      'previous_criminal_activity_type3', 'trauma', 'standing']
# axis=1 -> the labels are column names; the frame is modified in place
pirus.drop(unnecessary_fields, axis=1, inplace=True)

In [None]:
# Derive calendar year and month columns from date_exposure.
# The DatetimeIndex is built once and both components are read from it.
exposure_dates = pd.DatetimeIndex(pirus['date_exposure'])
pirus['year'] = exposure_dates.year
pirus['month'] = exposure_dates.month

In [None]:
# Keep only rows whose exposure year is >= 1980 and < 2019 (i.e. 1980 through 2018).
# Rows with a missing year compare False on both sides and are removed as well.
mask = (pirus['year'] >= 1980) & (pirus['year'] < 2019)
# .copy() makes the filtered frame independent of the original, so the column
# assignments and inserts performed on `pirus` afterwards are not treated as
# writes to a slice of another frame (SettingWithCopyWarning).
pirus = pirus.loc[mask].copy()

In [None]:
def special_nan_counter(ser, code):
    '''Count how many entries of ``ser`` are equal to ``code``.

    Entries are compared by value. Looking the code up with
    ``ser.value_counts()[code]`` is avoided on purpose: for an integer
    ``code`` and a non-integer index, pandas < 3 falls back to positional
    access, which can return the count of an unrelated value when the column
    has many distinct entries.

    Parameters
    ----------
    ser : pandas.Series
        Column to inspect.
    code : scalar
        Value to count, e.g. ``-88`` or ``'-88'``.

    Returns
    -------
    int
        Number of matching entries; 0 if there are none or ``ser`` is empty.
    '''
    return int((ser == code).sum())

def build_nan_dict(df):
    '''Build a per-column summary of NaN and special-code (-88 / -99) counts.

    Returns a dict keyed by column name. Each value is a dict with:
    'column_name', 'type' (the column dtype), 'NaN' (count of isna()),
    '-88' and '-99' (occurrences of the code as a string plus occurrences as
    an integer), 'sum' (-88 count + -99 count; NaN is not included) and
    'Normal Value' (column size minus 'sum', so NaN entries are counted here).
    '''
    summary = {}
    # Loop through columns to build the column -> counts dictionary
    for name in df.columns:
        column = df[name]

        n_missing = column.isna().sum()
        n_88 = special_nan_counter(column, '-88') + special_nan_counter(column, -88)
        n_99 = special_nan_counter(column, '-99') + special_nan_counter(column, -99)
        n_special = n_88 + n_99

        summary[name] = {
            'column_name': name,
            'type': column.dtype,
            'Normal Value': column.size - n_special,
            'NaN': n_missing,
            '-88': n_88,
            '-99': n_99,
            'sum': n_special,
        }
    return summary

def nan_analysis(df):
    '''Summarise NaN and special-code counts for the columns of ``df``.

    The per-column dictionary from ``build_nan_dict(df)`` is turned into a
    DataFrame with one row per column of ``df`` (index named 'column_name').
    Rows whose 'sum' entry is 0 are left out, and the helper columns
    'column_name' and 'sum' are removed from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. The argument itself is used, not the global frame.

    Returns
    -------
    pandas.DataFrame
        Report with a ``name`` attribute set to 'NaN Value Counts by Column'.
    '''
    report = pd.DataFrame(build_nan_dict(df)).T
    report.index = report.column_name
    report = report.drop('column_name', axis=1)

    # Keep only the columns of `df` that have a non-zero 'sum'
    report = report.loc[report['sum'] != 0]

    # Non-inplace drop: avoids mutating a slice produced by .loc
    report = report.drop('sum', axis=1)
    report.name = 'NaN Value Counts by Column'

    return report

# Build the NaN / special-code report for the current state of `pirus`
df = nan_analysis(pirus)

In [None]:
# Replace the -99 code (integer or string form) with NaN in every column.
# Only -99 is replaced here; -88 values are left as they are.
special_missing_codes = [-99, '-99']
for column_name in pirus.columns:
    pirus[column_name] = pirus[column_name].replace(special_missing_codes, np.nan)

In [None]:
# Display the NaN / special-code report again, after the -99 replacement
nan_analysis(pirus)

In [None]:
# Add State Dummy Variables to State Variables (In US = 1, Not in US = 0)

# Load Real States: the values of the JSON object are kept as a list
with open('../clean_data/state_codes.json') as f:
    real_states = list(json.load(f).values())
print(real_states)

In [None]:
# Add State Dummy Columns to PIRUS
def build_us_dummy(ser, states=None):
    '''Flag each entry of ``ser`` as a US state (1) or not (0).

    Parameters
    ----------
    ser : iterable
        State values to classify, e.g. one of the state columns.
    states : container, optional
        Values accepted as US states. Defaults to the module-level
        ``real_states``.

    Returns
    -------
    list of int
        One flag per element of ``ser``, in iteration order.
    '''
    if states is None:
        states = real_states
    # Classify the argument itself rather than a column picked via globals
    return [1 if s in states else 0 for s in ser]

# For each state column, insert its '<column>_us_dummy' flag column
# immediately to the right of the source column.
state_vs = ['loc_plot_state1', 'loc_habitation_state1']
for v in state_vs:
    # Position just after column v
    loc = pirus.columns.get_loc(v) + 1
    dummy_col = build_us_dummy(pirus[v])
    name = v + '_us_dummy'
    # insert() raises ValueError if `name` already exists (e.g. when this cell is re-run)
    pirus.insert(loc, name, dummy_col)

In [None]:
# US Dummy Sanity Check: list which state values were flagged 1 (US) and which 0 (non-US)
habit = pd.DataFrame({'states' : pirus.loc_habitation_state1, 'dummy' : pirus.loc_habitation_state1_us_dummy})
plot = pd.DataFrame({'states' : pirus.loc_plot_state1, 'dummy' : pirus.loc_plot_state1_us_dummy})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames row-wise
states = pd.concat([habit, plot])

# Iterating the groupby yields ((dummy, state), group) pairs; only the keys are needed.
# Rows with a missing state do not form a group (groupby drops NaN keys by default).
states = states.groupby(['dummy', 'states'])
zeros = sorted({state for (dummy, state), _ in states if dummy == 0})
ones = sorted({state for (dummy, state), _ in states if dummy == 1})

print('US States\n', ones, '\n')
print('*'*100)
print('\nNon-US States\n', zeros)

In [None]:
# Save Data to CSV. The index is written too (to_csv default), so subject_id
# ends up as the first column of the file.
pirus.to_csv('../clean_data/pirus_deep_clean_Final.csv')