In [2]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib as plt
from sqlalchemy import create_engine

%matplotlib inline

In [4]:
# import db password
from config import db_password

In [5]:
# credentials for connecting to Postgres db
POSTGRES_ADDRESS = 'bootcamp-final-project.c8u2worjd1ui.us-east-1.rds.amazonaws.com'
POSTGRES_PORT = 5432
POSTGRES_USERNAME = 'peter_jennifer'
POSTGRES_PASSWORD = db_password
POSTGRES_DBNAME = 'us_gun_violence'

In [6]:
# creat connection string and database engine
db_string = f'postgres://{POSTGRES_USERNAME}:{POSTGRES_PASSWORD}@{POSTGRES_ADDRESS}:{POSTGRES_PORT}/{POSTGRES_DBNAME}'

engine = create_engine(db_string)

# Encoding

## Guns Table

In [27]:
# import guns dataset from AWS
guns_df = pd.read_sql_table('guns', engine, columns=['incident_id', 'n_guns_involved', 'gun_stolen', 'gun_type'])
guns_df.head()

Unnamed: 0,incident_id,n_guns_involved,gun_stolen,gun_type
0,478855,2,Unknown,Unknown
1,478855,2,Unknown,Unknown
2,478959,2,Unknown,Handgun
3,478959,2,Unknown,Handgun
4,479363,2,Unknown,22 LR


In [29]:
# explore value counts of gun_type column
guns_df['gun_type'].value_counts()

Unknown            130838
Handgun             25038
9mm                  6448
Rifle                5268
Shotgun              4263
22 LR                3346
40 SW                2745
380 Auto             2392
45 Auto              2360
38 Spl               1809
223 Rem [AR-15]      1613
12 gauge             1112
Other                1060
7.62 [AK-47]          939
357 Mag               822
25 Auto               610
32 Auto               488
20 gauge              205
44 Mag                197
30-30 Win             110
410 gauge              97
308 Win                92
30-06 Spr              84
10mm                   50
16 gauge               32
300 Win                23
28 gauge                6
Name: gun_type, dtype: int64

In [30]:
# define dictionary to be used to bin above values
gun_types = {'9mm': 'Handgun', '22 LR': 'Rifle', '40 SW': 'Handgun', '380 Auto': 'Handgun', 
            '45 Auto': 'Handgun', '38 Spl': 'Handgun', '223 Rem [AR-15]': 'Assault Rifle',
            '12 gauge': 'Shotgun', '7.62 [AK-47]': 'Assault Rifle', '357 Mag': 'Handgun',
            '25 Auto': 'Handgun', '32 Auto': 'Handgun', '20 gauge': 'Shotgun', '44 Mag': 'Handgun',
            '30-30 Win': 'Rifle', '410 gauge': 'Shotgun', '308 Win': 'Rifle', '30-06 Spr': 'Rifle',
            '10mm': 'Handgun', '16 gauge': 'Shotgun', '300 Win': 'Rifle', '28 gauge': 'Shotgun'}

In [31]:
# map dictionary keys to dataframe
guns_df['category'] = guns_df['gun_type'].map(gun_types)
guns_df.head()

Unnamed: 0,incident_id,n_guns_involved,gun_stolen,gun_type,category
0,478855,2,Unknown,Unknown,
1,478855,2,Unknown,Unknown,
2,478959,2,Unknown,Handgun,
3,478959,2,Unknown,Handgun,
4,479363,2,Unknown,22 LR,Rifle


In [32]:
guns_df['category'].value_counts()

Handgun          17921
Rifle             3655
Assault Rifle     2552
Shotgun           1452
Name: category, dtype: int64

In [33]:
# discard previous gun type column and rename newly generated categories
guns_df.drop(columns=['gun_type'], inplace=True)
guns_df.rename(columns={'category': 'gun_type'}, inplace=True)

guns_df.head()

Unnamed: 0,incident_id,n_guns_involved,gun_stolen,gun_type
0,478855,2,Unknown,
1,478855,2,Unknown,
2,478959,2,Unknown,
3,478959,2,Unknown,
4,479363,2,Unknown,Rifle


In [34]:
# inspect gun_stolen value counts
guns_df['gun_stolen'].value_counts()

Unknown       172525
Stolen         17610
Not-stolen      1804
Name: gun_stolen, dtype: int64

In [35]:
# replace Unknown values with NaN
guns_df['gun_stolen'].replace({'Unknown': np.nan}, inplace=True)
guns_df.head()

In [37]:
# encode gun_stolen and gun_type
guns_df_encoded = pd.get_dummies(guns_df, columns=['gun_stolen', 'gun_type'])
guns_df_encoded.head()

Unnamed: 0,incident_id,n_guns_involved,gun_stolen_Not-stolen,gun_stolen_Stolen,gun_type_Assault Rifle,gun_type_Handgun,gun_type_Rifle,gun_type_Shotgun
0,478855,2,0,0,0,0,0,0
1,478855,2,0,0,0,0,0,0
2,478959,2,0,0,0,0,0,0
3,478959,2,0,0,0,0,0,0
4,479363,2,0,0,0,0,1,0


In [38]:
# rename columns
guns_df_encoded.rename(columns={'gun_stolen_Not-stolen': 'not_stolen', 'gun_stolen_Stolen': 'stolen',
                               'gun_type_Assault Rifle': 'assault_rifle', 'gun_type_Handgun': 'handgun',
                               'gun_type_Rifle': 'rifle', 'gun_type_Shotgun': 'shotgun'}, inplace=True)
guns_df_encoded.head()

Unnamed: 0,incident_id,n_guns_involved,not_stolen,stolen,assault_rifle,handgun,rifle,shotgun
0,478855,2,0,0,0,0,0,0
1,478855,2,0,0,0,0,0,0
2,478959,2,0,0,0,0,0,0
3,478959,2,0,0,0,0,0,0
4,479363,2,0,0,0,0,1,0


## Suspects Table

In [53]:
# import suspects dataset from AWS
suspects_df = pd.read_sql_table('suspects', engine, columns=['incident_id', 'participant_gender', 
                                                            'participant_age', 'participant_age_group',
                                                            'participant_status'])
suspects_df.head()

Unnamed: 0,incident_id,participant_gender,participant_age,participant_age_group,participant_status
0,461105,Female,,Adult 18+,Injured
1,478855,Male,25.0,Adult 18+,"Injured, Unharmed, Arrested"
2,478855,Male,31.0,Adult 18+,"Unharmed, Arrested"
3,478925,Male,33.0,Adult 18+,Killed
4,478959,Female,47.0,Adult 18+,Killed


In [54]:
# inspect gender column
suspects_df['participant_gender'].value_counts()

Male      167708
Female     11746
Name: participant_gender, dtype: int64

In [55]:
# inspect age_group column
suspects_df['participant_age_group'].value_counts()

Adult 18+     151072
Teen 12-17     12850
Child 0-11       578
Name: participant_age_group, dtype: int64

In [56]:
# inspect status column
suspects_df['participant_status'].value_counts()

Unharmed, Arrested             84542
Unharmed                       77456
Arrested                       10092
Killed                          8870
Injured                         4780
Injured, Arrested               2842
Killed, Arrested                  37
Injured, Unharmed, Arrested       20
Killed, Unharmed                  15
Killed, Unharmed, Arrested        11
Injured, Unharmed                 11
Killed, Injured                    1
Name: participant_status, dtype: int64

In [57]:
# Clean up bins, can't be killed and uninjured, assume the person died post-incident, report as killed
status_labels = {'Killed, Arrested': 'Killed', 'Injured, Unharmed, Arrested': 'Injured, Arrested',
                'Killed, Unharmed': 'Killed', 'Killed, Unharmed, Arrested': 'Killed', 'Injured, Unharmed': 
                'Injured', 'Killed, Injured': 'Killed'}

In [58]:
# map dictionary keys to dataframe
suspects_df['status'] = suspects_df['participant_status'].map(status_labels).fillna(suspects_df['participant_status'])
suspects_df.drop(columns=['participant_status'], inplace=True)
suspects_df.head()

Unnamed: 0,incident_id,participant_gender,participant_age,participant_age_group,status
0,461105,Female,,Adult 18+,Injured
1,478855,Male,25.0,Adult 18+,"Injured, Arrested"
2,478855,Male,31.0,Adult 18+,"Unharmed, Arrested"
3,478925,Male,33.0,Adult 18+,Killed
4,478959,Female,47.0,Adult 18+,Killed


In [60]:
suspects_df_encoded = pd.get_dummies(suspects_df, columns=['participant_gender', 'participant_age_group',
                                                          'status'])
suspects_df_encoded.head()

Unnamed: 0,incident_id,participant_age,participant_gender_Female,participant_gender_Male,participant_age_group_Adult 18+,participant_age_group_Child 0-11,participant_age_group_Teen 12-17,status_Arrested,status_Injured,"status_Injured, Arrested",status_Killed,status_Unharmed,"status_Unharmed, Arrested"
0,461105,,1,0,1,0,0,0,1,0,0,0,0
1,478855,25.0,0,1,1,0,0,0,0,1,0,0,0
2,478855,31.0,0,1,1,0,0,0,0,0,0,0,1
3,478925,33.0,0,1,1,0,0,0,0,0,1,0,0
4,478959,47.0,1,0,1,0,0,0,0,0,1,0,0


In [61]:
suspects_df_encoded.rename(columns={'participant_gender_Female': 'female', 'participant_age_group_Adult 18+':
                                   'Adult_18+', 'participant_age_group_Child 0-11': 'Child_0-11', 'participant_age_group_Teen 12-17':
                                   'Teen_12-17'}, inplace=True)
suspects_df_encoded.head()

Unnamed: 0,incident_id,participant_age,female,participant_gender_Male,Adult_18+,Child_0-11,Teen_12-17,status_Arrested,status_Injured,"status_Injured, Arrested",status_Killed,status_Unharmed,"status_Unharmed, Arrested"
0,461105,,1,0,1,0,0,0,1,0,0,0,0
1,478855,25.0,0,1,1,0,0,0,0,1,0,0,0
2,478855,31.0,0,1,1,0,0,0,0,0,0,0,1
3,478925,33.0,0,1,1,0,0,0,0,0,1,0,0
4,478959,47.0,1,0,1,0,0,0,0,0,1,0,0


## Incidents Table

In [63]:
# import incidents dataset from AWS
incidents_df = pd.read_sql_table('incidents', engine, columns=['date', 'state', 'latitude', 'longitude', 'n_killed',
                                                              'n_injured', 'incident_characteristics', 'notes', 'congressional_district',
                                                              'state_house_district', 'state_senate_district'])
incidents_df.head()

Unnamed: 0,date,state,latitude,longitude,n_killed,n_injured,incident_characteristics,notes,congressional_district,state_house_district,state_senate_district
0,2013-01-01,Pennsylvania,40.3467,-79.8559,0,4,Shot - Wounded/Injured||Mass Shooting (4+ vict...,Julian Sims under investigation: Four Shot and...,14.0,,
1,2013-01-01,California,33.909,-118.333,1,3,"Shot - Wounded/Injured||Shot - Dead (murder, a...",Four Shot; One Killed; Unidentified shooter in...,43.0,62.0,35.0
2,2013-01-01,Ohio,41.4455,-82.1377,1,3,"Shot - Wounded/Injured||Shot - Dead (murder, a...",,9.0,56.0,13.0
3,2013-01-05,Colorado,39.6518,-104.802,4,0,"Shot - Dead (murder, accidental, suicide)||Off...",,6.0,40.0,28.0
4,2013-01-07,North Carolina,36.114,-79.9569,2,2,"Shot - Wounded/Injured||Shot - Dead (murder, a...",Two firearms recovered. (Attempted) murder sui...,6.0,62.0,27.0
