### Read the data from Git

In [None]:
# Clone the Washington Post police-shootings repository into the current
# working directory; git creates the `data-police-shootings` folder there.
!git clone https://github.com/washingtonpost/data-police-shootings.git

Cloning into 'data-police-shootings'...
remote: Enumerating objects: 5662, done.[K
remote: Counting objects: 100% (772/772), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 5662 (delta 768), reused 745 (delta 743), pack-reused 4890[K
Receiving objects: 100% (5662/5662), 3.17 MiB | 22.73 MiB/s, done.
Resolving deltas: 100% (4363/4363), done.


### Libraries

In [None]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from scipy.interpolate import make_interp_spline


### Data before preprocessing

In [None]:
# Load the raw table from the repository cloned by the `!git clone` cell.
# The path is relative to the working directory (where the clone lands), so
# the notebook is not tied to Colab's /content directory.
RAW_CSV_PATH = "data-police-shootings/fatal-police-shootings-data.csv"

# NOTE(review): latin-1 decoding never raises, but it would mis-decode any
# UTF-8 multi-byte characters (e.g. accented names) -- confirm the file's
# actual encoding before relying on the `name`/`city` text.
data = pd.read_csv(RAW_CSV_PATH, sep=",", encoding='latin-1')

# Preview the first five rows.
data.head(5)

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,longitude,latitude,is_geocoding_exact
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False,-123.122,47.247,True
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False,-122.892,45.487,True
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False,-97.281,37.695,True
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False,-122.422,37.763,True
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False,-104.692,40.384,True


In [None]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape

(7504, 17)

In [None]:
# Count of missing values per column in the raw frame.
data.isna().sum()

id                            0
name                        388
date                          0
manner_of_death               0
armed                       209
age                         449
gender                       19
race                       1382
city                          0
state                         0
signs_of_mental_illness       0
threat_level                  0
flee                        763
body_camera                   0
longitude                   712
latitude                    712
is_geocoding_exact            0
dtype: int64

### Helper dictionaries

In [None]:
# Two-letter postal code -> full name, for the 50 states plus the District of
# Columbia. Entries are kept in alphabetical order of the code, grouped by
# initial letter.
states_initial = {
    "AK": "Alaska", "AL": "Alabama", "AR": "Arkansas", "AZ": "Arizona",
    "CA": "California", "CO": "Colorado", "CT": "Connecticut",
    "DC": "District of Columbia", "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "IA": "Iowa", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana",
    "KS": "Kansas", "KY": "Kentucky",
    "LA": "Louisiana",
    "MA": "Massachusetts", "MD": "Maryland", "ME": "Maine", "MI": "Michigan",
    "MN": "Minnesota", "MO": "Missouri", "MS": "Mississippi", "MT": "Montana",
    "NC": "North Carolina", "ND": "North Dakota", "NE": "Nebraska",
    "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico",
    "NV": "Nevada", "NY": "New York",
    "OH": "Ohio", "OK": "Oklahoma", "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina", "SD": "South Dakota",
    "TN": "Tennessee", "TX": "Texas",
    "UT": "Utah",
    "VA": "Virginia", "VT": "Vermont",
    "WA": "Washington", "WI": "Wisconsin", "WV": "West Virginia", "WY": "Wyoming",
}
# Single-letter race code used in the dataset -> readable label. The
# "unknown" entry maps to itself so the placeholder used for missing values
# can be looked up like any other code.
race_initial = dict(
    W="White",
    B="Black",
    A="Asian",
    N="Native American",
    H="Hispanic",
    O="Other",
    unknown="unknown",
)

### Preprocessing

#### Fix null values

In [None]:
# --- Categorical gaps -------------------------------------------------------
# Drop rows with no name or no age, label the remaining categorical gaps with
# explicit placeholders, then discard rows whose gender is unknown.
# Every step returns a new frame, so `data` is untouched and the cell can be
# re-run safely. (Calling `df["col"].fillna(..., inplace=True)` on a slice of
# `data` is chained assignment: it raises SettingWithCopyWarning and does not
# update `df` at all under pandas copy-on-write.)
df = (
    data.dropna(subset=["name", "age"])
    .fillna({
        "armed": "undetermined",
        "gender": "unknown",
        "race": "unknown",
        "flee": "unknown",
    })
    .loc[lambda frame: frame["gender"] != "unknown"]
    .copy()
)

# --- Missing coordinates ----------------------------------------------------
# Back-fill longitude/latitude from "<city> <state name>" using Nominatim.
geolocator = Nominatim(user_agent='test1')
# The public Nominatim service expects at most one request per second.
# RateLimiter spaces the calls, retries geocoder service errors and, by
# default, returns None instead of raising once the retries are exhausted.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Many rows share a city, so each distinct query is sent only once.
location_by_query = {}
to_remove = []  # index labels of rows that could not be geocoded
for idx, row in df[df['longitude'].isna()].iterrows():
    state_name = states_initial.get(row["state"])
    if state_name is None or not isinstance(row["city"], str):
        # Unknown state code or unusable city value: nothing to look up.
        to_remove.append(idx)
        continue

    query = row["city"] + " " + state_name
    if query not in location_by_query:
        location_by_query[query] = geocode(query)
    location = location_by_query[query]

    if location is None:
        # No match (or the lookup kept failing): the row is dropped below.
        to_remove.append(idx)
        continue

    df.at[idx, "longitude"] = round(location.longitude, 3)
    df.at[idx, "latitude"] = round(location.latitude, 3)
    # City-level coordinates are only an approximation of the incident site.
    df.at[idx, 'is_geocoding_exact'] = False

df = df.drop(index=to_remove)


#### Adding derived columns

In [None]:
# Split the "YYYY-MM-DD" date string into integer year / month / day columns.
# Cast to int64 so the dtypes match plain Python ints regardless of the
# pandas version's `.dt` accessor dtype.
parsed_dates = pd.to_datetime(df["date"], format="%Y-%m-%d")
temp_df = pd.DataFrame(
    {
        "year": parsed_dates.dt.year.astype("int64"),
        "month": parsed_dates.dt.month.astype("int64"),
        "day": parsed_dates.dt.day.astype("int64"),
    },
    index=df.index,
)

# `assign` returns a new frame and overwrites same-named columns, so running
# this cell twice does not append a second year/month/day set (pd.concat did).
df = df.assign(
    # Direct dict lookups on purpose: an unmapped code raises KeyError rather
    # than silently turning into NaN.
    race_full=df["race"].map(race_initial.__getitem__),
    state_full=df["state"].map(states_initial.__getitem__),
    # 1 when gender is "M", 0 for every other value.
    gender_binary=(df["gender"] == "M").astype("int64"),
    year=temp_df["year"],
    month=temp_df["month"],
    day=temp_df["day"],
)



### Data after preprocessing

In [None]:
# Count of missing values per column after preprocessing.
df.isna().sum()

In [None]:
# Preview the first five preprocessed rows, including the derived columns.
df.head(5)

In [None]:
# (rows, columns) of the preprocessed frame.
df.shape

### Save

In [None]:
# Write the preprocessed frame to run_df.csv in the working directory.
# The row index is written as well (to_csv default), so it comes back as an
# extra unnamed first column when the file is read without index_col.
df.to_csv("run_df.csv")

### Testing

In [None]:
# Reload a saved export and rebind `df` to it.
# NOTE(review): this reads /content/run_df_19_6.csv, not the run_df.csv that
# the save cell writes -- presumably a renamed earlier export; confirm it is
# the intended file, otherwise this cell fails on a fresh run.
df=pd.read_csv("/content/run_df_19_6.csv")

In [None]:
# (rows, columns) of the reloaded export.
df.shape

(6956, 24)