# Data Preprocessing

In [11]:
import pandas as pd

## Total Arrests in 2016 Dataset

In [8]:
# Load the raw 2016 total-arrests table exactly as exported.
total_arrests_nonupdated = pd.read_csv('total_arrest.csv')

# The analysis is by race, not ethnicity, so the ethnicity columns are removed.
# Header names contain literal newlines because of how the .csv was exported.
ethnicity_columns = [
    "Hispanic\nor\nLatino",
    "Total.2",
    "Hispanic\nor\nLatino.1",
    "Not\nHispanic\nor Latino",
    "Not\nHispanic\nor Latino.1",
    "Total2",
]
total_arrests_nonupdated = total_arrests_nonupdated.drop(columns=ethnicity_columns)

# Map the awkward multi-line headers to readable names; the ".1" variants are
# renamed to "Percent ..." columns.
race_column_names = {
    "Black or\nAfrican\nAmerican": "Black",
    "American\nIndian or\nAlaska\nNative": "American Indian or Alaska Native",
    "Native\nHawaiian\nor Other\nPacific\nIslander": "Native Hawaiian or Pacific Islander",
    "Total.1": "Percent Total",
    "White.1": "Percent Distribution: White",
    "Black or\nAfrican\nAmerican.1": "Percent Distribution: Black",
    "American\nIndian or\nAlaska\nNative.1": "Percent Distribution: American Indian or Alaska Native",
    "Asian.1": "Percent Distribution: Asian",
    "Native\nHawaiian\nor Other\nPacific\nIslander.1": "Percent Distribution: Native Hawaiian or Pacific Islander",
}

# Pipeline:
#   1. rename the columns,
#   2. drop rows containing NaN (per the original author these are rows 31 - 34,
#      which held lines of text in the source file),
#   3. strip commas from string cells so the numbers can later be parsed as
#      floats. NOTE: no numeric conversion happens in this cell, and the
#      replacement is applied to every string column, not only the counts.
preprocessed_total_arrests = (
    total_arrests_nonupdated
    .rename(columns=race_column_names)
    .dropna()
    .replace(',', '', regex=True)
)

# Persist the cleaned table for later steps.
preprocessed_total_arrests.to_csv("preprocessed_total_arrests.csv")

# Final dataset for all arrests in 2016 (preview of the first rows).
preprocessed_total_arrests.head()



Unnamed: 0,Offense charged,Total,White,Black,American Indian or Alaska Native,Asian,Native Hawaiian or Pacific Islander,Percent Total,Percent Distribution: White,Percent Distribution: Black,Percent Distribution: American Indian or Alaska Native,Percent Distribution: Asian,Percent Distribution: Native Hawaiian or Pacific Islander
0,TOTAL,8421481,5858330,2263112,171185,103244,25610,100,69.6,26.9,2.0,1.2,0.3
1,Murder and nonnegligent manslaughter,9374,4192,4935,108,109,30,100,44.7,52.6,1.2,1.2,0.3
2,Rape,18606,12571,5412,233,309,81,100,67.6,29.1,1.3,1.7,0.4
3,Robbery,76267,33095,41562,663,659,288,100,43.4,54.5,0.9,0.9,0.4
4,Aggravated assault,304626,191205,101432,6374,4678,937,100,62.8,33.3,2.1,1.5,0.3


## Total Arrests for Individuals Under 18 Dataset

In [9]:
# Load the raw table of 2016 arrests for individuals under 18.
arrests_under_eighteen_unupdated = pd.read_csv("arrests_under_eighteen.csv")

#dropping the ethnicity columns from the dataset as I will be analyzing by race, not ethnicity
arrests_under_eighteen_unupdated.drop(
    [
        "Hispanic\nor\nLatino",
        "Total.2",
        "Hispanic\nor\nLatino.1",
        "Not\nHispanic\nor Latino",
        "Not\nHispanic\nor Latino.1",
        "Total2",
    ],
    axis=1,
    inplace=True,
)

#renaming columns due to odd formatting from importing the .csv file
preprocessed_arrests_under_eighteen = arrests_under_eighteen_unupdated.rename(
    columns={
        "Black or\nAfrican\nAmerican": "Black",
        "American\nIndian or\nAlaska\nNative": "American Indian or Alaska Native",
        "Native\nHawaiian\nor Other\nPacific\nIslander": "Native Hawaiian or Pacific Islander",
        "Total.1": "Percent Total",
        "White.1": "Percent Distribution: White",
        "Black or\nAfrican\nAmerican.1": "Percent Distribution: Black",
        "American\nIndian or\nAlaska\nNative.1": "Percent Distribution: American Indian or Alaska Native",
        "Asian.1": "Percent Distribution: Asian",
        "Native\nHawaiian\nor Other\nPacific\nIslander.1": "Percent Distribution: Native Hawaiian or Pacific Islander",
    }
)

#dropping the rows that contain "NaN" values (per the original note, rows 31 - 34,
#which held lines of text in the original dataset).
#dropna() returns a new DataFrame rather than modifying in place, so the result
#must be assigned back; otherwise the NaN rows stay in the saved file.
preprocessed_arrests_under_eighteen = preprocessed_arrests_under_eighteen.dropna()

#removing all commas from the dataset (applies to every string cell)
preprocessed_arrests_under_eighteen = preprocessed_arrests_under_eighteen.replace(',', '', regex=True)

#saving the cleaned table for later steps
preprocessed_arrests_under_eighteen.to_csv("preprocessed_arrests_under_18.csv")

#final dataset for all arrests of individuals under 18 in 2016
preprocessed_arrests_under_eighteen.head()

Unnamed: 0,Offense charged,Total,White,Black,American Indian or Alaska Native,Asian,Native Hawaiian or Pacific Islander,Percent Total,Percent Distribution: White,Percent Distribution: Black,Percent Distribution: American Indian or Alaska Native,Percent Distribution: Asian,Percent Distribution: Native Hawaiian or Pacific Islander
0,TOTAL,674820,419393,234092,11509,7424,2402,100,62.1,34.7,1.7,1.1,0.4
1,Murder and nonnegligent manslaughter,679,244,413,9,11,2,100,35.9,60.8,1.3,1.6,0.3
2,Rape,2900,1877,956,23,31,13,100,64.7,33.0,0.8,1.1,0.4
3,Robbery,15293,4468,10520,94,139,72,100,29.2,68.8,0.6,0.9,0.5
4,Aggravated assault,22217,12086,9486,350,223,72,100,54.4,42.7,1.6,1.0,0.3
