# Preparation for XGBoost Model

In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score

In [None]:
# Load the cleaned dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='complete_clean_data.csv')

In [3]:
# Work on an independent (deep) copy so the loaded frame `data` stays untouched.
data2 = data.copy(deep=True)

In [4]:
# Show the column labels of the working copy before any transformation.
data2.columns

Index(['id', 'state', 'stop_time', 'county_name', 'driver_gender',
       'driver_age', 'driver_race', 'violation', 'search_conducted',
       'search_type_raw', 'stop_outcome', 'officer_gender', 'officer_race',
       'contact_type', 'African American alone', 'American Indian alone',
       'Asian alone', 'Hispanic or Latino', 'Pacific Islander alone',
       'Two or More Races', 'White alone', 'year', 'Democratic', 'Other',
       'Republican', 'DUIviolation', 'SpeedingViolation', 'EquipmentViolation',
       'LicenseViolation', 'LightsViolation', 'PaperworkViolation',
       'SafeMovementViolation', 'StoppingViolation', 'RegistrationViolation',
       'SeatBeltViolation', 'MovingViolation', 'CellPhoneViolation',
       'TruckViolation', 'OtherViolation', 'TotalViolations', 'MinorityDriver',
       'MinorityOfficer', 'DriverOfficerSameRace', 'ArrestOrCitation'],
      dtype='object')

In [5]:
# Cast the search_conducted flag to an integer column
# (presumably boolean -> 0/1; verify against the source data).
data2['search_conducted'] = data2['search_conducted'].astype(int)

# Reduce stop_time to the integer that precedes the first ':' (the hour,
# assuming an 'HH:MM' style string -- TODO confirm the format).
# The vectorised .str accessor replaces a row-wise DataFrame.apply(axis=1),
# which constructs a Series per row and is much slower on large frames.
data2['stop_time'] = data2['stop_time'].str.split(':', n=1).str[0].astype(int)

In [6]:
# One-hot encode driver gender, officer gender and officer race.
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 defaults
# get_dummies to bool, which would otherwise show up as True/False in the
# displayed frame and in the exported CSV.
driver_gender_dummy = pd.get_dummies(data2['driver_gender'], prefix='driver_gender', dtype=int)
officer_gender_dummy = pd.get_dummies(data2['officer_gender'], prefix='officer_gender', dtype=int)
officer_race_dummy = pd.get_dummies(data2['officer_race'], prefix='officer_race', dtype=int)

# Append the indicator columns to the right of the existing columns.
# NOTE: re-running this cell without re-creating data2 appends the dummies again.
data2 = pd.concat(
    [data2, driver_gender_dummy, officer_gender_dummy, officer_race_dummy],
    axis=1,
)

In [7]:
# Remove every column that is not used as model input. The list also contains
# the original categorical columns that were dummy-encoded and one dummy level
# per encoded variable (presumably kept out as the reference category).
drop = [
    'id',
    'state',
    'county_name',
    'driver_race',
    'violation',
    'search_type_raw',
    'stop_outcome',
    'contact_type',
    'DriverOfficerSameRace',
    'African American alone',
    'American Indian alone',
    'Asian alone',
    'Hispanic or Latino',
    'Pacific Islander alone',
    'Two or More Races',
    'White alone',
    'officer_race_Other',
    'driver_gender_F',
    'officer_gender_F',
    'driver_gender',
    'officer_gender',
    'officer_race',
    'Other',
]
data2 = data2.drop(columns=drop)

In [8]:
# Show the column labels that remain after the drop, as a sanity check.
data2.columns

Index(['stop_time', 'driver_age', 'search_conducted', 'year', 'Democratic',
       'Republican', 'DUIviolation', 'SpeedingViolation', 'EquipmentViolation',
       'LicenseViolation', 'LightsViolation', 'PaperworkViolation',
       'SafeMovementViolation', 'StoppingViolation', 'RegistrationViolation',
       'SeatBeltViolation', 'MovingViolation', 'CellPhoneViolation',
       'TruckViolation', 'OtherViolation', 'TotalViolations', 'MinorityDriver',
       'MinorityOfficer', 'ArrestOrCitation', 'driver_gender_M',
       'officer_gender_M', 'officer_race_Asian', 'officer_race_Black',
       'officer_race_Hispanic', 'officer_race_White'],
      dtype='object')

In [9]:
# Preview the first five rows of the prepared frame.
data2.head(n=5)

Unnamed: 0,stop_time,driver_age,search_conducted,year,Democratic,Republican,DUIviolation,SpeedingViolation,EquipmentViolation,LicenseViolation,...,TotalViolations,MinorityDriver,MinorityOfficer,ArrestOrCitation,driver_gender_M,officer_gender_M,officer_race_Asian,officer_race_Black,officer_race_Hispanic,officer_race_White
0,0,27,0,2009,48.19,49.34,0,0,1,0,...,1,0,0,0,1,1,0,0,0,1
1,0,17,0,2009,70.3,28.17,0,1,0,0,...,1,0,0,0,0,1,0,0,0,1
2,0,31,1,2009,70.3,28.17,0,0,0,1,...,3,1,0,1,1,1,0,0,0,1
3,0,24,0,2009,55.19,42.87,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
4,0,36,0,2009,55.18,42.99,1,1,0,1,...,3,0,0,1,1,1,0,0,0,1


In [10]:
# Persist the prepared frame for the modelling step. The row index is written
# too (pandas default), so the file gains an unnamed leading column.
data2.to_csv(path_or_buf='machinelearning.csv')