Import the required libraries and modules

In [1]:
import sys
sys.path.insert(0, '..')
import zipfile
import numpy as np
import pandas as pd
import xgboost as xgb
import src.utilities.data_cleaning as dc
import src.utilities.outlier_handler as oh

Load the data

In [2]:
# Read train.csv directly out of the zip archive. The context managers close
# both the archive and the member handle once the frame has been loaded,
# instead of leaving them open for the lifetime of the kernel.
with zipfile.ZipFile('../data/raw/sf-crime.zip') as train_archive:
    with train_archive.open('train.csv') as train_zip:
        train_data = pd.read_csv(train_zip)

# Display up to 37 columns when rendering wide frames.
pd.set_option('display.max_columns', 37)

Summary of the data

In [3]:
# Column dtypes and non-null counts (printed to stdout).
train_data.info()
# Summary statistics of the numeric columns; shown as the cell's output.
train_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null object
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


Unnamed: 0,X,Y
count,878049.0,878049.0
mean,-122.422616,37.77102
std,0.030354,0.456893
min,-122.513642,37.707879
25%,-122.432952,37.752427
50%,-122.41642,37.775421
75%,-122.406959,37.784369
max,-120.5,90.0


Exploratory Data Analysis

In [4]:
# TODO: the exploratory analysis for this section has not been written yet.
# Drop 'Descript' and 'Resolution' from the training frame.
# NOTE(review): presumably dropped because they are not available as features
# at prediction time - confirm against the test set schema.
train_data = train_data.drop(columns = ['Descript', 'Resolution'])

Handle X and Y outliers

In [5]:
# handle_outliers() returns the processed dataframe and the calculated mean
# coordinates by district. The whole tuple is kept in `result_tuple` because
# the mean coordinates (element 1) are reused later for the test data.
result_tuple = oh.handle_outliers(train_data)
# Element 0 is the processed training frame.
train_data = result_tuple[0]
# Re-check the X/Y ranges after outlier handling.
train_data.describe()

Unnamed: 0,X,Y
count,878049.0,878049.0
mean,-122.422763,37.767035
std,0.025285,0.024164
min,-122.513642,37.707879
25%,-122.432952,37.752427
50%,-122.41642,37.775421
75%,-122.406959,37.784368
max,-122.364937,37.819975


Data pre-processing and feature engineering

In [6]:
# Run the project's cleaning / feature-engineering pipeline on the training frame.
# NOTE(review): center_scale = False presumably skips centering and scaling of
# the features - confirm in src/utilities/data_cleaning.
train_data = dc.main_clean(train_data, center_scale = False)

Model Fitting and Prediction

In [7]:
# Encode the crime category labels as integer codes, with the unique labels sorted.
# `category` is kept as the (codes, uniques) pair: the uniques are used again
# later to name the probability columns of the prediction table.
category = pd.factorize(train_data['Category'], sort=True)

# Target: the integer codes as a categorical Series.
y_train = pd.Series(category[0]).astype('category')

# Features: every column except the target.
X_train = train_data.drop(columns='Category')

Train the best model

In [8]:
# Gradient-boosted tree classifier over the crime categories.
# - objective is 'multi:softprob' because class probabilities are requested
#   later through predict_proba; 'multi:softmax' only yields hard labels and
#   is not compatible with predict_proba.
# - random_state replaces the deprecated `seed` argument (same seed value).
# NOTE(review): the stored output of this cell reports n_estimators=1, which
# does not match this call - re-run the cell so the output reflects the code.
clf = xgb.XGBClassifier(max_depth = 12, learning_rate = 0.05, n_estimators = 505,
                        objective = 'multi:softprob', gamma = 0.75, min_child_weight = 3,
                        subsample = 0.75, colsample_bytree = 0.8, reg_alpha = 5,
                        reg_lambda = 1.8, random_state = 2019, n_jobs = -1)
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.75,
              learning_rate=0.05, max_delta_step=0, max_depth=12,
              min_child_weight=3, missing=None, n_estimators=1, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=5, reg_lambda=1.8, scale_pos_weight=1, seed=2019,
              silent=None, subsample=0.75, verbosity=1)

Load test data and pre-process

In [9]:
# Read test.csv directly out of the zip archive. The context managers close
# both the archive and the member handle once the frame has been loaded.
with zipfile.ZipFile('../data/raw/sf-crime.zip') as test_archive:
    with test_archive.open('test.csv') as test_zip:
        test_data = pd.read_csv(test_zip)

# Summary statistics of the raw test set's numeric columns.
test_data.describe()

Unnamed: 0,Id,X,Y
count,884262.0,884262.0,884262.0
mean,442130.5,-122.422693,37.771476
std,255264.596205,0.030985,0.484824
min,0.0,-122.513642,37.707879
25%,221065.25,-122.433069,37.752374
50%,442130.5,-122.416517,37.775421
75%,663195.75,-122.406959,37.784353
max,884261.0,-120.5,90.0


Handle X and Y outliers using the mean coordinates obtained from train data

In [10]:
# Handle the test set's X/Y outliers with the mean coordinates computed from
# the training data (result_tuple[1]); element 0 of the returned tuple is the
# processed frame.
test_data = oh.handle_outliers(test_data, avg_XY = result_tuple[1])[0]
# Apply the same cleaning / feature-engineering call that was used for the training frame.
test_data = dc.main_clean(test_data, center_scale = False)
# Inspect the engineered feature columns.
test_data.describe()

Unnamed: 0,Id,X,Y,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,Patrol_Division,Intersection,DayOfWeek_X,DayOfWeek_Y,DayOfYear_X,...,Hour_Y,00:00-05:59,06:00-17:59,18:00-23:59,2003-2005,2006-2009,2010-2012,2013-2016,Polar_Rho,Polar_Phi,X_R30,Y_R30,X_R60,Y_R60,XY_PCA1,XY_PCA2,Nearest_Station,Nearest_Station_Bearing
count,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,...,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0,884262.0
mean,442130.5,-122.422858,37.766986,0.101317,0.097403,0.090566,0.136214,0.121024,0.056769,0.050858,0.178065,0.075336,0.092447,0.579505,0.296471,-0.02115,0.009007,0.012946,...,-0.241627,0.143728,0.536253,0.320019,0.250387,0.31616,0.228678,0.204775,128.115971,-0.299233,-56.19888,-115.132043,105.085341,-73.285555,-4.808892e-14,3.160258e-14,1.041684,-6.520268
std,255264.596205,0.025345,0.024179,0.301749,0.296506,0.286991,0.343016,0.326155,0.231402,0.219708,0.382568,0.263933,0.289655,0.493639,0.456701,0.706397,0.707444,0.701209,...,0.639708,0.350814,0.498684,0.466484,0.433236,0.464976,0.419981,0.403538,0.024168,0.000198,0.023605,0.02588,0.026303,0.023133,0.02668908,0.02268669,0.668142,99.884047
min,0.0,-122.513642,37.707879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.900969,-0.974928,-0.999963,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.054347,-0.299767,-56.246055,-115.220717,105.01365,-73.339157,-0.07861527,-0.06408853,0.005755,-179.986817
25%,221065.25,-122.43307,37.752361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.900969,-0.781831,-0.683919,...,-0.866025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.103112,-0.299385,-56.215369,-115.141947,105.066881,-73.300802,-0.01930443,-0.01380383,0.542348,-92.190295
50%,442130.5,-122.416517,37.775421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.222521,0.0,0.021516,...,-0.382683,0.0,1.0,0.0,0.0,0.0,0.0,0.0,128.111894,-0.299278,-56.205762,-115.124714,105.077221,-73.292835,-0.006869091,-0.004325124,0.934336,-18.732671
75%,663195.75,-122.406959,37.784353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.62349,0.781831,0.714673,...,0.258819,0.0,1.0,1.0,1.0,1.0,0.0,0.0,128.125712,-0.299106,-56.184521,-115.114528,105.095663,-73.271701,0.01207638,0.01094741,1.359485,81.771502
max,884261.0,-122.364751,37.820621,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.974928,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,128.206104,-0.29865,-56.135906,-115.066358,105.17664,-73.218411,0.09165545,0.07059576,4.663075,179.963528


Prediction

In [11]:
# 'Id' is removed before prediction; it is re-attached to the results afterwards.
# NOTE(review): assumes the remaining columns match X_train in name and order - confirm.
X_test = test_data.drop('Id', axis = 1)
# Per-class probabilities: one row per test record, one column per category.
y_pred = clf.predict_proba(X_test)

Build the submission table (writing it to CSV is currently commented out)

In [12]:
# Wrap the probability matrix in a frame (columns are integer positions for now).
result = pd.DataFrame(y_pred)
# Prepend the record identifier as the first column.
# NOTE(review): the Series is aligned on index, so this assumes test_data
# still has a default 0..n-1 index - confirm.
result.insert(0, 'Id', test_data['Id'])
# Header: 'Id' followed by the category labels from the factorize step
# (category[1]), so each probability column is named after its class.
column_names = np.insert(category[1], 0, 'Id')
result.columns = column_names
# Writing the submission file is currently disabled; uncomment to save it.
# result.to_csv('submission.csv', index = False)
print(result)

            Id     ARSON   ASSAULT  BAD CHECKS   BRIBERY  BURGLARY  \
0            0  0.025148  0.028122    0.025122  0.025119  0.026352   
1            1  0.024932  0.027911    0.024906  0.024903  0.024937   
2            2  0.025047  0.026331    0.025021  0.025017  0.030186   
3            3  0.025120  0.029266    0.025094  0.025090  0.026323   
4            4  0.025120  0.029266    0.025094  0.025090  0.026323   
...        ...       ...       ...         ...       ...       ...   
884257  884257  0.024866  0.026913    0.024840  0.024837  0.026057   
884258  884258  0.024803  0.026343    0.024778  0.024774  0.029893   
884259  884259  0.024989  0.025600    0.024963  0.024960  0.025600   
884260  884260  0.024976  0.026363    0.024950  0.024947  0.025577   
884261  884261  0.024861  0.026292    0.024835  0.024832  0.026328   

        DISORDERLY CONDUCT  DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  \
0                 0.025169                     0.025196       0.025506   
1          