In [1]:
import pandas as pd
import numpy as np

## Import Feature-Engineered Data

In [2]:
# Locations of the cleaned train/test CSVs read by this notebook.
train_path = '../input/sf-crimes-feature-engineering/train_clean.csv'
test_path = '../input/sf-crimes-feature-engineering/test_clean.csv'

# Read both files with pandas' default settings (default integer index).
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [3]:
# List the columns present in the test frame before any feature selection.
test_df.columns

Index(['Id', 'Dates', 'DayOfWeek', 'Address', 'X', 'Y', 'Year', 'Month',
       'Hour', 'IsDay', 'HourSin', 'HourCos', 'MonthSin', 'MonthCos',
       'DayOfWeekSin', 'DayOfWeekCos', 'PdDistrict_BAYVIEW',
       'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE', 'PdDistrict_MISSION',
       'PdDistrict_NORTHERN', 'PdDistrict_PARK', 'PdDistrict_RICHMOND',
       'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN'],
      dtype='object')

## Feature Selection

In [4]:
# Columns removed from both frames before modelling.
drop_cols = ['Dates', 'Address', 'DayOfWeek', 'Month', 'Hour']
# Extra columns removed from the training frame only (the test frame's column
# listing does not contain them).
train_only_cols = ['Descript', 'Resolution']

train_df = train_df.drop(columns=drop_cols + train_only_cols)
test_df = test_df.drop(columns=drop_cols)

In [5]:
# Check the training columns left after the drop; 'Category' is the prediction target.
train_df.columns

Index(['Category', 'X', 'Y', 'Year', 'IsDay', 'HourSin', 'HourCos', 'MonthSin',
       'MonthCos', 'DayOfWeekSin', 'DayOfWeekCos', 'PdDistrict_BAYVIEW',
       'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE', 'PdDistrict_MISSION',
       'PdDistrict_NORTHERN', 'PdDistrict_PARK', 'PdDistrict_RICHMOND',
       'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN'],
      dtype='object')

In [6]:
# Check the test columns left after the drop; 'Id' is kept for the submission file.
test_df.columns

Index(['Id', 'X', 'Y', 'Year', 'IsDay', 'HourSin', 'HourCos', 'MonthSin',
       'MonthCos', 'DayOfWeekSin', 'DayOfWeekCos', 'PdDistrict_BAYVIEW',
       'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE', 'PdDistrict_MISSION',
       'PdDistrict_NORTHERN', 'PdDistrict_PARK', 'PdDistrict_RICHMOND',
       'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN'],
      dtype='object')

## Separate Features (`train_x`, `test_x`) and Target (`train_y`)

In [7]:
# Model inputs, grouped by kind. The concatenation order below is the column
# order of the feature matrices, so keep it stable between train and test.
# NOTE(review): only the cosine halves of the day-of-week and month encodings are
# used, and HourSin/HourCos are left out entirely -- confirm this is intentional.
cyclical_cols = ['DayOfWeekCos', 'MonthCos']
coordinate_cols = ['X', 'Y']
calendar_cols = ['IsDay', 'Year']
district_cols = [
    'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
    'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
    'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL',
    'PdDistrict_TENDERLOIN',
]
feature_cols = cyclical_cols + coordinate_cols + calendar_cols + district_cols

# Training features and target.
train_x = train_df.loc[:, feature_cols]
train_y = train_df['Category']

# Test features, plus the row Ids needed later for the submission file.
test_ids = test_df['Id']
test_x = test_df.loc[:, feature_cols]

In [8]:
# Spot-check three random rows of the training feature matrix.
train_x.sample(3)

Unnamed: 0,DayOfWeekCos,MonthCos,X,Y,IsDay,Year,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
351868,0.62349,-1.0,-122.395971,37.743924,1,10,1,0,0,0,0,0,0,0,0,0
513141,-0.900969,0.5,-122.419672,37.76505,0,8,0,0,0,1,0,0,0,0,0,0
661859,-0.222521,1.0,-122.40916,37.761817,0,5,0,0,0,1,0,0,0,0,0,0


In [9]:
# Spot-check three random rows of the test feature matrix (same columns as train_x).
test_x.sample(3)

Unnamed: 0,DayOfWeekCos,MonthCos,X,Y,IsDay,Year,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
27105,1.0,0.8660254,-122.444028,37.772296,0,15,0,0,0,0,0,1,0,0,0,0
587309,0.62349,0.5,-122.407387,37.781069,1,7,0,0,0,0,0,0,0,1,0,0
827106,1.0,-1.83697e-16,-122.421876,37.764089,0,3,0,0,0,1,0,0,0,0,0,0


In [10]:
# Spot-check three random target values from the 'Category' column.
train_y.sample(3)

220322    25
118421    16
30433     16
Name: Category, dtype: int64

## Model Training, Validation, and Prediction

In [11]:
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

# Hold out 20% of the labelled rows for validation.
# A fixed seed makes the split -- and therefore the trained model and the
# reported log loss -- reproducible across kernel restarts.
SPLIT_SEED = 42
XTrain, XTest, YTrain, YTest = train_test_split(
    train_x, train_y, test_size=0.2, random_state=SPLIT_SEED
)

In [12]:
import xgboost as xgb

# Booster settings for the multiclass model.
params = dict(
    max_depth=3,                 # maximum depth of each tree
    eta=0.3,                     # learning rate applied at each boosting round
    silent=1,                    # quiet logging; NOTE(review): newer xgboost releases use 'verbosity' -- confirm installed version
    objective='multi:softprob',  # output one probability per class for every row
    num_class=39,                # number of target classes
)

# Number of boosting rounds to run.
num_rounds = 5

# Wrap the training split in xgboost's DMatrix container and fit the booster.
train_dmat = xgb.DMatrix(data=XTrain, label=YTrain)
model = xgb.train(params=params, dtrain=train_dmat, num_boost_round=num_rounds)

In [13]:
print('Predicting ...')
# Score the hold-out split (XTest from train_test_split), not the competition
# test set -- `test_dmat` here wraps validation rows.
test_dmat = xgb.DMatrix(data=XTest)
preds_proba = model.predict(test_dmat)
# Show the shape of the prediction array as the cell output.
preds_proba.shape

Predicting ...


(175610, 39)

In [14]:
from sklearn.metrics import accuracy_score, log_loss

# Multiclass log loss of the predicted probabilities on the hold-out split.
# All 39 class ids are passed explicitly as `labels`.
class_ids = np.arange(39)
holdout_log_loss = log_loss(YTest, preds_proba, labels=class_ids)
print('Log Loss: ', holdout_log_loss)

Log Loss:  2.763393812979634


In [15]:
# Disabled experiment: alternative scikit-learn classifiers (KNN, Gaussian naive
# Bayes, random forest). The code is wrapped in a bare triple-quoted string, so
# nothing inside it runs; the cell merely evaluates to that string, which is why
# its repr appears as the cell output.
'''
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

#model = KNeighborsClassifier(metric='euclidean')
#model = GaussianNB()

model = RandomForestClassifier()

print('Fitting ...')
model.fit(XTrain, YTrain)
'''

"\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.ensemble import RandomForestClassifier\n\n#model = KNeighborsClassifier(metric='euclidean')\n#model = GaussianNB()\n\nmodel = RandomForestClassifier()\n\nprint('Fitting ...')\nmodel.fit(XTrain, YTrain)\n"

In [16]:
# Set to False to skip scoring the competition test set.
PREDICT = True

# Submission columns, one per crime category, defined once so the header can
# never drift out of sync with itself. Position i names probability column i of
# the model output.
# NOTE(review): this assumes the integer labels in 'Category' were assigned in
# this (alphabetical) order upstream -- confirm against the feature-engineering step.
CATEGORY_COLS = [
    'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT',
    'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT',
    'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
    'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
    'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION',
    'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES',
    'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY',
    'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
    'WARRANTS', 'WEAPON LAWS',
]

if PREDICT:
    # Predict class probabilities for every row of the competition test set.
    test_dmat = xgb.DMatrix(test_x)
    res = model.predict(test_dmat)

    # One probability column per category, in label order.
    submission = pd.DataFrame(res, columns=CATEGORY_COLS)

    # Put 'Id' first. The raw values are inserted positionally so the result
    # does not depend on test_ids carrying the same index as `submission`
    # (index-aligned assignment would silently produce NaN Ids otherwise).
    submission.insert(0, 'Id', test_ids.values)


In [17]:
# Set to False to skip writing the submission file.
SUBMIT = True
if SUBMIT:
    # Write the submission without the DataFrame index ('Id' is already a column).
    submission_path = 'submission.csv'
    submission.to_csv(submission_path, index=False)
    print('Done Exporting !')
    # Print three random rows as a quick sanity check of the exported frame.
    preview = submission.sample(3)
    print(preview)

Done Exporting !
            Id     ARSON     ...       WARRANTS  WEAPON LAWS
129207  129207  0.011213     ...       0.046576     0.015932
544990  544990  0.009829     ...       0.024886     0.012054
247361  247361  0.012018     ...       0.066964     0.015604

[3 rows x 40 columns]
