In [224]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, label_binarize, StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

In [225]:
train= pd.read_csv("train.csv",parse_dates=["Dates"],index_col=False)

In [226]:
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Id
0,2013-06-28 17:40:00,SEX OFFENSES FORCIBLE,"FORCIBLE RAPE, BODILY FORCE",Friday,MISSION,NONE,2100 Block of MISSION ST,-122.419331,37.762264,141546
1,2004-02-19 02:46:00,LIQUOR LAWS,CONSUMING ALCOHOL IN PUBLIC VIEW,Thursday,SOUTHERN,"ARREST, BOOKED",1000 Block of MARKET ST,-122.41134,37.781271,794152
2,2007-11-14 00:01:00,FRAUD,"CREDIT CARD, THEFT BY USE OF",Wednesday,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,37.775421,531205
3,2007-12-27 18:30:00,ROBBERY,ROBBERY OF A CHAIN STORE WITH A GUN,Thursday,BAYVIEW,DISTRICT ATTORNEY REFUSES TO PROSECUTE,2400 Block of SAN BRUNO AV,-122.404715,37.730161,523137
4,2012-09-09 17:02:00,OTHER OFFENSES,PROBATION VIOLATION,Sunday,SOUTHERN,"ARREST, BOOKED",4TH ST / STEVENSON ST,-122.405239,37.785265,200968


In [227]:
train = train.drop(["Descript","Resolution"],axis=1)

In [228]:
def datesplit(data):
    data["Year"] = data["Dates"].dt.year
    data["Month"] = data["Dates"].dt.month
    data["Day"] = data["Dates"].dt.day
    data["Hour"] = data["Dates"].dt.hour
    data["Minute"] = data["Dates"].dt.minute
    return data

In [229]:
train= datesplit(train)

In [230]:
train.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y,Id,Year,Month,Day,Hour,Minute
0,2013-06-28 17:40:00,SEX OFFENSES FORCIBLE,Friday,MISSION,2100 Block of MISSION ST,-122.419331,37.762264,141546,2013,6,28,17,40
1,2004-02-19 02:46:00,LIQUOR LAWS,Thursday,SOUTHERN,1000 Block of MARKET ST,-122.41134,37.781271,794152,2004,2,19,2,46
2,2007-11-14 00:01:00,FRAUD,Wednesday,SOUTHERN,800 Block of BRYANT ST,-122.403405,37.775421,531205,2007,11,14,0,1
3,2007-12-27 18:30:00,ROBBERY,Thursday,BAYVIEW,2400 Block of SAN BRUNO AV,-122.404715,37.730161,523137,2007,12,27,18,30
4,2012-09-09 17:02:00,OTHER OFFENSES,Sunday,SOUTHERN,4TH ST / STEVENSON ST,-122.405239,37.785265,200968,2012,9,9,17,2


In [231]:
#train = pd.concat([train,pd.get_dummies(train.Category)], axis=1)

In [232]:
cat_encoder=LabelEncoder()
cat_encoder.fit(train["Category"])
cat_encoder.classes_.shape

(36,)

In [233]:
train["CategoryEncoded"] = cat_encoder.transform(train["Category"])

In [234]:
train["CategoryEncoded"].head()

0    27
1    17
2    13
3    24
4    21
Name: CategoryEncoded, dtype: int64

In [235]:
train["Category"] = train["CategoryEncoded"]

In [236]:
train = pd.concat([train,pd.get_dummies(train.PdDistrict)], axis=1)
train = pd.concat([train,pd.get_dummies(train.DayOfWeek)], axis=1)

In [237]:
add_encoder = LabelEncoder()

In [238]:
train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
train["Intersection"]= train["Address"].apply(lambda x: 1 if "/" in x else 0)
train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
add_encoder.fit(train["Address"])
train["Address"]= add_encoder.transform(train["Address"])

In [239]:
train["Morning"] = train["Hour"].apply(lambda x: 1 if x>= 6 and x < 12 else 0)
train["Noon"] = train["Hour"].apply(lambda x: 1 if x>= 12 and x < 17 else 0)
train["Evening"] = train["Hour"].apply(lambda x: 1 if x>= 17 and x < 20 else 0)
train["Night"] = train["Hour"].apply(lambda x: 1 if x >= 20 or x < 6 else 0)
train["Fall"] = train["Month"].apply(lambda x: 1 if x>=3 and x <=5 else 0)
train["Winter"] = train["Month"].apply(lambda x: 1 if x>=6 and x <=8 else 0)
train["Spring"] = train["Month"].apply(lambda x: 1 if x>=9 and x <=11 else 0)
train["Summer"] = train["Month"].apply(lambda x: 1 if x>=12 or x <=2 else 0)

In [240]:
train.shape

(867873, 41)

In [241]:
train_columns = list(train.columns.values)
print(train_columns)

['Dates', 'Category', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'Id', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'CategoryEncoded', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'StreetNo', 'Intersection', 'Morning', 'Noon', 'Evening', 'Night', 'Fall', 'Winter', 'Spring', 'Summer']


In [242]:
train = train.drop(["CategoryEncoded","DayOfWeek"], axis=1)

In [243]:
train = train.drop(["PdDistrict"],axis=1)

In [244]:
train = train.drop(["Dates"],axis=1)

In [245]:
train['StreetNo'] = train['StreetNo'].astype(int)

In [246]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867873 entries, 0 to 867872
Data columns (total 37 columns):
Category        867873 non-null int64
Address         867873 non-null int64
X               867873 non-null float64
Y               867873 non-null float64
Id              867873 non-null int64
Year            867873 non-null int64
Month           867873 non-null int64
Day             867873 non-null int64
Hour            867873 non-null int64
Minute          867873 non-null int64
BAYVIEW         867873 non-null uint8
CENTRAL         867873 non-null uint8
INGLESIDE       867873 non-null uint8
MISSION         867873 non-null uint8
NORTHERN        867873 non-null uint8
PARK            867873 non-null uint8
RICHMOND        867873 non-null uint8
SOUTHERN        867873 non-null uint8
TARAVAL         867873 non-null uint8
TENDERLOIN      867873 non-null uint8
Friday          867873 non-null uint8
Monday          867873 non-null uint8
Saturday        867873 non-null uint8
Sunday     

In [247]:
training,validation = train_test_split(train, train_size=0.80,random_state=51)

In [248]:
#train = train.drop(["Category"],axis=1)

In [249]:
from catboost import CatBoostClassifier

In [250]:
model=CatBoostClassifier(iterations=100, depth=5, learning_rate=0.01, loss_function='MultiClass')

In [251]:
train.head()

Unnamed: 0,Category,Address,X,Y,Id,Year,Month,Day,Hour,Minute,...,StreetNo,Intersection,Morning,Noon,Evening,Night,Fall,Winter,Spring,Summer
0,27,4423,-122.419331,37.762264,141546,2013,6,28,17,40,...,2100,0,0,0,1,0,0,1,0,0
1,17,4354,-122.41134,37.781271,794152,2004,2,19,2,46,...,1000,0,0,0,0,1,0,0,0,1
2,13,3493,-122.403405,37.775421,531205,2007,11,14,0,1,...,800,0,0,0,0,1,0,0,1,0
3,24,4773,-122.404715,37.730161,523137,2007,12,27,18,30,...,2400,0,0,0,1,0,0,0,0,1
4,21,1760,-122.405239,37.785265,200968,2012,9,9,17,2,...,0,1,0,0,1,0,0,0,1,0


In [252]:
train.columns
features=[ 'Address', 'X', 'Y', 'Year', 'Month', 'Day', 'Hour',
       'Minute', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
       'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday',
       'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday',
       'StreetNo', 'Intersection', 'Morning', 'Noon', 'Evening', 'Night',
       'Fall', 'Winter', 'Spring', 'Summer']


In [253]:
test=pd.read_csv("test.csv",parse_dates=["Dates"],index_col=False)

In [254]:
test.head()

Unnamed: 0,Dates,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Id
0,2010-07-04 23:00:00,ATTEMPTED GRAND THEFT FROM PERSON,Sunday,TENDERLOIN,NONE,100 Block of POWELL ST,-122.407878,37.785968,349598
1,2004-06-26 22:43:00,BATTERY OF A POLICE OFFICER,Saturday,PARK,"ARREST, BOOKED",2400 Block of GEARY BL,-122.443597,37.782644,766313
2,2013-02-09 21:26:00,VIOLATION OF MUNICIPAL CODE,Saturday,SOUTHERN,"ARREST, CITED",HOWARD ST / 3RD ST,-122.400474,37.785029,169887
3,2006-12-03 22:30:00,POSS OF PROHIBITED WEAPON,Sunday,NORTHERN,"ARREST, CITED",VANNESS AV / HAYES ST,-122.419698,37.777301,594704
4,2014-09-21 08:45:00,PETTY THEFT FROM UNLOCKED AUTO,Sunday,PARK,NONE,BEULAH ST / SHRADER ST,-122.451488,37.767516,47900


In [255]:
test = test.drop(["Descript","Resolution"],axis=1)

In [256]:
def datesplit(data):
    data["Year"] = data["Dates"].dt.year
    data["Month"] = data["Dates"].dt.month
    data["Day"] = data["Dates"].dt.day
    data["Hour"] = data["Dates"].dt.hour
    data["Minute"] = data["Dates"].dt.minute
    return data

In [257]:
test= datesplit(test)

In [258]:
test = pd.concat([test,pd.get_dummies(test.PdDistrict)], axis=1)
test = pd.concat([test,pd.get_dummies(test.DayOfWeek)], axis=1)

In [259]:
add_encoder = LabelEncoder()

In [260]:
test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
test["Intersection"]= test["Address"].apply(lambda x: 1 if "/" in x else 0)
test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
add_encoder.fit(test["Address"])
test["Address"]= add_encoder.transform(test["Address"])

In [261]:
test["Morning"] = test["Hour"].apply(lambda x: 1 if x>= 6 and x < 12 else 0)
test["Noon"] = test["Hour"].apply(lambda x: 1 if x>= 12 and x < 17 else 0)
test["Evening"] = test["Hour"].apply(lambda x: 1 if x>= 17 and x < 20 else 0)
test["Night"] = test["Hour"].apply(lambda x: 1 if x >= 20 or x < 6 else 0)
test["Fall"] = test["Month"].apply(lambda x: 1 if x>=3 and x <=5 else 0)
test["Winter"] = test["Month"].apply(lambda x: 1 if x>=6 and x <=8 else 0)
test["Spring"] = test["Month"].apply(lambda x: 1 if x>=9 and x <=11 else 0)
test["Summer"] = test["Month"].apply(lambda x: 1 if x>=12 or x <=2 else 0)

In [262]:
test.shape

(10000, 39)

In [263]:
test.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y,Id,Year,Month,Day,...,StreetNo,Intersection,Morning,Noon,Evening,Night,Fall,Winter,Spring,Summer
0,2010-07-04 23:00:00,Sunday,TENDERLOIN,1187,-122.407878,37.785968,349598,2010,7,4,...,100,0,0,0,0,1,0,1,0,0
1,2004-06-26 22:43:00,Saturday,PARK,875,-122.443597,37.782644,766313,2004,6,26,...,2400,0,0,0,0,1,0,1,0,0
2,2013-02-09 21:26:00,Saturday,SOUTHERN,1864,-122.400474,37.785029,169887,2013,2,9,...,0,1,0,0,0,1,0,0,0,1
3,2006-12-03 22:30:00,Sunday,NORTHERN,2672,-122.419698,37.777301,594704,2006,12,3,...,0,1,0,0,0,1,0,0,0,1
4,2014-09-21 08:45:00,Sunday,PARK,455,-122.451488,37.767516,47900,2014,9,21,...,0,1,1,0,0,0,0,0,1,0


In [264]:
test = test.drop(["DayOfWeek"], axis=1)

In [265]:
test = test.drop(["PdDistrict","Dates"],axis=1)

In [266]:
test.shape

(10000, 36)

In [267]:
test['StreetNo'] = test['StreetNo'].astype(int)

In [268]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 36 columns):
Address         10000 non-null int64
X               10000 non-null float64
Y               10000 non-null float64
Id              10000 non-null int64
Year            10000 non-null int64
Month           10000 non-null int64
Day             10000 non-null int64
Hour            10000 non-null int64
Minute          10000 non-null int64
BAYVIEW         10000 non-null uint8
CENTRAL         10000 non-null uint8
INGLESIDE       10000 non-null uint8
MISSION         10000 non-null uint8
NORTHERN        10000 non-null uint8
PARK            10000 non-null uint8
RICHMOND        10000 non-null uint8
SOUTHERN        10000 non-null uint8
TARAVAL         10000 non-null uint8
TENDERLOIN      10000 non-null uint8
Friday          10000 non-null uint8
Monday          10000 non-null uint8
Saturday        10000 non-null uint8
Sunday          10000 non-null uint8
Thursday        10000 non-null uint8


In [269]:
test.shape

(10000, 36)

In [270]:
test_features=[ 'Address', 'X', 'Y', 'Year', 'Month', 'Day', 'Hour',
       'Minute', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
       'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday',
       'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday',
       'StreetNo', 'Intersection', 'Morning', 'Noon', 'Evening', 'Night',
       'Fall', 'Winter', 'Spring', 'Summer']


In [271]:
from sklearn.cluster import KMeans

In [272]:
dataXY_train = train[['X','Y']]
dataXY_test = test[['X','Y']]

kmeans=KMeans(n_clusters=2)

kmeans.fit(dataXY_train)
train['LocCluster']=kmeans.labels_

kmeans.fit(dataXY_test)


test['LocCluster'] = kmeans.labels_

train = train.drop(['X','Y'], axis=1)
test = test.drop(['X','Y'], axis=1)

In [273]:
train.columns

Index(['Category', 'Address', 'Id', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
       'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'StreetNo',
       'Intersection', 'Morning', 'Noon', 'Evening', 'Night', 'Fall', 'Winter',
       'Spring', 'Summer', 'LocCluster'],
      dtype='object')

In [274]:
features = ['Address', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
       'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'StreetNo',
       'Intersection', 'Morning', 'Noon', 'Evening', 'Night', 'Fall', 'Winter',
       'Spring', 'Summer', 'LocCluster']

test_features = ['Address', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
       'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'StreetNo',
       'Intersection', 'Morning', 'Noon', 'Evening', 'Night', 'Fall', 'Winter',
       'Spring', 'Summer', 'LocCluster']

In [275]:
training,validation = train_test_split(train, train_size=0.80,random_state=51)

In [276]:
model_catBoost = model.fit(training[features],training['Category'])

0:	learn: -3.5503541	total: 7.52s	remaining: 12m 24s
1:	learn: -3.5196886	total: 14s	remaining: 11m 24s
2:	learn: -3.4911373	total: 20.4s	remaining: 10m 58s
3:	learn: -3.4643379	total: 26.8s	remaining: 10m 42s
4:	learn: -3.4390727	total: 33.2s	remaining: 10m 30s
5:	learn: -3.4152065	total: 41.2s	remaining: 10m 45s
6:	learn: -3.3923921	total: 48.7s	remaining: 10m 46s
7:	learn: -3.3706830	total: 55.2s	remaining: 10m 35s
8:	learn: -3.3499987	total: 1m 1s	remaining: 10m 25s
9:	learn: -3.3302908	total: 1m 8s	remaining: 10m 14s
10:	learn: -3.3115041	total: 1m 14s	remaining: 10m 4s
11:	learn: -3.2933014	total: 1m 21s	remaining: 9m 55s
12:	learn: -3.2758054	total: 1m 29s	remaining: 9m 58s
13:	learn: -3.2589873	total: 1m 35s	remaining: 9m 49s
14:	learn: -3.2427534	total: 1m 44s	remaining: 9m 49s
15:	learn: -3.2271562	total: 1m 51s	remaining: 9m 44s
16:	learn: -3.2120803	total: 1m 59s	remaining: 9m 43s
17:	learn: -3.1976410	total: 2m 5s	remaining: 9m 33s
18:	learn: -3.1836712	total: 2m 12s	remai

In [277]:
ypredict=model_catBoost.predict_proba(validation[features])

In [278]:
ypredict.shape

(173575, 36)

In [279]:
print(log_loss(validation['Category'],ypredict))

2.6664504383393197


In [280]:
ypredict=model_catBoost.predict_proba(test[test_features])

In [281]:
ypredict.shape

(10000, 36)

In [282]:
result=pd.DataFrame(ypredict,columns=cat_encoder.classes_)
result['Id']=test['Id']

In [283]:
result.to_csv('predProbCatBoost26Oct.csv',index=False,index_label='Id')

In [284]:
result.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS,Id
0,0.011435,0.084476,0.01118,0.010814,0.046688,0.012908,0.010931,0.043209,0.011852,0.012028,...,0.017382,0.012501,0.011176,0.038195,0.01511,0.039221,0.034283,0.027079,0.013246,349598
1,0.012227,0.093174,0.010465,0.010564,0.039209,0.014175,0.011969,0.053393,0.015843,0.010815,...,0.013107,0.014668,0.010966,0.035481,0.018744,0.046722,0.044241,0.049689,0.019656,766313
2,0.011021,0.050343,0.010236,0.01033,0.011634,0.01295,0.016088,0.060123,0.015655,0.010286,...,0.011636,0.01468,0.010292,0.027283,0.011233,0.030803,0.025625,0.051423,0.018632,169887
3,0.010843,0.046989,0.010248,0.010279,0.011365,0.013175,0.013716,0.04147,0.013147,0.010313,...,0.012195,0.012819,0.010268,0.022007,0.010766,0.039318,0.069453,0.035048,0.015317,594704
4,0.010959,0.05309,0.010266,0.010385,0.011693,0.014573,0.015275,0.05902,0.015023,0.010305,...,0.011829,0.014362,0.010315,0.02919,0.011432,0.028503,0.026618,0.056235,0.018231,47900
