In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, label_binarize, StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss



In [6]:
train= pd.read_csv("train.csv",parse_dates=["Dates"],index_col=False)

In [7]:
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Id
0,2013-06-28 17:40:00,SEX OFFENSES FORCIBLE,"FORCIBLE RAPE, BODILY FORCE",Friday,MISSION,NONE,2100 Block of MISSION ST,-122.419331,37.762264,141546
1,2004-02-19 02:46:00,LIQUOR LAWS,CONSUMING ALCOHOL IN PUBLIC VIEW,Thursday,SOUTHERN,"ARREST, BOOKED",1000 Block of MARKET ST,-122.41134,37.781271,794152
2,2007-11-14 00:01:00,FRAUD,"CREDIT CARD, THEFT BY USE OF",Wednesday,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,37.775421,531205
3,2007-12-27 18:30:00,ROBBERY,ROBBERY OF A CHAIN STORE WITH A GUN,Thursday,BAYVIEW,DISTRICT ATTORNEY REFUSES TO PROSECUTE,2400 Block of SAN BRUNO AV,-122.404715,37.730161,523137
4,2012-09-09 17:02:00,OTHER OFFENSES,PROBATION VIOLATION,Sunday,SOUTHERN,"ARREST, BOOKED",4TH ST / STEVENSON ST,-122.405239,37.785265,200968


In [8]:
train = train.drop(["Descript","Resolution"],axis=1)

In [9]:
def datesplit(data):
    data["Year"] = data["Dates"].dt.year
    data["Month"] = data["Dates"].dt.month
    data["Day"] = data["Dates"].dt.day
    data["Hour"] = data["Dates"].dt.hour
    data["Minute"] = data["Dates"].dt.minute
    return data

In [10]:
train= datesplit(train)

In [11]:
train.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y,Id,Year,Month,Day,Hour,Minute
0,2013-06-28 17:40:00,SEX OFFENSES FORCIBLE,Friday,MISSION,2100 Block of MISSION ST,-122.419331,37.762264,141546,2013,6,28,17,40
1,2004-02-19 02:46:00,LIQUOR LAWS,Thursday,SOUTHERN,1000 Block of MARKET ST,-122.41134,37.781271,794152,2004,2,19,2,46
2,2007-11-14 00:01:00,FRAUD,Wednesday,SOUTHERN,800 Block of BRYANT ST,-122.403405,37.775421,531205,2007,11,14,0,1
3,2007-12-27 18:30:00,ROBBERY,Thursday,BAYVIEW,2400 Block of SAN BRUNO AV,-122.404715,37.730161,523137,2007,12,27,18,30
4,2012-09-09 17:02:00,OTHER OFFENSES,Sunday,SOUTHERN,4TH ST / STEVENSON ST,-122.405239,37.785265,200968,2012,9,9,17,2


In [12]:
#train = pd.concat([train,pd.get_dummies(train.Category)], axis=1)

In [13]:
cat_encoder=LabelEncoder()
cat_encoder.fit(train["Category"])
cat_encoder.classes_.shape

(36,)

In [14]:
train["CategoryEncoded"] = cat_encoder.transform(train["Category"])

In [15]:
train["CategoryEncoded"].head()

0    27
1    17
2    13
3    24
4    21
Name: CategoryEncoded, dtype: int64

In [16]:
train["Category"] = train["CategoryEncoded"]

In [17]:
train = pd.concat([train,pd.get_dummies(train.PdDistrict)], axis=1)
train = pd.concat([train,pd.get_dummies(train.DayOfWeek)], axis=1)

In [18]:
add_encoder = LabelEncoder()

In [19]:
train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
train["Intersection"]= train["Address"].apply(lambda x: 1 if "/" in x else 0)
train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
add_encoder.fit(train["Address"])
train["Address"]= add_encoder.transform(train["Address"])

In [20]:
train["Morning"] = train["Hour"].apply(lambda x: 1 if x>= 6 and x < 12 else 0)
train["Noon"] = train["Hour"].apply(lambda x: 1 if x>= 12 and x < 17 else 0)
train["Evening"] = train["Hour"].apply(lambda x: 1 if x>= 17 and x < 20 else 0)
train["Night"] = train["Hour"].apply(lambda x: 1 if x >= 20 or x < 6 else 0)
train["Fall"] = train["Month"].apply(lambda x: 1 if x>=3 and x <=5 else 0)
train["Winter"] = train["Month"].apply(lambda x: 1 if x>=6 and x <=8 else 0)
train["Spring"] = train["Month"].apply(lambda x: 1 if x>=9 and x <=11 else 0)
train["Summer"] = train["Month"].apply(lambda x: 1 if x>=12 or x <=2 else 0)

In [21]:
train.shape

(867873, 41)

In [22]:
train_columns = list(train.columns.values)
print(train_columns)

['Dates', 'Category', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'Id', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'CategoryEncoded', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'StreetNo', 'Intersection', 'Morning', 'Noon', 'Evening', 'Night', 'Fall', 'Winter', 'Spring', 'Summer']


In [23]:
train = train.drop(["CategoryEncoded","DayOfWeek"], axis=1)

In [24]:
train = train.drop(["PdDistrict"],axis=1)

In [25]:
train = train.drop(["Dates"],axis=1)

In [26]:
train['StreetNo'] = train['StreetNo'].astype(int)

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867873 entries, 0 to 867872
Data columns (total 37 columns):
Category        867873 non-null int64
Address         867873 non-null int64
X               867873 non-null float64
Y               867873 non-null float64
Id              867873 non-null int64
Year            867873 non-null int64
Month           867873 non-null int64
Day             867873 non-null int64
Hour            867873 non-null int64
Minute          867873 non-null int64
BAYVIEW         867873 non-null uint8
CENTRAL         867873 non-null uint8
INGLESIDE       867873 non-null uint8
MISSION         867873 non-null uint8
NORTHERN        867873 non-null uint8
PARK            867873 non-null uint8
RICHMOND        867873 non-null uint8
SOUTHERN        867873 non-null uint8
TARAVAL         867873 non-null uint8
TENDERLOIN      867873 non-null uint8
Friday          867873 non-null uint8
Monday          867873 non-null uint8
Saturday        867873 non-null uint8
Sunday     

In [28]:
training,validation = train_test_split(train, train_size=0.80,random_state=51)

In [1]:
#train = train.drop(["Category"],axis=1)

In [29]:
from catboost import CatBoostClassifier

In [38]:
model=CatBoostClassifier(iterations=100, depth=5, learning_rate=0.01, loss_function='MultiClass')

In [33]:
train.head()

Unnamed: 0,Category,Address,Id,Year,Month,Day,Hour,Minute,BAYVIEW,CENTRAL,...,Intersection,Morning,Noon,Evening,Night,Fall,Winter,Spring,Summer,LocCluster
0,27,4423,141546,2013,6,28,17,40,0,0,...,0,0,0,1,0,0,1,0,0,12
1,17,4354,794152,2004,2,19,2,46,0,0,...,0,0,0,0,1,0,0,0,1,6
2,13,3493,531205,2007,11,14,0,1,0,0,...,0,0,0,0,1,0,0,1,0,30
3,24,4773,523137,2007,12,27,18,30,1,0,...,0,0,0,1,0,0,0,0,1,33
4,21,1760,200968,2012,9,9,17,2,0,0,...,1,0,0,1,0,0,0,1,0,9


In [32]:
from sklearn.cluster import KMeans 

dataXY_train = train[['X','Y']]

kmeans=KMeans(n_clusters=40)

kmeans.fit(dataXY_train)

train['LocCluster']=kmeans.labels_

train = train.drop(['X','Y'], axis=1)


In [34]:
train.columns

Index(['Category', 'Address', 'Id', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
       'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'StreetNo',
       'Intersection', 'Morning', 'Noon', 'Evening', 'Night', 'Fall', 'Winter',
       'Spring', 'Summer', 'LocCluster'],
      dtype='object')

In [51]:
train[features].head()

Unnamed: 0,Address,Id,Year,Month,Day,Hour,Minute,BAYVIEW,CENTRAL,INGLESIDE,...,Intersection,Morning,Noon,Evening,Night,Fall,Winter,Spring,Summer,LocCluster
0,4423,141546,2013,6,28,17,40,0,0,0,...,0,0,0,1,0,0,1,0,0,12
1,4354,794152,2004,2,19,2,46,0,0,0,...,0,0,0,0,1,0,0,0,1,6
2,3493,531205,2007,11,14,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,30
3,4773,523137,2007,12,27,18,30,1,0,0,...,0,0,0,1,0,0,0,0,1,33
4,1760,200968,2012,9,9,17,2,0,0,0,...,1,0,0,1,0,0,0,1,0,9


In [52]:
train['Category'].head()

0    27
1    17
2    13
3    24
4    21
Name: Category, dtype: int64

In [35]:
train.columns
features=[ 'Address', 'Id', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
       'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'StreetNo',
       'Intersection', 'Morning', 'Noon', 'Evening', 'Night', 'Fall', 'Winter',
       'Spring', 'Summer', 'LocCluster']


In [39]:
model_catBoost = model.fit(train[features],train['Category'])

0:	learn: -3.5496571	total: 4.02s	remaining: 6m 38s
1:	learn: -3.5184194	total: 8.05s	remaining: 6m 34s
2:	learn: -3.4893781	total: 12s	remaining: 6m 29s
3:	learn: -3.4622143	total: 16.1s	remaining: 6m 25s
4:	learn: -3.4366817	total: 20.1s	remaining: 6m 21s
5:	learn: -3.4125843	total: 24.1s	remaining: 6m 17s
6:	learn: -3.3897632	total: 28.2s	remaining: 6m 14s
7:	learn: -3.3680867	total: 32.3s	remaining: 6m 11s
8:	learn: -3.3474447	total: 36.5s	remaining: 6m 9s
9:	learn: -3.3277434	total: 40.8s	remaining: 6m 7s
10:	learn: -3.3089022	total: 45.1s	remaining: 6m 4s
11:	learn: -3.2908516	total: 49.3s	remaining: 6m 1s
12:	learn: -3.2735305	total: 53.5s	remaining: 5m 58s
13:	learn: -3.2568854	total: 57.8s	remaining: 5m 55s
14:	learn: -3.2408686	total: 1m 2s	remaining: 5m 51s
15:	learn: -3.2254379	total: 1m 6s	remaining: 5m 47s
16:	learn: -3.2103776	total: 1m 10s	remaining: 5m 44s
17:	learn: -3.1958489	total: 1m 14s	remaining: 5m 40s
18:	learn: -3.1819479	total: 1m 19s	remaining: 5m 38s
19:	le

In [41]:
training,validation = train_test_split(train, train_size=0.80,random_state=51)

In [44]:
ypredict=model_catBoost.predict_proba(validation[features])

In [None]:
ypredict.shape

In [45]:
print(log_loss(validation['Category'],ypredict))

2.659137816746002


In [46]:
test=pd.read_csv("test.csv",parse_dates=["Dates"],index_col=False)

In [None]:
test.head()

In [47]:
test = test.drop(["Descript","Resolution"],axis=1)

In [48]:
def datesplit(data):
    data["Year"] = data["Dates"].dt.year
    data["Month"] = data["Dates"].dt.month
    data["Day"] = data["Dates"].dt.day
    data["Hour"] = data["Dates"].dt.hour
    data["Minute"] = data["Dates"].dt.minute
    return data

In [49]:
test= datesplit(test)

In [None]:
test = pd.concat([test,pd.get_dummies(test.PdDistrict)], axis=1)
test = pd.concat([test,pd.get_dummies(test.DayOfWeek)], axis=1)

In [None]:
add_encoder = LabelEncoder()

In [None]:
test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
test["Intersection"]= test["Address"].apply(lambda x: 1 if "/" in x else 0)
test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
add_encoder.fit(test["Address"])
test["Address"]= add_encoder.transform(test["Address"])

In [None]:
test["Morning"] = test["Hour"].apply(lambda x: 1 if x>= 6 and x < 12 else 0)
test["Noon"] = test["Hour"].apply(lambda x: 1 if x>= 12 and x < 17 else 0)
test["Evening"] = test["Hour"].apply(lambda x: 1 if x>= 17 and x < 20 else 0)
test["Night"] = test["Hour"].apply(lambda x: 1 if x >= 20 or x < 6 else 0)
test["Fall"] = test["Month"].apply(lambda x: 1 if x>=3 and x <=5 else 0)
test["Winter"] = test["Month"].apply(lambda x: 1 if x>=6 and x <=8 else 0)
test["Spring"] = test["Month"].apply(lambda x: 1 if x>=9 and x <=11 else 0)
test["Summer"] = test["Month"].apply(lambda x: 1 if x>=12 or x <=2 else 0)

In [None]:
test.shape

In [None]:
test.head()

In [None]:
test = test.drop(["DayOfWeek"], axis=1)

In [None]:
test = test.drop(["PdDistrict","Dates"],axis=1)

In [None]:
test.shape

In [None]:
test['StreetNo'] = test['StreetNo'].astype(int)

In [None]:
test.info()

In [None]:
test.shape

In [None]:
test_features=[ 'Address', 'X', 'Y', 'Year', 'Month', 'Day', 'Hour',
       'Minute', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
       'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday',
       'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday',
       'StreetNo', 'Intersection', 'Morning', 'Noon', 'Evening', 'Night',
       'Fall', 'Winter', 'Spring', 'Summer']


In [None]:
ypredict=model_catBoost.predict_proba(test[test_features])

In [None]:
ypredict.shape

In [None]:
result=pd.DataFrame(ypredict,columns=cat_encoder.classes_)
result['Id']=test['Id']

In [None]:
result.to_csv('predictionCatBoostOct',index=False,index_label='Id')