## Data fields
* Dates - 범죄 사건의 타임스탬프
* Category - 범죄 사건 카테고리 (train.csv에만 있음). 이 변수를 예측하는 것이 이 경진대회의 과제임
* Descript - 범죄 사건에 대한 자세한 설명 (train.csv에만 있음)
* DayOfWeek - 요일
* PdDistrict - 경찰서 구의 이름
* Resolution - 범죄 사건이 어떻게 해결되었는지 (train.csv에만 있음)
* Address - 범죄 사건의 대략적인 주소
* X - 경도
* Y - 위도


* Dates - timestamp of the crime incident
* Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
* Descript - detailed description of the crime incident (only in train.csv)
* DayOfWeek - the day of the week
* PdDistrict - name of the Police Department District
* Resolution - how the crime incident was resolved (only in train.csv)
* Address - the approximate street address of the crime incident 
* X - Longitude 
* Y - Latitude 


* 데이터의 양이 많아짐으로써 데이터를 분석하는 전략이 달라짐
* 문자열, 숫자 넣는 방법, 좌표 데이터, 주소 데이터를 어떻게 분석해서 넣을 것인가에 대한 노하우를 알게 될 것임
* 엑셀로 데이터를 열어보는 것을 추천


In [10]:
import pandas as pd

In [11]:
# Read the competition CSVs; the "Dates" column is parsed into datetimes on load.
train = pd.read_csv("data/train.csv", parse_dates=["Dates"])
test = pd.read_csv("data/test.csv", parse_dates=["Dates"])

In [12]:
# (rows, columns) of the training frame as loaded.
train.shape

(878049, 9)

In [13]:
# (rows, columns) of the test frame as loaded.
test.shape

(884262, 7)

In [14]:
# Preview the first rows of the training data.
# Note on the Address column: intersections are written with a slash,
# e.g. "OAK ST / LAGUNA ST" means OAK ST & LAGUNA ST.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [15]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [16]:
# Start the feature list with the two coordinate columns (X, Y);
# further column names are appended to this list in a later cell.
feature_names = ["X", "Y"]
feature_names

['X', 'Y']

In [17]:
# Distinct police-district names present in the training data.
PdDistrict = train["PdDistrict"].unique()
PdDistrict

array(['NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND', 'CENTRAL',
       'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN'], dtype=object)

In [18]:
# One-hot encode PdDistrict: add one boolean column per district to both frames
# and register each new column name as a model feature.
for p in PdDistrict:
    train[p] = train['PdDistrict'] == p
    test[p] = test['PdDistrict'] == p
    # Guard the append so that re-running this cell does not register the same
    # district twice (duplicates would select the column twice in train[feature_names]).
    if p not in feature_names:
        feature_names.append(p)

In [19]:
# Shape of the training frame after adding the per-district boolean columns.
train.shape

(878049, 19)

In [20]:
# Shape of the test frame after adding the per-district boolean columns.
test.shape

(884262, 17)

In [21]:
# Preview the training data including the new per-district boolean columns.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,NORTHERN,PARK,INGLESIDE,BAYVIEW,RICHMOND,CENTRAL,TARAVAL,TENDERLOIN,MISSION,SOUTHERN
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,True,False,False,False,False,False,False,False,False,False
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,True,False,False,False,False,False,False,False,False,False
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,True,False,False,False,False,False,False,False,False,False
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,True,False,False,False,False,False,False,False,False,False
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,False,True,False,False,False,False,False,False,False,False


In [25]:
# Training feature matrix: only the columns listed in feature_names.
X_train = train[feature_names]

print(X_train.shape)
X_train.head()

(878049, 12)


Unnamed: 0,X,Y,NORTHERN,PARK,INGLESIDE,BAYVIEW,RICHMOND,CENTRAL,TARAVAL,TENDERLOIN,MISSION,SOUTHERN
0,-122.425892,37.774599,True,False,False,False,False,False,False,False,False,False
1,-122.425892,37.774599,True,False,False,False,False,False,False,False,False,False
2,-122.424363,37.800414,True,False,False,False,False,False,False,False,False,False
3,-122.426995,37.800873,True,False,False,False,False,False,False,False,False,False
4,-122.438738,37.771541,False,True,False,False,False,False,False,False,False,False


In [26]:
# Test feature matrix: the same feature_names columns, in the same order, as X_train.
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

(884262, 12)


Unnamed: 0,X,Y,NORTHERN,PARK,INGLESIDE,BAYVIEW,RICHMOND,CENTRAL,TARAVAL,TENDERLOIN,MISSION,SOUTHERN
0,-122.399588,37.735051,False,False,False,True,False,False,False,False,False,False
1,-122.391523,37.732432,False,False,False,True,False,False,False,False,False,False
2,-122.426002,37.792212,True,False,False,False,False,False,False,False,False,False
3,-122.437394,37.721412,False,False,True,False,False,False,False,False,False,False
4,-122.437394,37.721412,False,False,True,False,False,False,False,False,False,False


In [27]:
# Name of the target column to predict.
label_name = "Category"

# Training labels: the Category column of the training frame.
y_train = train[label_name]

print(y_train.shape)
y_train.head()

(878049,)


0          WARRANTS
1    OTHER OFFENSES
2    OTHER OFFENSES
3     LARCENY/THEFT
4     LARCENY/THEFT
Name: Category, dtype: object

### RandomForest

In [28]:
# from sklearn.ensemble import RandomForestClassifier

# seed = 37

# model = RandomForestClassifier(random_state=37, n_jobs=-1)
# model

In [29]:
# from sklearn.model_selection import cross_val_score

# %time score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss").mean()

# print("Score = {0:.5f}".format(score))

####  model, X_train, y_train, cv=5, 
CPU times: user 2min 54s, sys: 8.14 s, total: 3min 2s
Wall time: 1min 21s
Score = 0.26269


#### [San Francisco Crime Classification | Kaggle](https://www.kaggle.com/c/sf-crime#evaluation)
$$\text{logloss} = -\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{M} y_{ij}\log(p_{ij})$$

### XGBoost

In [30]:
import xgboost as xgb
# XGBoost classifier with n_estimators=5 and seed=37.
# NOTE(review): nthread/seed look like older aliases of n_jobs/random_state
# (both pairs show up in the model repr) — confirm against the installed xgboost version.
model = xgb.XGBClassifier(n_estimators=5, nthread=-1, seed=37)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=5,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=37, silent=True,
       subsample=1)

In [31]:
from sklearn.cross_validation import cross_val_score

%time score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss").mean()

print("Score = {0:.5f}".format(score))



CPU times: user 30min 9s, sys: 50 s, total: 30min 59s
Wall time: 33min 15s
Score = -3.15008


* 그라디언트 부스팅 머신이 기본임
* 섞어쓰는 것을 앙상블이라고 함
* 디시전+랜덤포레스트 => 배깅
* 스태킹은 그리 많이 안 씀
* 부스팅은 디시전 트리로 많이 씀. 첫 번째 트리의 예측과 정답(y_train)은 정확히 일치하지 않으므로, 그 차이를 두 번째 트리, 세 번째 트리...에 넣으며 정답과 예측의 차이를 줄여 나간다.
* 이렇게 트리를 계속 만들며 차이를 줄여 가는 것이 그라디언트 부스팅
* XGB의 장점은 현존하는 알고리즘의 장점을 모두 넣어둠
* 사이킷런에도 그라디언트부스팅이 있는데 사용해 보면 XGB만큼의 성능이 나오지 않음
* 웬만하면 XGB로 대부분 해결이 가능하고 파라메터가 많음

* LGBM도 상당히 좋음 속도가 훨씬 빠름
* 하나를 더 쓴다면 RF를 쓴다.
* 트리모델이나 리니어모델을 사용

* 엑셀과 시각화로 분석해 보는 것을 추천함

* 노트북을 2개 틀고 하나는 시각화 하나는 알고리즘을 돌려본다.
* 가능한 엑셀과 시각화로 분석하는 습관을 연습한다.

[Paperspace: Your full computer in the cloud](https://www.paperspace.com/)

### Predict

In [32]:
# Fit the classifier on the full training set (features X_train, labels y_train).
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=5,
       n_jobs=1, nthread=-1, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=37, silent=True,
       subsample=1)

In [33]:
# predict_proba returns predicted probabilities rather than a single predicted label.
predictions = model.predict_proba(X_test)

# Show the array's shape, then the probabilities for the first test row.
print(predictions.shape)
predictions[0]

(884262, 39)


array([ 0.02022124,  0.04888738,  0.01965614,  0.01970115,  0.0297361 ,
        0.02022987,  0.01999734,  0.03361598,  0.02000142,  0.0198145 ,
        0.01962891,  0.01973814,  0.02298402,  0.02227425,  0.01960465,
        0.02055035,  0.03948945,  0.01992091,  0.01971399,  0.03603652,
        0.03628583,  0.06173178,  0.01957521,  0.01971059,  0.02123524,
        0.02963921,  0.02004267,  0.02288588,  0.02053764,  0.01960653,
        0.02061543,  0.01970069,  0.02868956,  0.01957124,  0.02125099,
        0.03076947,  0.04041701,  0.03249181,  0.02344085], dtype=float32)

In [34]:
# Shape of the full prediction matrix (all rows, all columns).
# Not displayed: a cell only shows the value of its last expression.
predictions[:,:].shape
# Shape of the column at index 1, i.e. the second column (index 0 is the first).
predictions[:,1].shape

(884262,)

In [35]:
# Load the sample submission as a template, using the 'Id' column as the index.
submission = pd.read_csv('data/SampleSubmission.csv', index_col='Id')

# One column per crime category; each cell holds the probability of that category.
print(submission.shape)
submission.head()

(884262, 39)


Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [36]:
# The copy below is positional, so it is only correct when the prediction columns
# line up with the submission columns. Verify the class count and the class order
# first instead of silently writing probabilities under the wrong category.
assert predictions.shape[1] == len(submission.columns), "class count differs from submission columns"
assert list(model.classes_) == list(submission.columns), "class order differs from submission columns"

# Overwrite each placeholder column with the predicted probability for that category.
for i, column in enumerate(submission.columns):
    submission[column] = predictions[:, i]

print(submission.shape)
submission.head()

(884262, 39)


Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.020221,0.048887,0.019656,0.019701,0.029736,0.02023,0.019997,0.033616,0.020001,0.019814,...,0.019607,0.020615,0.019701,0.02869,0.019571,0.021251,0.030769,0.040417,0.032492,0.023441
1,0.020412,0.049349,0.019842,0.019887,0.030017,0.020421,0.020186,0.033933,0.02019,0.020001,...,0.019791,0.02081,0.019887,0.02896,0.019756,0.021451,0.03106,0.035408,0.032798,0.023662
2,0.019555,0.037231,0.019496,0.019323,0.041958,0.020435,0.019916,0.022232,0.019916,0.019586,...,0.019315,0.020528,0.019379,0.026135,0.01928,0.020988,0.032119,0.035625,0.026063,0.020339
3,0.01992,0.045586,0.01948,0.019524,0.02768,0.020048,0.020035,0.026835,0.019897,0.019637,...,0.01943,0.02043,0.019524,0.028432,0.019396,0.020668,0.033695,0.046232,0.026219,0.021701
4,0.01992,0.045586,0.01948,0.019524,0.02768,0.020048,0.020035,0.026835,0.019897,0.019637,...,0.01943,0.02043,0.019524,0.028432,0.019396,0.020668,0.033695,0.046232,0.026219,0.021701


In [37]:
# Write the submission to CSV; the index (loaded from the 'Id' column) is written
# as the first column because to_csv includes the index by default.
submission.to_csv("data/baseline-script.csv")