In [26]:
import pandas as pd
import numpy as np
import datetime
import scipy
from matplotlib import pyplot as plt
import pylab
import os
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
import datetime as dt
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
from sklearn.linear_model import Ridge, Lasso
import lightgbm as lgb
from sklearn.svm import SVR
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [309]:
#Main Preprocessing sales data
train = pd.read_csv("../data/train.csv", parse_dates=['Dates'])
test = pd.read_csv("../data/test.csv", parse_dates=['Dates'])
sampleSubmission = pd.read_csv("../data/sampleSubmission.csv")

In [270]:
train['Date']=pd.to_datetime(train['Dates'].dt.strftime('%Y-%m-%d'),format='%Y-%m-%d')
test['Date']=pd.to_datetime(test['Dates'].dt.strftime('%Y-%m-%d'),format='%Y-%m-%d')

In [271]:
# Preprocessing weather data
weather_data = pd.read_csv("../data/weather san_francisco_Jan2003-Dec15.csv")
weather_data["Date"]= weather_data["Date"].str.replace(" ", "")
weather_data["Date"] = pd.to_datetime(weather_data["Date"],format='%Y-%m-%d')

weather_data.columns = ['t_max','t_avg','t_min','dew_max','dew_avg','dew_min','hum_max',
                        'hum_avg','hum_min','wind_max','wind_avg','wind_min','pres_max','pres_avg','pres_min','percip','Date']

In [272]:
merged_train=pd.merge(train, weather_data, on ="Date", how="left")
merged_test=pd.merge(test, weather_data, on ="Date", how="left")

In [None]:
# merged_train,merged_test = train_test_split(merged, test_size=0.2, random_state=42)

In [216]:
def extract_features(train,test):
    data = train[train.X < -121]
    data = train[train.Y < 40]
    data = pd.concat([train,test],ignore_index=True)
    #data = data.fillna(0)

    # Dates
    data['Hour'] = data.Dates.dt.hour
    data['Day'] = data.Dates.dt.day
#    data['DayOfWeekNum'] = pd.Categorical.from_array(data.DayOfWeek).codes
    data['DayOfMonth'] = data.Dates.dt.day
    data['DayOfYear'] = data.Dates.dt.dayofyear
    data['WeekOfYear'] = data.Dates.dt.weekofyear
    data['Month'] = data.Dates.dt.month
    data['Year'] = data.Dates.dt.year
    data["Fri"] = np.where(data.DayOfWeek == "Friday",1,0)
    data["Sat"] = np.where(data.DayOfWeek == "Saturday",1,0)
    data["Weekend"] = data["Fri"] + data["Sat"]

    # PdDisrict
    data['PdDistrictCat'] = data.PdDistrict.astype("category").cat.codes
  
    
    # Lat/Long
    
    
    data["X_reduced"] = data.X.apply(lambda x: "{0:.2f}".format(x)).astype(float)
    data["Y_reduced"] = data.Y.apply(lambda x: "{0:.2f}".format(x)).astype(float)
    data["X_reduced_cat"] = data.X_reduced.astype("category").cat.codes
    data["Y_reduced_cat"] = data.Y_reduced.astype("category").cat.codes
    
    data["rot_45_X"] = .707*data["Y"] + .707*data["X"]
    data["rot_45_Y"] = .707* data["Y"] - .707* data["X"]

    data["rot_30_X"] = (1.732/2)*data["X"] + (1./2)*data["Y"]
    data["rot_30_Y"] = (1.732/2)* data["Y"] - (1./2)* data["X"]

    data["rot_60_X"] = (1./2)*data["X"] + (1.732/2)*data["Y"]
    data["rot_60_Y"] = (1./2)* data["Y"] - (1.732/2)* data["X"]

    data["radial_r"] = np.sqrt( np.power(data["Y"],2) + np.power(data["X"],2) )

    # Output feature - crime category
    dictionary=dict(enumerate(data.Category.astype("category").cat.categories))
    data["CategoryNum"]=data.Category.astype("category").cat.codes
    
    X_var= pd.concat([data.Hour,
                      data.Day,
 #                     data.DayOfWeekNum,
                      data.DayOfMonth,
                      pd.get_dummies(data.Month),
                      pd.get_dummies(data.Year),
                      data.PdDistrictCat,
                      data.rot_45_X,
                      data.rot_45_Y,
                      data.t_max,
                      data.percip
                     ], axis=1)
    
    train_final=pd.concat([X_var,data.CategoryNum], axis=1)[:train.shape[0]]
    test_final=X_var[train.shape[0]:]
    
    return train_final,test_final, dictionary

In [225]:
# Set parameters for XGBoost
def set_param():
    
    # setup parameters for xgboost
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.4
    param['silent'] = 0
    param['nthread'] = 4
    param['num_class'] = num_class
    param['eval_metric'] = 'mlogloss'

    # Model complexity
    param['max_depth'] = 8 #set to 8
    param['min_child_weight'] = 1
    param['gamma'] = 0 
    param['reg_alfa'] = 0.05

    param['subsample'] = 0.8
    param['colsample_bytree'] = 0.8 #set to 1

    # Imbalanced data
    param['max_delta_step'] = 1
    
    return param

In [267]:
# Load data and extract features
#data = pd.read_csv('../data/train.csv', parse_dates=['Dates'])

data, test_x, dict_y = extract_features(merged_train,merged_test)

classes=data.CategoryNum
data=data.drop(columns=['CategoryNum'])

train_X, validate_X,train_Y,validate_Y = train_test_split(data, classes, test_size=0.2, random_state=42)
   
dtrain = xgb.DMatrix(train_X, label=train_Y)
dvalidate = xgb.DMatrix(validate_X, label=validate_Y)

num_class = len(classes.unique())

param = set_param()
watchlist = [ (dtrain,'train'), (dvalidate, 'eval') ]
num_round = 10


In [248]:

# Train XGBoost    
bst = xgb.train(param, dtrain, num_round, watchlist);
yprob = bst.predict(dvalidate).reshape( validate_Y.shape[0], num_class)
ylabel = np.argmax(yprob, axis=1)

[0]	train-mlogloss:3.3391	eval-mlogloss:3.34298
[1]	train-mlogloss:3.08031	eval-mlogloss:3.08855
[2]	train-mlogloss:2.88443	eval-mlogloss:2.89804
[3]	train-mlogloss:2.74558	eval-mlogloss:2.76468
[4]	train-mlogloss:2.64765	eval-mlogloss:2.67166
[5]	train-mlogloss:2.57653	eval-mlogloss:2.60586
[6]	train-mlogloss:2.5246	eval-mlogloss:2.55882
[7]	train-mlogloss:2.48739	eval-mlogloss:2.52626
[8]	train-mlogloss:2.45789	eval-mlogloss:2.50098
[9]	train-mlogloss:2.43399	eval-mlogloss:2.48157


In [None]:
yprob = bst.predict(dvalidate).reshape( validate_Y.shape[0], num_class)

In [276]:
test_Y=merged_train.Category.astype("category").cat.codes

dtest = xgb.DMatrix(test_x)

In [277]:
test['Id']

0              0
1              1
2              2
3              3
4              4
           ...  
884257    884257
884258    884258
884259    884259
884260    884260
884261    884261
Name: Id, Length: 884262, dtype: int64

In [250]:
yprob = bst.predict(dtest)

In [312]:
submission1=pd.DataFrame(yprob,columns=sorted(merged_train.Category.unique().tolist()))
submission=pd.concat([test.Id,submission1],axis=1)

In [324]:
os.chdir('C:/Users/progbld/Documents/SF crimes/data')

In [325]:
submission.to_csv("submission.csv", index = False)

In [322]:
submission

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.007369,0.099167,0.003913,0.004256,0.028041,0.004984,0.006354,0.030083,0.005423,...,0.003921,0.006976,0.004110,0.047214,0.003743,0.006162,0.071894,0.095030,0.050796,0.024288
1,1,0.006338,0.123118,0.003720,0.004059,0.020566,0.004738,0.006611,0.046966,0.010541,...,0.003728,0.006494,0.003907,0.032229,0.003558,0.008061,0.058639,0.074147,0.059718,0.033738
2,2,0.004579,0.050630,0.003458,0.003390,0.070131,0.005983,0.004790,0.015370,0.007393,...,0.003346,0.009460,0.003540,0.022713,0.003258,0.007332,0.052922,0.067385,0.024348,0.006330
3,3,0.004350,0.103393,0.003305,0.003610,0.024316,0.005286,0.005802,0.024136,0.009554,...,0.003330,0.006146,0.003786,0.032287,0.003178,0.005142,0.043356,0.133285,0.030012,0.015360
4,4,0.004350,0.103393,0.003305,0.003610,0.024316,0.005286,0.005802,0.024136,0.009554,...,0.003330,0.006146,0.003786,0.032287,0.003178,0.005142,0.043356,0.133285,0.030012,0.015360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,884257,0.003681,0.093183,0.003191,0.003081,0.016878,0.010940,0.005029,0.034138,0.008404,...,0.004844,0.005971,0.003021,0.057103,0.002725,0.005672,0.034893,0.077367,0.043603,0.009962
884258,884258,0.004069,0.072112,0.005570,0.003203,0.048216,0.015889,0.005066,0.022909,0.008305,...,0.003225,0.009940,0.003374,0.032434,0.003079,0.009091,0.033524,0.047026,0.035011,0.007334
884259,884259,0.004443,0.106637,0.003625,0.003107,0.019927,0.004479,0.004287,0.016443,0.012543,...,0.003037,0.007253,0.003134,0.059531,0.002747,0.005534,0.046078,0.080962,0.042677,0.009277
884260,884260,0.014585,0.071398,0.003869,0.003219,0.030618,0.004993,0.005097,0.030641,0.004100,...,0.003942,0.006247,0.003247,0.041325,0.002941,0.008765,0.026737,0.044297,0.055525,0.014266


In [None]:
https://www.kaggle.com/lesibius/crime-scene-exploration-and-model-fit
    
    Do this!