***Imports***

In [41]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder

***Load Dataset***

In [59]:
# Read the training and test CSVs from the local sf-crime/ directory.
train_set, test_set = map(pd.read_csv, ('sf-crime/train.csv', 'sf-crime/test.csv'))

***Changing Dates Column into Separate Columns***

In [60]:
# Break the 'Dates' strings ("<year>-<month>-<day> <time>") of train_set into
# integer Year / Month / Day / Time columns.
#
# The split uses vectorised string operations, so nothing is appended to
# module-level lists from inside `.apply`, and the new columns are built on
# train_set's own index so the concat below always lines up row by row.
date_fields = train_set['Dates'].str.split('-', expand=True)
# the third '-' separated field still holds "<day> <time>"
day_and_time = date_fields[2].str.split(' ', expand=True)

date_features = pd.DataFrame({
    'Year': date_fields[0].astype(int),
    'Month': date_fields[1].astype(int),
    'Day': day_and_time[0].astype(int),
    # time with its ':' separators removed, read as a single integer
    'Time': day_and_time[1].str.replace(':', '', regex=False).astype(int),
})

# swap the raw 'Dates' column for the four numeric features
train_set = pd.concat([train_set.drop(columns=['Dates']), date_features], axis=1)

In [61]:
# Break the 'Dates' strings ("<year>-<month>-<day> <time>") of test_set into
# integer Year / Month / Day / Time columns, exactly as done for the training
# data.
#
# The split uses vectorised string operations, so no helper function has to be
# redefined and nothing is appended to module-level lists from inside `.apply`;
# the new columns are built on test_set's own index so the concat below always
# lines up row by row.
date_fields = test_set['Dates'].str.split('-', expand=True)
# the third '-' separated field still holds "<day> <time>"
day_and_time = date_fields[2].str.split(' ', expand=True)

date_features = pd.DataFrame({
    'Year': date_fields[0].astype(int),
    'Month': date_fields[1].astype(int),
    'Day': day_and_time[0].astype(int),
    # time with its ':' separators removed, read as a single integer
    'Time': day_and_time[1].str.replace(':', '', regex=False).astype(int),
})

# swap the raw 'Dates' column for the four numeric features
test_set = pd.concat([test_set.drop(columns=['Dates']), date_features], axis=1)

***Encode Day of Week***

In [62]:
# Learn the DayOfWeek value -> ordinal code mapping from the training data only,
# then apply that same fitted mapping to both frames.
day_encoder = OrdinalEncoder()
day_encoder.fit(train_set[['DayOfWeek']])

train_set['DayOfWeek'] = day_encoder.transform(train_set[['DayOfWeek']])
test_set['DayOfWeek'] = day_encoder.transform(test_set[['DayOfWeek']])

***Encode Category***

In [66]:
# Fit a dedicated encoder on the target labels and replace 'Category' with its
# ordinal codes; the fitted encoder is kept so the codes can be decoded again.
category_encoder = OrdinalEncoder().fit(train_set[['Category']])
train_set['Category'] = category_encoder.transform(train_set[['Category']])

***Determine X and Y data***

In [73]:
# Feature columns fed to the model; every other column is left out.
columns = ['Year', 'Month', 'Day', 'Time', 'DayOfWeek', 'X', 'Y']

# same feature subset for both frames; the target is the encoded 'Category'
X_train, X_test = train_set[columns], test_set[columns]
Y_train = train_set['Category']

***Train Model***

In [79]:
# Wrap the feature frames in XGBoost DMatrix objects; only the training matrix
# carries labels.
dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)

# parameters for XGBoost
params = {
        'objective' : 'multi:softmax',
        # taken from the fitted label encoder rather than hard-coded, so the
        # class count always matches the encoded labels in Y_train
        'num_class' : len(category_encoder.categories_[0]),
        'max_depth' : 3,
        'eta' : 0.1 }

# train model for a fixed number of boosting rounds
num_rounds = 100
model = xgb.train(params, dtrain, num_boost_round=num_rounds)

***Predict on Test Data***

In [92]:
# Run the trained booster on the test matrix.
prediction = model.predict(data=dtest)

In [93]:
# Reshape the predictions into a single column, the 2-D layout passed to the
# encoder, then map the ordinal codes back to the original category values.
prediction = np.reshape(prediction, (-1, 1))
decode_prediction = category_encoder.inverse_transform(prediction)

In [104]:
# One-hot encode the decoded predictions: one integer 0/1 column per predicted
# category, each column named after the category itself (no prefix).
predictions = pd.get_dummies(
    pd.DataFrame(decode_prediction),
    prefix='',
    prefix_sep='',
    dtype=int,
)

In [135]:
# Give the prediction table a column for every category known to the fitted
# encoder; categories the model never predicted get an all-zero column.
#
# The category names are read straight from the encoder, which avoids decoding
# and one-hot encoding the whole training label column just to list them.
# Reindexing against that list also yields the same column order on every run,
# which a set difference of column names does not guarantee.
all_categories = list(category_encoder.categories_[0])
predictions = predictions.reindex(columns=all_categories, fill_value=0)

In [137]:
# attach the test-set ids as an 'Id' column (aligned on the row index)
predictions = predictions.assign(Id=test_set['Id'])

***Create Submission***

In [139]:
# Write the prediction table to disk without the DataFrame index column.
predictions.to_csv(path_or_buf='submission.csv', index=False)