In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv('Dataset/Train.csv')
train_df.head()

Unnamed: 0,INCIDENT_ID,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15,MULTIPLE_OFFENSE
0,CR_102659,04-JUL-04,0,36,34,2,1,5,6,1,6,1,174,1.0,92,29,36,0
1,CR_189752,18-JUL-17,1,37,37,0,0,11,17,1,6,1,236,1.0,103,142,34,1
2,CR_184637,15-MAR-17,0,3,2,3,5,1,0,2,3,1,174,1.0,110,93,34,1
3,CR_139071,13-FEB-09,0,33,32,2,1,7,1,1,6,1,249,1.0,72,29,34,1
4,CR_109335,13-APR-05,0,33,32,2,1,8,3,0,5,1,174,0.0,112,29,43,1


In [3]:
test_df = pd.read_csv('Dataset/Test.csv')
test_df.head()

Unnamed: 0,INCIDENT_ID,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,CR_195453,01-FEB-18,0,30,35,7,3,6,4,0,5,1,174,,72,119,23
1,CR_103520,05-MAR-04,0,44,44,1,3,7,1,4,6,1,316,0.0,12,29,34
2,CR_196089,27-JAN-18,0,34,33,3,5,2,7,3,0,1,316,1.0,72,0,34
3,CR_112195,18-AUG-06,7,3,2,3,5,9,8,0,5,1,174,1.0,112,87,34
4,CR_149832,31-OCT-11,0,7,8,7,3,2,7,1,5,1,174,0.0,112,93,43


## Train data cleaning

In [4]:
# dropping date & Incident Id as they are randomly assigned data. As per the problem statement the anonymous logs are
# the ones containing the pattern.

X = train_df.drop(['INCIDENT_ID', 'DATE'], axis=1)

In [5]:
X.isnull().sum()

X_1                   0
X_2                   0
X_3                   0
X_4                   0
X_5                   0
X_6                   0
X_7                   0
X_8                   0
X_9                   0
X_10                  0
X_11                  0
X_12                182
X_13                  0
X_14                  0
X_15                  0
MULTIPLE_OFFENSE      0
dtype: int64

In [6]:
# Since the missing values in X_12 are less than 1% of the whole package, we will go ahead and drop null values.
X.dropna(inplace=True)
X.isnull().sum()

X_1                 0
X_2                 0
X_3                 0
X_4                 0
X_5                 0
X_6                 0
X_7                 0
X_8                 0
X_9                 0
X_10                0
X_11                0
X_12                0
X_13                0
X_14                0
X_15                0
MULTIPLE_OFFENSE    0
dtype: int64

In [7]:
y = X['MULTIPLE_OFFENSE']
X = X.drop(['MULTIPLE_OFFENSE'], axis=1)

## Test data cleaning

In [8]:
test_df.isnull().sum()

INCIDENT_ID      0
DATE             0
X_1              0
X_2              0
X_3              0
X_4              0
X_5              0
X_6              0
X_7              0
X_8              0
X_9              0
X_10             0
X_11             0
X_12           127
X_13             0
X_14             0
X_15             0
dtype: int64

In [9]:
# We will go ahead and delete the two rows.
# mean fill the "Missing at random" values
test_df.fillna(train_df['X_12'].mean(), inplace=True)
n_test_df = test_df.drop(['INCIDENT_ID', 'DATE'], axis=1)
n_test_df.isnull().sum()

X_1     0
X_2     0
X_3     0
X_4     0
X_5     0
X_6     0
X_7     0
X_8     0
X_9     0
X_10    0
X_11    0
X_12    0
X_13    0
X_14    0
X_15    0
dtype: int64

## Model Training

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# To make sure train data is not overfitting we will check its score with k fold cross validation.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

model_random = RandomForestClassifier()
print(cross_val_score(model_random, X_train, y_train, cv=10, scoring='accuracy').mean())

0.991446199553627


In [20]:
#Since cross val score proved that the forest will be a perfect fot the data, will go ahead and train the data on the same.
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
model.score(X_test, y_test)

0.9945089757127772

In [22]:
# Perfect score without any hyperparameter tuning

prediction = model.predict(n_test_df)

output = pd.DataFrame({'INCIDENT_ID': test_df.INCIDENT_ID, 'MULTIPLE_OFFENSE': prediction})
output.to_csv('my_submission.csv', index=False)