### Load dataset

In [3]:
import pandas as pd

pd.set_option('display.max_columns', 50)

df = pd.read_csv('NYPD_Complaint_Data_Current_YTD.csv')

### Remove unnecessary columns

In [4]:
columns_remove = ['CMPLNT_NUM', 'ADDR_PCT_CD', 'BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT', 
                  'CMPLNT_TO_TM', 'HADEVELOPT', 'HOUSING_PSA', 'JURISDICTION_CODE', 'JURIS_DESC', 'KY_CD', 
                  'LOC_OF_OCCUR_DESC', 'PARKS_NM', 'PATROL_BORO', 'PD_CD', 
                  'RPT_DT', 'STATION_NAME', 'TRANSIT_DISTRICT', 'X_COORD_CD',
                  'Y_COORD_CD', 'Latitude', 'Longitude', 'Lat_Lon']

df = df.drop(columns_remove, axis=1)

df.head()

Unnamed: 0,CRM_ATPT_CPTD_CD,LAW_CAT_CD,OFNS_DESC,PD_DESC,PREM_TYP_DESC,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,COMPLETED,MISDEMEANOR,DANGEROUS WEAPONS,"WEAPONS, POSSESSION, ETC",STREET,,,,UNKNOWN,UNKNOWN,E
1,COMPLETED,FELONY,RAPE,RAPE 2,RESIDENCE - PUBLIC HOUSING,18-24,UNKNOWN,M,<18,BLACK,F
2,COMPLETED,MISDEMEANOR,OFF. AGNST PUB ORD SENSBLTY &,AGGRAVATED HARASSMENT 2,RESIDENCE-HOUSE,25-44,BLACK,M,18-24,BLACK,F
3,COMPLETED,FELONY,ROBBERY,"ROBBERY,DELIVERY PERSON",RESIDENCE - APT. HOUSE,UNKNOWN,WHITE HISPANIC,M,25-44,WHITE HISPANIC,M
4,COMPLETED,VIOLATION,HARRASSMENT 2,"HARASSMENT,SUBD 3,4,5",RESIDENCE - PUBLIC HOUSING,45-64,WHITE HISPANIC,F,25-44,BLACK,F


### Remove rows with null values

In [3]:
import numpy as np

# Replace UNKNOWN values with nulls
df.replace('UNKNOWN', np.NaN, inplace=True)

print('Number of rows before removing rows with missing values: ' + str(df.shape[0]))

# Remove rows with np.NaN
df.dropna(axis=0, inplace=True)

print('Number of rows after removing rows with missing values: ' + str(df.shape[0]))

Number of rows before removing rows with missing values: 109543
Number of rows after removing rows with missing values: 32061


### Get the feature and target vector

In [9]:
# Get the feature vector
X = df.drop('CRM_ATPT_CPTD_CD', axis = 1)

# Get the target vector
y = df['CRM_ATPT_CPTD_CD']

print(X.shape)
print(y.shape)

(32061, 10)
(32061,)


### Encode the target vector

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

In [15]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [16]:
X = MultiColumnLabelEncoder(columns = ['LAW_CAT_CD','OFNS_DESC','PD_DESC', 'PREM_TYP_DESC', 
                                       'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 
                                       'VIC_RACE', 'VIC_SEX']).fit_transform(X)

In [17]:
from sklearn.model_selection import train_test_split

# Randomly choose 30% of the data for testing (set randome_state as 0 and stratify as y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

(array(['ATTEMPTED', 'COMPLETED'], dtype=object), array([  354, 22088]))
(array(['ATTEMPTED', 'COMPLETED'], dtype=object), array([ 152, 9467]))


In [23]:
from sklearn.linear_model import LogisticRegression

sklearn_lr = LogisticRegression(random_state=0, class_weight='balanced')

sklearn_lr.fit(X_train, y_train)

print('Accuracy of Model: ' + str(sklearn_lr.score(X_test, y_test)))

Accuracy of Model: 0.7488304397546522


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# The list of value for hyperparameter C (penalty parameter)
Cs = [0.01, 0.1, 1]

# The list of choice for hyperparameter kernel
kernels = ['linear', 'rbf', 'sigmoid', 'polynomial']

# The list of [score, setting], where score is the score of the classifier and setting a pair of (C, kernel)
score_settings = []

# For each C
for C in Cs:
    # For each kernel
    for kernel in kernels:
        # Declare the classifier with hyperparameter C, kernel, class_weight, and random_state
        # Implement me
        clf = SVC(C = C, kernel = kernel, class_weight = 'balanced', random_state = 0)
        
        # The pipeline, with StandardScaler and clf defined above
        # Implement me
        pipe_clf = Pipeline([('StandardScaler',StandardScaler()),('clf',clf)])

        # Fit the pipeline
        # Implement me
        pipe_clf.fit(X_train, y_train)
        # Get the score (rounding to two decimal places)
        score = round(pipe_clf.score(X_test, y_test), 2)
        
        # Get the setting, which is a pair of (C, kernel)
        # Implement me
        setting = [C, kernel]

        # Append [score, setting] to score_settings
        # Implement me
        score_settings.append([score, setting])

        # Sort score_settings in descending order of score
# Implement me
score_settings = sorted(score_settings, key = lambda x: x[0], reverse= True)

# Print score_settings
print('The list of [score, setting] is:')
for score_setting in score_settings:
    print(score_setting)
print()

# Print the best setting
print('The best setting is:')
print('C: ' + str(score_settings[0][1][0]))
print('kernel: ' + score_settings[0][1][1])

The list of [score, setting] is:
[0.85, [1, 'rbf']]
[0.8, [0.1, 'rbf']]
[0.76, [0.01, 'linear']]
[0.76, [0.1, 'linear']]
[0.76, [1, 'linear']]
[0.73, [0.01, 'rbf']]
[0.64, [0.01, 'sigmoid']]
[0.6, [0.1, 'sigmoid']]
[0.59, [1, 'sigmoid']]

The best setting is:
C: 1
kernel: rbf


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

clfs = {'lr': LogisticRegression(random_state=0),
        'mlp': MLPClassifier(random_state=0),
        'dt': DecisionTreeClassifier(random_state=0),
        'rf': RandomForestClassifier(random_state=0),
        'svc': SVC(random_state=0)}

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe_clfs = {}

for name, clf in clfs.items():
    # Implement me
    pipe_clfs[name] = Pipeline([('StandardScaler',StandardScaler()), ('clf', clf)])

{'lr': Pipeline(memory=None,
      steps=[('StandardScaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False))]), 'mlp': Pipeline(memory=None,
      steps=[('StandardScaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
        beta_2=0.999, early_stopping=False, epsilon=1e-08,
        hidden_layer_sizes=(100,), learning_rate='constant',
        lear...       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
        warm_start=False))]), 'dt': Pipeline(memory=None,
      steps=[('StandardScaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', DecisionTreeClassifier(class_we

In [26]:
param_grids = {}

In [36]:
C_range = [10 ** i for i in range(-4, 5)]

param_grid = [{'clf__multi_class': ['ovr'], 
               'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
               'clf__C': C_range},
              {'clf__multi_class': ['multinomial'],
               'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
               'clf__C': C_range}]

param_grids['lr'] = param_grid

In [37]:
param_grid = [{'clf__hidden_layer_sizes': [10, 100, 200],
               'clf__activation': ['identity', 'logistic', 'tanh', 'relu']}]

param_grids['mlp'] = param_grid

In [38]:
param_grid = [{'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]

param_grids['dt'] = param_grid

In [39]:
param_grid = [{'clf__n_estimators': [2, 10, 30],
               'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]

param_grids['rf'] = param_grid

In [40]:
param_grid = [{'clf__C': [0.01, 0.1, 1, 10, 100],
               'clf__gamma': [0.01, 0.1, 1, 10, 100],
               'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid']}]

param_grids['svc'] = param_grid

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# The list of [best_score_, best_params_, best_estimator_]
best_score_param_estimators = []

# For each classifier
for name in pipe_clfs.keys():
    # GridSearchCV
    # Implement me
    gs = GridSearchCV(estimator=pipe_clfs[name],
                  param_grid=param_grids[name],
                  scoring='accuracy',
                  n_jobs=-1,
                  cv=StratifiedKFold(n_splits=10,
                                     shuffle=True,
                                     random_state=0))
    
    # Fit the pipeline
    # Implement me
    gs = gs.fit(X, y)
    
    # Update best_score_param_estimators
    best_score_param_estimators.append([gs.best_score_, gs.best_params_, gs.best_estimator_])