### Load dataset

In [2]:
import pandas as pd

# Read the complaint extract (path is relative to the notebook's working directory)
DATA_FILE = 'NYPD_Complaint_Data_COMPLETE_DATA_ONLY_2.csv'
df = pd.read_csv(DATA_FILE)

# Preview the first five rows
df.head(5)

Unnamed: 0,CRM_ATPT_CPTD_CD,LAW_CAT_CD,OFNS_DESC,PD_DESC,PREM_TYP_DESC,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,COMPLETED,MISDEMEANOR,OFF. AGNST PUB ORD SENSBLTY &,AGGRAVATED HARASSMENT 2,RESIDENCE-HOUSE,25-44,BLACK,M,18-24,BLACK,F
1,COMPLETED,VIOLATION,HARRASSMENT 2,"HARASSMENT,SUBD 3,4,5",RESIDENCE - PUBLIC HOUSING,45-64,WHITE HISPANIC,F,25-44,BLACK,F
2,COMPLETED,FELONY,CRIMINAL MISCHIEF & RELATED OF,"MISCHIEF,CRIMINAL, UNCL 2ND",RESIDENCE - PUBLIC HOUSING,25-44,BLACK,M,25-44,BLACK,F
3,COMPLETED,VIOLATION,HARRASSMENT 2,"HARASSMENT,SUBD 3,4,5",RESIDENCE - APT. HOUSE,25-44,WHITE HISPANIC,M,25-44,ASIAN/PAC.ISL,M
4,COMPLETED,VIOLATION,HARRASSMENT 2,"HARASSMENT,SUBD 3,4,5",RESIDENCE-HOUSE,25-44,BLACK,F,18-24,BLACK,F


### Remove rows with missing values

In [3]:
import numpy as np

# Report the row count before filtering
print(f'Number of rows before removing rows with missing values: {len(df)}')

# Drop, in place, every row that has at least one missing entry
df.dropna(axis=0, how='any', inplace=True)

# Report the row count after filtering
print(f'Number of rows after removing rows with missing values: {len(df)}')

Number of rows before removing rows with missing values: 31710
Number of rows after removing rows with missing values: 31585


### Get the feature matrix and target vector

In [4]:
# Column that holds the label to predict
target_col = 'CRM_ATPT_CPTD_CD'

# Target vector: the label column on its own
y = df[target_col]

# Feature matrix: every column except the label.
# NOTE(review): the frame preview shows an 'Unnamed: 0' column (looks like a saved
# row index) that stays in X as a feature -- confirm whether that is intended.
X = df.drop(columns=[target_col])

# Show the dimensions of X, then of y
for part in (X, y):
    print(part.shape)

(31585, 10)
(31585,)


In [5]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in y with integer codes.
# The fitted encoder stays available as `le` for later cells.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)


In [6]:
class MultiColumnLabelEncoder:
    '''
    Label-encode several DataFrame columns in one call.

    Every selected column is run through its own, freshly created
    LabelEncoder (imported in an earlier cell). No encoder is stored on the
    instance: the integer codes are derived from whatever values are present
    in the frame passed to transform(), so codes produced for two different
    frames are not guaranteed to agree.
    '''
    def __init__(self,columns = None):
        # Column names to encode; None means "encode every column"
        self.columns = columns

    def fit(self,X,y=None):
        '''
        No-op kept so the object can be used as fit(...).transform(...).

        Returns self; X and y are ignored.
        '''
        return self

    def transform(self,X):
        '''
        Return a copy of X with the selected columns label-encoded.

        Columns listed in self.columns are encoded; if self.columns is None,
        every column of X is encoded. X itself is left unmodified.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
            # same (column name, Series) pairs on old and new pandas alike.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        '''Equivalent to fit(X, y) followed by transform(X).'''
        return self.fit(X,y).transform(X)


In [7]:
# Feature columns to turn into integer codes
categorical_columns = [
    'LAW_CAT_CD', 'OFNS_DESC', 'PD_DESC', 'PREM_TYP_DESC',
    'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX',
    'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX',
]

# Encode those columns and rebind X to the encoded copy
column_encoder = MultiColumnLabelEncoder(columns=categorical_columns)
X = column_encoder.fit_transform(X)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing (random_state=0, stratified on y)
split = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = split

# Show the distinct labels and how often each occurs, for train and then test
for labels in (y_train, y_test):
    print(np.unique(labels, return_counts=True))

(array([0, 1]), array([  345, 21764]))
(array([0, 1]), array([ 148, 9328]))


In [10]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default settings, fitted on the training split
sklearn_lr = LogisticRegression().fit(X_train, y_train)

# Accuracy on the held-out split.
# NOTE(review): the recorded run printed 0.98438..., which equals 9328 / 9476, the
# share of class 1 in the recorded test split -- compare against that baseline
# before reading the score as good.
accuracy = sklearn_lr.score(X_test, y_test)
print('Accuracy of Model: ' + str(accuracy))

Accuracy of Model: 0.984381595609962


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Candidate values for the penalty parameter C
Cs = [0.01, 0.1, 1]

# Candidate kernels
kernels = ['linear', 'rbf', 'sigmoid']


def _svc_test_score(C, kernel):
    '''
    Fit StandardScaler + SVC(C, kernel, class_weight='balanced', random_state=0)
    on the training split and return its accuracy on the test split, rounded
    to two decimal places.
    '''
    pipe_clf = Pipeline([
        ('StandardScaler', StandardScaler()),
        ('clf', SVC(C=C, kernel=kernel, class_weight='balanced', random_state=0)),
    ])
    pipe_clf.fit(X_train, y_train)
    return round(pipe_clf.score(X_test, y_test), 2)


# One [score, [C, kernel]] entry per combination; C varies slowest, kernel fastest.
# NOTE(review): scores are measured on X_test, so the "best" setting below is
# picked against the held-out split rather than a validation split.
score_settings = [[_svc_test_score(C, kernel), [C, kernel]]
                  for C in Cs
                  for kernel in kernels]

# Highest score first. The sort is stable and the scores are already rounded, so
# tied settings keep the order in which they were evaluated.
score_settings.sort(key=lambda entry: entry[0], reverse=True)

# Print every [score, setting] entry
print('The list of [score, setting] is:')
for entry in score_settings:
    print(entry)
print()

# Print the top-ranked setting
best_C, best_kernel = score_settings[0][1]
print('The best setting is:')
print('C: ' + str(best_C))
print('kernel: ' + best_kernel)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Short name -> estimator class, in the order the models are searched later
model_classes = (
    ('lr', LogisticRegression),
    ('mlp', MLPClassifier),
    ('dt', DecisionTreeClassifier),
    ('rf', RandomForestClassifier),
    ('svc', SVC),
)

# One default-configured instance per model, each seeded with random_state=0
clfs = {name: model_class(random_state=0) for name, model_class in model_classes}

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Wrap each classifier in its own pipeline: a fresh StandardScaler followed by the
# estimator object from clfs (the same object, not a copy). The step name 'clf' is
# what the 'clf__...' keys in the parameter grids refer to.
pipe_clfs = {
    name: Pipeline([('StandardScaler', StandardScaler()), ('clf', clf)])
    for name, clf in clfs.items()
}

In [None]:
# Registry of hyperparameter grids, keyed by the same short names as clfs
param_grids = dict()

In [None]:
# Candidate values for C: powers of ten from 10**-4 up to 10**4
C_range = [10 ** exponent for exponent in range(-4, 5)]

# Solvers tried with 'ovr'; the 'multinomial' grid uses the same list minus 'liblinear'
ovr_solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
multinomial_solvers = [solver for solver in ovr_solvers if solver != 'liblinear']

# One sub-grid per multi_class mode.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm it is still accepted by the installed version.
param_grid = [
    {'clf__multi_class': [mode],
     'clf__solver': solvers,
     'clf__C': C_range}
    for mode, solvers in (('ovr', ovr_solvers),
                          ('multinomial', multinomial_solvers))
]

# Register the grid for logistic regression
param_grids['lr'] = param_grid

In [None]:
# Candidate hidden-layer sizes and activation functions for the MLP
hidden_layer_sizes = [10, 100, 200]
activations = ['identity', 'logistic', 'tanh', 'relu']

param_grid = [
    {
        'clf__hidden_layer_sizes': hidden_layer_sizes,
        'clf__activation': activations,
    },
]

# Register the grid for the multi-layer perceptron
param_grids['mlp'] = param_grid

In [None]:
# Candidate split / leaf size thresholds for the decision tree
min_samples_split_options = [2, 10, 30]
min_samples_leaf_options = [1, 10, 30]

param_grid = [
    {
        'clf__min_samples_split': min_samples_split_options,
        'clf__min_samples_leaf': min_samples_leaf_options,
    },
]

# Register the grid for the decision tree
param_grids['dt'] = param_grid

In [None]:
# Candidate forest sizes and split / leaf size thresholds for the random forest
n_estimators_options = [2, 10, 30]
min_samples_split_options = [2, 10, 30]
min_samples_leaf_options = [1, 10, 30]

param_grid = [
    {
        'clf__n_estimators': n_estimators_options,
        'clf__min_samples_split': min_samples_split_options,
        'clf__min_samples_leaf': min_samples_leaf_options,
    },
]

# Register the grid for the random forest
param_grids['rf'] = param_grid

In [None]:
# Candidate values for the penalty parameter C and the kernel coefficient gamma
svc_C_options = [0.01, 0.1, 1, 10, 100]
svc_gamma_options = [0.01, 0.1, 1, 10, 100]

# Two sub-grids: gamma only affects the 'poly', 'rbf' and 'sigmoid' kernels, so
# crossing it with 'linear' would refit the same linear model once per gamma value.
# Searching 'linear' over C alone covers the same models with fewer fits.
param_grid = [
    {'clf__kernel': ['linear'],
     'clf__C': svc_C_options},
    {'clf__kernel': ['poly', 'rbf', 'sigmoid'],
     'clf__C': svc_C_options,
     'clf__gamma': svc_gamma_options},
]

# Register the grid for the support vector classifier
param_grids['svc'] = param_grid

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# The list of [best_score_, best_params_, best_estimator_], one entry per classifier
best_score_param_estimators = []

# For each classifier pipeline
for name, pipeline in pipe_clfs.items():
    # Exhaustive search over that classifier's grid, scored by accuracy with
    # 10-fold stratified cross-validation (shuffled, seeded for repeatability)
    gs = GridSearchCV(estimator=pipeline,
                      param_grid=param_grids[name],
                      scoring='accuracy',
                      n_jobs=-1,
                      cv=StratifiedKFold(n_splits=10,
                                         shuffle=True,
                                         random_state=0))

    # Search on the training split only. Fitting on the full X, y would let the
    # rows held out in X_test / y_test take part in hyperparameter selection and
    # in the refit of best_estimator_, so any later test score would be optimistic.
    gs = gs.fit(X_train, y_train)

    # Record the best cross-validated score, its parameters and the refitted pipeline
    best_score_param_estimators.append([gs.best_score_, gs.best_params_, gs.best_estimator_])