In [1]:
from IPython.core.magic import register_line_magic
import sys
import os
from subprocess import Popen, PIPE


def is_conda_environment():
    """Tell whether the running interpreter appears to live in a conda env.

    Two filesystem markers are checked, and both must exist:

    * a ``conda`` entry in the same directory as ``sys.executable``;
    * a ``conda-meta/history`` file under ``sys.prefix``.

    Returns
    -------
    bool
        True only when both markers are present.
    """
    # TODO: make this work with Conda.exe in Windows
    interpreter_dir = os.path.dirname(sys.executable)
    has_conda_binary = os.path.exists(os.path.join(interpreter_dir, 'conda'))
    has_history_file = os.path.exists(
        os.path.join(sys.prefix, 'conda-meta', 'history')
    )
    return has_conda_binary and has_history_file


@register_line_magic
def conda(args):
    """Use conda from the current kernel.

    Registered as the ``%conda`` line magic.  The text after ``%conda`` is
    split on whitespace and passed to the ``conda`` executable located next
    to ``sys.executable``.  For subcommands that act on an environment,
    ``--prefix <sys.prefix>`` is injected unless the caller already selected
    a target (``-p``/``--prefix``/``-n``/``--name``).  ``--yes`` is injected
    for subcommands that would otherwise prompt, because the notebook cannot
    answer the prompt.

    The child's stdout is echoed as it arrives; its stderr is collected and
    written to ``sys.stderr`` once stdout is exhausted.

    Parameters
    ----------
    args : str
        The remainder of the magic line, e.g. ``"install numpy"``.

    Raises
    ------
    ValueError
        If ``is_conda_environment()`` reports that the kernel is not running
        from a conda environment.
    """
    # TODO: make this work with Conda.exe in Windows
    # Imported locally so this block does not rely on changes to the import cell.
    import codecs
    import threading

    if not is_conda_environment():
        raise ValueError("The python kernel does not appear to be a conda environment.  "
                         "Please use ``%pip install`` instead.")

    conda_executable = os.path.join(os.path.dirname(sys.executable), 'conda')
    user_args = args.split()
    # A bare ``%conda`` has no subcommand: let conda print its own usage
    # instead of failing here with an IndexError.
    subcommand = user_args[0] if user_args else None
    rest = user_args[1:]

    def _has_option(*spellings):
        """True if any spelling is present, bare or in ``--opt=value`` form."""
        return any(
            arg == name or (name.startswith('--') and arg.startswith(name + '='))
            for arg in rest
            for name in spellings
        )

    injected = []

    # Because the notebook does not allow us to respond "yes" during the
    # installation, we need to insert --yes in the argument list for some commands
    if subcommand in ('install', 'update', 'upgrade', 'remove', 'uninstall', 'create'):
        if not _has_option('-y', '--yes'):
            injected.append('--yes')

    # Add --prefix to point conda at the current environment, unless the user
    # already named a target environment by prefix or by name.
    if subcommand in ('install', 'update', 'upgrade', 'remove', 'uninstall', 'list'):
        if not _has_option('-p', '--prefix', '-n', '--name'):
            injected.extend(['--prefix', sys.prefix])

    command = [conda_executable] + user_args[:1] + injected + rest

    stderr_chunks = []

    def _drain(pipe):
        """Collect every line from ``pipe`` so the child never blocks on it."""
        for line in iter(pipe.readline, b''):
            stderr_chunks.append(line)

    # Call conda from command line with subprocess & send results to stdout & stderr
    with Popen(command, stdout=PIPE, stderr=PIPE) as process:
        # stderr is drained concurrently: reading stdout to EOF first would
        # deadlock as soon as the child fills the stderr pipe buffer.
        drainer = threading.Thread(target=_drain, args=(process.stderr,), daemon=True)
        drainer.start()

        # An incremental decoder buffers partial multibyte sequences, so
        # byte-at-a-time reads cannot split a character and raise.
        stdout_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
        decoder = codecs.getincrementaldecoder(stdout_encoding)(errors='replace')

        # Read stdout byte by byte, as it includes real-time progress updates
        for chunk in iter(lambda: process.stdout.read(1), b''):
            text = decoder.decode(chunk)
            if text:
                sys.stdout.write(text)
        tail = decoder.decode(b'', final=True)
        if tail:
            sys.stdout.write(tail)

        # stderr is emitted afterwards, from this thread, because real-time
        # does not matter for it.
        drainer.join()
        stderr_encoding = getattr(sys.stderr, 'encoding', None) or 'utf-8'
        sys.stderr.write(b''.join(stderr_chunks).decode(stderr_encoding, errors='replace'))

In [2]:
# Install numpy through the %conda line magic registered in the first cell.
%conda install numpy

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [4]:
# Install pandas and scikit-learn through the %conda line magic, one command per line.
%conda install pandas
%conda install scikit-learn

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /Users/josephyeoh/opt/miniconda3

  added / updated specs:
    - scikit-learn


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    joblib-0.16.0              |             py_0         210 KB
    libgfortran-3.0.1          |       h93005f0_2         426 KB
    llvm-openmp-10.0.0         |       h28b9765_0         236 KB
    scikit-learn-0.23.2        |   py38h959d312_0         4.7 MB
    scipy-1.5.0                |   py38hbab996c_0        13.3 MB
    threadpoolctl-2.1.0        |     pyh5ca1d4c_0          17 KB
    ------------------------------------------------------------
      

In [1]:
# import relevant packages
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [3]:
# Read the collision records from the CSV in the working directory and preview them
DATA_FILE = 'Data-Collisions.csv'
df_data = pd.read_csv(DATA_FILE)
df_data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,SEVERITYCODE,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,...,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,2,-122.323148,47.70314,1,1307,1307,3502005,Matched,Intersection,37475.0,...,Wet,Daylight,,,,10,Entering at angle,0,0,N
1,1,-122.347294,47.647172,2,52200,52200,2607959,Matched,Block,,...,Wet,Dark - Street Lights On,,6354039.0,,11,From same direction - both going straight - bo...,0,0,N
2,1,-122.33454,47.607871,3,26700,26700,1482393,Matched,Block,,...,Dry,Daylight,,4323031.0,,32,One parked--one moving,0,0,N
3,1,-122.334803,47.604803,4,1144,1144,3503937,Matched,Block,,...,Dry,Daylight,,,,23,From same direction - all others,0,0,N
4,2,-122.306426,47.545739,5,17700,17700,1807429,Matched,Intersection,34387.0,...,Wet,Daylight,,4028032.0,,10,Entering at angle,0,0,N


In [13]:
# Size of the loaded table as (rows, columns)
df_data.shape

(194673, 38)

In [14]:
# Names of all columns in the loaded table
df_data.columns

Index(['SEVERITYCODE', 'X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO',
       'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE',
       'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SEVERITYDESC', 'COLLISIONTYPE',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INCDATE',
       'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC',
       'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

In [4]:
# Columns with SDOT internal codes are removed, together with the X/Y values,
# record keys, exception codes and the duplicated SEVERITYCODE.1 column.
UNUSED_COLUMNS = ['X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO', 'EXCEPTRSNCODE',
                  'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SDOT_COLCODE', 'SDOT_COLDESC']
# Reassign rather than mutate in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise KeyError.
df_data = df_data.drop(columns=UNUSED_COLUMNS, errors='ignore')
df_data.head()

Unnamed: 0,SEVERITYCODE,STATUS,ADDRTYPE,INTKEY,LOCATION,SEVERITYDESC,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,...,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,2,Matched,Intersection,37475.0,5TH AVE NE AND NE 103RD ST,Injury Collision,Angles,2,0,0,...,Wet,Daylight,,,,10,Entering at angle,0,0,N
1,1,Matched,Block,,AURORA BR BETWEEN RAYE ST AND BRIDGE WAY N,Property Damage Only Collision,Sideswipe,2,0,0,...,Wet,Dark - Street Lights On,,6354039.0,,11,From same direction - both going straight - bo...,0,0,N
2,1,Matched,Block,,4TH AVE BETWEEN SENECA ST AND UNIVERSITY ST,Property Damage Only Collision,Parked Car,4,0,0,...,Dry,Daylight,,4323031.0,,32,One parked--one moving,0,0,N
3,1,Matched,Block,,2ND AVE BETWEEN MARION ST AND MADISON ST,Property Damage Only Collision,Other,3,0,0,...,Dry,Daylight,,,,23,From same direction - all others,0,0,N
4,2,Matched,Intersection,34387.0,SWIFT AVE S AND SWIFT AV OFF RP,Injury Collision,Angles,2,0,0,...,Wet,Daylight,,4028032.0,,10,Entering at angle,0,0,N


In [3]:
# Column names currently present in df_data
df_data.columns

Index(['SEVERITYCODE', 'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION',
       'SEVERITYDESC', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INCDATE', 'INCDTTM', 'JUNCTIONTYPE',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC',
       'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

In [4]:
# dtype of each column in df_data
df_data.dtypes

SEVERITYCODE        int64
STATUS             object
ADDRTYPE           object
INTKEY            float64
LOCATION           object
SEVERITYDESC       object
COLLISIONTYPE      object
PERSONCOUNT         int64
PEDCOUNT            int64
PEDCYLCOUNT         int64
VEHCOUNT            int64
INCDATE            object
INCDTTM            object
JUNCTIONTYPE       object
INATTENTIONIND     object
UNDERINFL          object
WEATHER            object
ROADCOND           object
LIGHTCOND          object
PEDROWNOTGRNT      object
SDOTCOLNUM        float64
SPEEDING           object
ST_COLCODE         object
ST_COLDESC         object
SEGLANEKEY          int64
CROSSWALKKEY        int64
HITPARKEDCAR       object
dtype: object

# Feature Identification

In [5]:
# Features narrowed down to certain columns based upon project objective, including the predicted column
FEATURE_COLUMNS = ['WEATHER', 'ROADCOND', 'LIGHTCOND', 'ADDRTYPE', 'SEVERITYCODE']
# .copy() makes `features` independent of df_data, so the in-place edits made
# to it later neither touch df_data nor trigger SettingWithCopyWarning.
features = df_data[FEATURE_COLUMNS].copy()
features.head()

Unnamed: 0,WEATHER,ROADCOND,LIGHTCOND,ADDRTYPE,SEVERITYCODE
0,Overcast,Wet,Daylight,Intersection,2
1,Raining,Wet,Dark - Street Lights On,Block,1
2,Overcast,Dry,Daylight,Block,1
3,Clear,Dry,Daylight,Block,1
4,Raining,Wet,Daylight,Intersection,2


In [5]:
# Distinct values of the predicted column
features["SEVERITYCODE"].unique()

array([2, 1])

In [6]:
# Distinct values recorded in WEATHER (missing values show up as nan)
features['WEATHER'].unique()

array(['Overcast', 'Raining', 'Clear', nan, 'Unknown', 'Other', 'Snowing',
       'Fog/Smog/Smoke', 'Sleet/Hail/Freezing Rain', 'Blowing Sand/Dirt',
       'Severe Crosswind', 'Partly Cloudy'], dtype=object)

In [7]:
# Distinct values recorded in ROADCOND (missing values show up as nan)
features['ROADCOND'].unique()

array(['Wet', 'Dry', nan, 'Unknown', 'Snow/Slush', 'Ice', 'Other',
       'Sand/Mud/Dirt', 'Standing Water', 'Oil'], dtype=object)

In [8]:
# Distinct values recorded in LIGHTCOND (missing values show up as nan)
features['LIGHTCOND'].unique()

array(['Daylight', 'Dark - Street Lights On', 'Dark - No Street Lights',
       nan, 'Unknown', 'Dusk', 'Dawn', 'Dark - Street Lights Off',
       'Other', 'Dark - Unknown Lighting'], dtype=object)

In [9]:
# Distinct values recorded in ADDRTYPE (missing values show up as nan)
features['ADDRTYPE'].unique()

array(['Intersection', 'Block', 'Alley', nan], dtype=object)

# Data Cleaning

In [17]:
# Number of missing values in each column
features.isna().sum()

WEATHER         5081
ROADCOND        5012
LIGHTCOND       5170
ADDRTYPE        1926
SEVERITYCODE       0
dtype: int64

In [6]:
# Discard every row that has a NaN in at least one column
features = features.dropna(axis='index', how='any')

In [19]:
# Re-count missing values per column to check the result of the NaN removal
features.isna().sum()

WEATHER         0
ROADCOND        0
LIGHTCOND       0
ADDRTYPE        0
SEVERITYCODE    0
dtype: int64

In [5]:
# Current size of the feature table as (rows, columns)
features.shape

(187525, 5)

In [16]:
# Column names currently present in the feature table
features.columns

Index(['WEATHER', 'ROADCOND', 'LIGHTCOND', 'ADDRTYPE'], dtype='object')

Remove Unknown Values

In [7]:
# Remove Unknown values in WEATHER column
# A boolean mask keeps every row whose WEATHER is not 'Unknown'; with the
# unique row labels used here this matches dropping those rows by index.
# .copy() yields an independent frame, so no in-place mutation of a slice occurs.
features = features[features['WEATHER'] != 'Unknown'].copy()
features.shape

(173466, 5)

In [8]:
# Remove Unknown values in ROADCOND column
# A boolean mask keeps every row whose ROADCOND is not 'Unknown'; with the
# unique row labels used here this matches dropping those rows by index.
# .copy() yields an independent frame, so no in-place mutation of a slice occurs.
features = features[features['ROADCOND'] != 'Unknown'].copy()
features.shape

(172081, 5)

In [9]:
# Remove Unknown values in LIGHTCOND column
# A boolean mask keeps every row whose LIGHTCOND is not 'Unknown'; with the
# unique row labels used here this matches dropping those rows by index.
# .copy() yields an independent frame, so no in-place mutation of a slice occurs.
features = features[features['LIGHTCOND'] != 'Unknown'].copy()
features.shape

(169781, 5)

In [17]:
# Show the remaining distinct values of each categorical column, one array per column
for column_name in ('WEATHER', 'ROADCOND', 'LIGHTCOND', 'ADDRTYPE'):
    print(features[column_name].unique())

['Overcast' 'Raining' 'Clear' 'Snowing' 'Other' 'Fog/Smog/Smoke'
 'Sleet/Hail/Freezing Rain' 'Blowing Sand/Dirt' 'Severe Crosswind'
 'Partly Cloudy']
['Wet' 'Dry' 'Snow/Slush' 'Ice' 'Other' 'Sand/Mud/Dirt' 'Standing Water'
 'Oil']
['Daylight' 'Dark - Street Lights On' 'Dark - No Street Lights' 'Dusk'
 'Dawn' 'Dark - Street Lights Off' 'Other' 'Dark - Unknown Lighting']
['Intersection' 'Block' 'Alley']


Remove 'Other' Values

In [10]:
# Remove 'Other' values from  WEATHER column
# A boolean mask keeps every row whose WEATHER is not 'Other'; with the
# unique row labels used here this matches dropping those rows by index.
# .copy() yields an independent frame, so no in-place mutation of a slice occurs.
features = features[features['WEATHER'] != 'Other'].copy()
features.shape

(169528, 5)

In [11]:
# Remove 'Other' values from  ROADCOND column
# A boolean mask keeps every row whose ROADCOND is not 'Other'; with the
# unique row labels used here this matches dropping those rows by index.
# .copy() yields an independent frame, so no in-place mutation of a slice occurs.
features = features[features['ROADCOND'] != 'Other'].copy()
features.shape

(169428, 5)

In [12]:
# Remove 'Other' values in LIGHTCOND column
# A boolean mask keeps every row whose LIGHTCOND is not 'Other'; with the
# unique row labels used here this matches dropping those rows by index.
# .copy() yields an independent frame, so no in-place mutation of a slice occurs.
features = features[features['LIGHTCOND'] != 'Other'].copy()
features.shape

(169247, 5)

In [36]:
# Show the remaining distinct values of each categorical column, one array per column
for column_name in ('WEATHER', 'ROADCOND', 'LIGHTCOND', 'ADDRTYPE'):
    print(features[column_name].unique())

['Overcast' 'Raining' 'Clear' 'Snowing' 'Fog/Smog/Smoke'
 'Sleet/Hail/Freezing Rain' 'Blowing Sand/Dirt' 'Severe Crosswind'
 'Partly Cloudy']
['Wet' 'Dry' 'Snow/Slush' 'Ice' 'Sand/Mud/Dirt' 'Standing Water' 'Oil']
['Daylight' 'Dark - Street Lights On' 'Dark - No Street Lights' 'Dusk'
 'Dawn' 'Dark - Street Lights Off' 'Dark - Unknown Lighting']
['Intersection' 'Block' 'Alley']


## Feature Encoding 

In [13]:
# One indicator column per ADDRTYPE category.
# dtype=int pins the indicators to 0/1 integers; the default dtype of
# get_dummies differs between pandas versions.
addr_dummy = pd.get_dummies(features['ADDRTYPE'], dtype=int)
addr_dummy.head()

Unnamed: 0,Alley,Block,Intersection
0,0,0,1
1,0,1,0
2,0,1,0
3,0,1,0
4,0,0,1


In [14]:
# One indicator column per WEATHER category.
# dtype=int pins the indicators to 0/1 integers; the default dtype of
# get_dummies differs between pandas versions.
weather_dummy = pd.get_dummies(features['WEATHER'], dtype=int)
weather_dummy.head()

Unnamed: 0,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,Overcast,Partly Cloudy,Raining,Severe Crosswind,Sleet/Hail/Freezing Rain,Snowing
0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0


In [15]:
# One indicator column per LIGHTCOND category.
# dtype=int pins the indicators to 0/1 integers; the default dtype of
# get_dummies differs between pandas versions.
lightcond_dummy = pd.get_dummies(features['LIGHTCOND'], dtype=int)
lightcond_dummy.head()

Unnamed: 0,Dark - No Street Lights,Dark - Street Lights Off,Dark - Street Lights On,Dark - Unknown Lighting,Dawn,Daylight,Dusk
0,0,0,0,0,0,1,0
1,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0


In [16]:
# One indicator column per ROADCOND category.
# dtype=int pins the indicators to 0/1 integers; the default dtype of
# get_dummies differs between pandas versions.
roadcond_dummy = pd.get_dummies(features['ROADCOND'], dtype=int)
roadcond_dummy.head()

Unnamed: 0,Dry,Ice,Oil,Sand/Mud/Dirt,Snow/Slush,Standing Water,Wet
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,0,0,0,0,0,0,1


In [17]:
# Append the four indicator tables to the feature table column-wise,
# in the order: address type, weather, light condition, road condition
features = pd.concat(
    [features, addr_dummy, weather_dummy, lightcond_dummy, roadcond_dummy],
    axis=1,
)
features.head()

Unnamed: 0,WEATHER,ROADCOND,LIGHTCOND,ADDRTYPE,SEVERITYCODE,Alley,Block,Intersection,Blowing Sand/Dirt,Clear,...,Dawn,Daylight,Dusk,Dry,Ice,Oil,Sand/Mud/Dirt,Snow/Slush,Standing Water,Wet
0,Overcast,Wet,Daylight,Intersection,2,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
1,Raining,Wet,Dark - Street Lights On,Block,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Overcast,Dry,Daylight,Block,1,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,Clear,Dry,Daylight,Block,1,0,1,0,0,1,...,0,1,0,1,0,0,0,0,0,0
4,Raining,Wet,Daylight,Intersection,2,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1


In [18]:
# Remove original columns
# Reassign rather than mutate in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise KeyError.
features = features.drop(columns=['ADDRTYPE', 'WEATHER', 'ROADCOND', 'LIGHTCOND'],
                         errors='ignore')
features.head()

Unnamed: 0,SEVERITYCODE,Alley,Block,Intersection,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,Overcast,Partly Cloudy,Raining,...,Dawn,Daylight,Dusk,Dry,Ice,Oil,Sand/Mud/Dirt,Snow/Slush,Standing Water,Wet
0,2,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,1,0,1,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
3,1,0,1,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,2,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1


In [50]:
# Column names currently present in the feature table
features.columns

Index(['SEVERITYCODE', 'Alley', 'Block', 'Intersection', 'Blowing Sand/Dirt',
       'Clear', 'Fog/Smog/Smoke', 'Overcast', 'Partly Cloudy', 'Raining',
       'Severe Crosswind', 'Sleet/Hail/Freezing Rain', 'Snowing',
       'Dark - No Street Lights', 'Dark - Street Lights Off',
       'Dark - Street Lights On', 'Dark - Unknown Lighting', 'Dawn',
       'Daylight', 'Dusk', 'Dry', 'Ice', 'Oil', 'Sand/Mud/Dirt', 'Snow/Slush',
       'Standing Water', 'Wet'],
      dtype='object')

In [19]:
# Cast every column to the platform integer type and list the resulting dtypes
features = features.astype(int)
features.dtypes

SEVERITYCODE                int64
Alley                       int64
Block                       int64
Intersection                int64
Blowing Sand/Dirt           int64
Clear                       int64
Fog/Smog/Smoke              int64
Overcast                    int64
Partly Cloudy               int64
Raining                     int64
Severe Crosswind            int64
Sleet/Hail/Freezing Rain    int64
Snowing                     int64
Dark - No Street Lights     int64
Dark - Street Lights Off    int64
Dark - Street Lights On     int64
Dark - Unknown Lighting     int64
Dawn                        int64
Daylight                    int64
Dusk                        int64
Dry                         int64
Ice                         int64
Oil                         int64
Sand/Mud/Dirt               int64
Snow/Slush                  int64
Standing Water              int64
Wet                         int64
dtype: object

In [20]:
# Pull the target column out of the feature table: pop() returns the
# SEVERITYCODE series and removes that column from `features` in place
severity = features.pop('SEVERITYCODE')
features.head()

Unnamed: 0,Alley,Block,Intersection,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,Overcast,Partly Cloudy,Raining,Severe Crosswind,...,Dawn,Daylight,Dusk,Dry,Ice,Oil,Sand/Mud/Dirt,Snow/Slush,Standing Water,Wet
0,0,0,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


# Modelling

In [35]:
from sklearn.model_selection import train_test_split

# First split: 60% of the rows for training, 40% held out
x_train, x_holdout, y_train, y_holdout = train_test_split(
    features, severity, test_size=0.4, random_state=4
)

# Second split: the held-out rows are halved into a test set and an
# evaluation set
x_test, x_eval, y_test, y_eval = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=4
)

K Nearest Neighbors

In [None]:
# Sweep k over 1 .. k_range-1 and store the x_test accuracy of each fit
import numpy as np

k_range = 10
mean_acc = np.zeros(k_range - 1)

for slot, n in enumerate(range(1, k_range)):
    # Fit on the training split, then score predictions on the test split
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(x_train, y_train)
    yhat = neigh.predict(x_test)
    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)

In [26]:
# Position 0 of mean_acc holds k=1, so the best k is argmax + 1
best_accuracy = mean_acc.max()
best_k = mean_acc.argmax() + 1
print('The best accuracy is: ', best_accuracy, 'with k value of ', best_k)


The best accuracy is:  0.6579514904428492 with k value of  6


Decision Tree Model

In [27]:
# Sweep max_depth over 1 .. depth_max-1 and store the x_test accuracy of each tree
depth_max = 10
mean_acc = np.zeros((depth_max-1))

for n in range(1,depth_max):
    # random_state is fixed (same seed as the data split) so that tie-breaking
    # between equally good splits, and therefore the scores, are reproducible.
    Tree = DecisionTreeClassifier(criterion="entropy", max_depth = n, random_state=4)
    Tree.fit(x_train, y_train)
    predicted = Tree.predict(x_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, predicted)

mean_acc

array([0.66923691, 0.66923691, 0.66923691, 0.66920736, 0.66932553,
       0.66900056, 0.66900056, 0.6690301 , 0.66897102])

In [28]:
# Position 0 of mean_acc holds max_depth=1, so the best depth is argmax + 1
best_accuracy = mean_acc.max()
best_depth = mean_acc.argmax() + 1
print('The best accuracy is: ', best_accuracy, 'with max depth value of ', best_depth)


The best accuracy is:  0.6693255339891873 with max depth value of  5


SVM Models

In [23]:
# Support vector classifier with an RBF kernel: fit on the training split,
# then report accuracy on the test split
model = svm.SVC(kernel='rbf').fit(x_train, y_train)
pred = model.predict(x_test)

rbf_accuracy = metrics.accuracy_score(y_test, pred)
print('Accuracy: ', rbf_accuracy)

Accuracy:  0.6689414753759343


In [23]:
# Support vector classifier with a linear kernel: fit on the training split,
# then report accuracy on the test split
linear_model = svm.SVC(kernel='linear').fit(x_train, y_train)
linear_pred = linear_model.predict(x_test)

linear_accuracy = metrics.accuracy_score(y_test, linear_pred)
print('Accuracy: ', linear_accuracy)

Accuracy:  0.6692369050784366


Logistic Regression

In [25]:
# Logistic regression using the liblinear solver: fit on the training split,
# then report accuracy on the test split
lib_model = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)
lib_predict = lib_model.predict(x_test)

lib_accuracy = metrics.accuracy_score(y_test, lib_predict)
print('Accuracy: ', lib_accuracy)

Accuracy:  0.6692369050784366


In [26]:
# Predicted class probabilities of the liblinear logistic model for the test split
lib_proba = lib_model.predict_proba(x_test)

In [27]:
# Logistic regression using the saga solver.
# saga shuffles the data while optimising, so random_state is fixed (same seed
# as the data split) to make the fitted model and its score reproducible.
saga_model = LogisticRegression(C=0.01, solver='saga', random_state=4)
saga_model.fit(x_train, y_train)
saga_predict = saga_model.predict(x_test)

print('Accuracy: ', metrics.accuracy_score(y_test, saga_predict))

Accuracy:  0.6692369050784366


# Model Evaluation

In [36]:
# Evaluation of K nearest neighbors with optimum K = 6.
# The model is refit on the training split and scored on the evaluation split;
# f1 uses weighted averaging, jaccard_score is called with its defaults.
neigh = KNeighborsClassifier(n_neighbors=6)
neigh.fit(x_train, y_train)
yhat = neigh.predict(x_eval)

knn_f1 = f1_score(y_eval, yhat, average='weighted')
knn_jaccard = jaccard_score(y_eval, yhat)
print('f1 score: ', knn_f1)
print('jaccard score: ', knn_jaccard)

f1 score:  0.5833123880832966
jaccard score:  0.6477178804297915


In [38]:
# Evaluation of Decision Tree model (max_depth = 5), refit on the training
# split and scored on the evaluation split.
# random_state is fixed (same seed as the data split) so that tie-breaking
# between equally good splits, and therefore the scores, are reproducible.
DecisionTree = DecisionTreeClassifier(criterion="entropy", max_depth = 5, random_state=4)
DecisionTree.fit(x_train, y_train)
predicted = DecisionTree.predict(x_eval)

# f1 uses weighted averaging; jaccard_score is called with its defaults
print('f1 score: ', f1_score(y_eval, predicted, average='weighted'))
print('jaccard score: ', jaccard_score(y_eval, predicted))

f1 score:  0.5388692281215637
jaccard score:  0.6702721872506427


In [39]:
# Evaluation of the RBF-kernel support vector classifier, refit on the
# training split and scored on the evaluation split.
# f1 uses weighted averaging; jaccard_score is called with its defaults.
model = svm.SVC(kernel='rbf').fit(x_train, y_train)
pred = model.predict(x_eval)

svm_f1 = f1_score(y_eval, pred, average='weighted')
svm_jaccard = jaccard_score(y_eval, pred)
print('f1 score: ', svm_f1)
print('jaccard score: ', svm_jaccard)

f1 score:  0.5386098429262705
jaccard score:  0.6702225111551077


In [40]:
# Evaluation of the liblinear logistic regression, refit on the training
# split and scored on the evaluation split.
# f1 uses weighted averaging; jaccard_score is called with its defaults;
# log loss is computed from the predicted class probabilities.
log_model = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)
log_predict = log_model.predict(x_eval)
log_proba = log_model.predict_proba(x_eval)

log_f1 = f1_score(y_eval, log_predict, average='weighted')
log_jaccard = jaccard_score(y_eval, log_predict)
log_logloss = log_loss(y_eval, log_proba)
print('f1 score: ', log_f1)
print('jaccard score: ', log_jaccard)
print('log loss: ', log_logloss)

f1 score:  0.5381923104603031
jaccard score:  0.6704579025110783
log loss:  0.6183278520756956
