# Exercise 5
## Lillian Mueller and Regina Hong 
Comparing logistic regression, decision tree, and k-nearest-neighbor classifiers on the iris dataset using cross-validation

In [2]:
from sklearn import (
    linear_model, 
    preprocessing,
    model_selection,
    metrics, 
    tree, 
    neighbors)
from sklearn.datasets import load_iris
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the iris dataset bundled with scikit-learn
iris_data = load_iris()

# Put the measurements in a DataFrame for easier reading
df_iris = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)

# Add the target as a class column and rename the numeric codes to class names
# 0 = setosa, 1 = versicolor, 2 = virginica
#
# The mapped column is assigned back explicitly. Calling
# df_iris['class'].replace(..., inplace=True) operates on a column selection,
# which under pandas Copy-on-Write does not update df_iris and would leave the
# integer codes in place.
df_iris['class'] = iris_data.target.tolist()
df_iris['class'] = df_iris['class'].replace(
    {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
)

# Create a numeric target column from the class names (ready for processing)
label_obj = preprocessing.LabelEncoder()
df_iris['class_level'] = label_obj.fit_transform(df_iris['class'])

# Only a random 4-row sample is displayed, not the whole frame
print("Full Dataset:")
df_iris.sample(4)

Full Dataset:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class,class_level
131,7.9,3.8,6.4,2.0,virginica,2
123,6.3,2.7,4.9,1.8,virginica,2
109,7.2,3.6,6.1,2.5,virginica,2
52,6.9,3.1,4.9,1.5,versicolor,1


In [4]:
# Cross-validate a logistic regression fitted without a penalty term
# (the model without a penalty showed the highest accuracy in report 3)
predictors, target = iris_data.data, iris_data.target

logistic_model = linear_model.LogisticRegression(penalty=None)
logR_cross_val = model_selection.cross_val_score(
    logistic_model,
    predictors,
    target,
    scoring='accuracy',
    cv=None,  # None -> default 5-fold cross validation
)

# One row per fold, kept as a DataFrame for the later comparison table
logR = pd.DataFrame(logR_cross_val)

print("Cross Validation of Logistic Regression without Penalty")
print(logR)

Cross Validation of Logistic Regression without Penalty
          0
0  1.000000
1  1.000000
2  0.933333
3  0.933333
4  1.000000


In [5]:
# Cross-validate a decision tree that splits on entropy
# (the entropy criterion showed the highest accuracy in report 2)
#
# random_state is pinned so the fitted trees, and therefore the fold scores,
# are the same on every run of the notebook.
dtree_cross_val = model_selection.cross_val_score(
    tree.DecisionTreeClassifier(criterion='entropy', random_state=0),
    predictors,
    target,
    cv=None,  # default -> 5-fold cross validation
    scoring='accuracy'
)
print("Cross Validation using Decision Tree with Entropy")
# One row per fold, kept as a DataFrame for the later comparison table
dtree = pd.DataFrame(dtree_cross_val)
print(dtree)

Cross Validation using Decision Tree with Entropy
          0
0  0.966667
1  0.966667
2  0.900000
3  0.966667
4  1.000000


In [6]:
# Cross-validate k-nearest neighbors with k=10 and euclidean distance
knn_model = neighbors.KNeighborsClassifier(n_neighbors=10, metric='euclidean')
knn_cross_val = model_selection.cross_val_score(
    knn_model,
    predictors,
    target,
    scoring='accuracy',
    cv=None,  # None -> default 5-fold cross validation
)

# One row per fold, kept as a DataFrame for the later comparison table
knn = pd.DataFrame(knn_cross_val)

print("Cross Validation of KNN where k=10 based on euclidean distance")
print(knn)

Cross Validation of KNN where k=10 based on euclidean distance
          0
0  0.966667
1  1.000000
2  1.000000
3  0.933333
4  1.000000


In [7]:
# Comparison of score means and standard deviation 
# create table comparing average and std of multiple dataframes
def comparison_table(dfs, columns):
    """Summarise several score frames as one mean / standard deviation table.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        One frame of scores per model. Each frame is reduced with ``mean()``
        and ``std()``; the two-row index assigned at the end means each frame
        must contain a single column.
    columns : iterable of str
        Label for each frame in ``dfs``, in the same order. Used as the
        column names of the result.

    Returns
    -------
    pandas.DataFrame
        Two rows, ``'Mean'`` and ``'Standard Dev.'``, and one column per
        entry in ``columns``.
    """
    # Materialise once so one-shot iterables (e.g. generators) can label both
    # intermediate frames.
    labels = list(columns)

    # Each df.mean() / df.std() is a Series; concatenating along axis=1 lines
    # the models up side by side as columns.
    mean_scores = pd.concat([df.mean() for df in dfs], axis=1)
    mean_scores.columns = labels

    std_scores = pd.concat([df.std() for df in dfs], axis=1)
    std_scores.columns = labels

    # Stack the mean row on top of the std row, then name the rows.
    comparison = pd.concat([mean_scores, std_scores], ignore_index=True)
    comparison.index = ['Mean', 'Standard Dev.']
    return comparison

# Put the per-fold scores of the three models side by side
model_scores = [logR, dtree, knn]
model_names = ['Logistic Regression', 'Decision Tree', 'K-Nearest Neighbor']
comparison_table(model_scores, model_names)

Unnamed: 0,Logistic Regression,Decision Tree,K-Nearest Neighbor
Mean,0.973333,0.96,0.98
Stardard Dev.,0.036515,0.036515,0.029814


In [8]:
# function that creates the comparison table for the logistic regression,
# decision tree and k-nearest-neighbor models on the iris data
def cross_val(numfolds):
    """Cross-validate the three iris classifiers and summarise their accuracy.

    Parameters
    ----------
    numfolds
        Passed unchanged as ``cv`` to ``model_selection.cross_val_score`` for
        every model.

    Returns
    -------
    pandas.DataFrame
        The result of ``comparison_table``: one column per model, holding the
        mean and the standard deviation of its per-fold accuracy.
    """
    predictors, target = iris_data.data, iris_data.target

    # Keyed by the column label used in the comparison table; insertion order
    # fixes the column order.
    estimators = {
        'Logistic Regression': linear_model.LogisticRegression(penalty=None),
        # random_state pinned so repeated calls give the same tree scores
        'Decision Tree': tree.DecisionTreeClassifier(
            criterion='entropy', random_state=0),
        'K-Nearest Neighbor': neighbors.KNeighborsClassifier(
            n_neighbors=10, metric='euclidean'),
    }

    # One single-column frame of fold accuracies per model
    fold_scores = [
        pd.DataFrame(
            model_selection.cross_val_score(
                estimator,
                predictors,
                target,
                cv=numfolds,
                scoring='accuracy',
            )
        )
        for estimator in estimators.values()
    ]

    return comparison_table(fold_scores, list(estimators))

In [9]:
# Compare the models across several fold counts: 6, 8, 10, 12 and 14
for fold_count in range(6, 16, 2):
    # fold count on its own line, followed by that run's comparison table
    print(fold_count, cross_val(numfolds=fold_count), sep="\n")

6
               Logistic Regression  Decision Tree  K-Nearest Neighbor
Mean                      0.973333       0.946667            0.966667
Stardard Dev.             0.032660       0.048442            0.046762
8
               Logistic Regression  Decision Tree  K-Nearest Neighbor
Mean                      0.980263       0.960526            0.966740
Stardard Dev.             0.039159       0.046653            0.039383
10
               Logistic Regression  Decision Tree  K-Nearest Neighbor
Mean                      0.980000       0.960000            0.966667
Stardard Dev.             0.044997       0.046614            0.047140
12
               Logistic Regression  Decision Tree  K-Nearest Neighbor
Mean                      0.966346       0.959936            0.965812
Stardard Dev.             0.081228       0.063993            0.055233
14
               Logistic Regression  Decision Tree  K-Nearest Neighbor
Mean                      0.980519       0.961039            0.966883
Stardar

In [19]:
# Show the 8-fold comparison on its own as a rendered table
n_folds = 8
cross_val(numfolds=n_folds)

Unnamed: 0,Logistic Regression,Decision Tree,K-Nearest Neighbor
Mean,0.980263,0.960526,0.96674
Stardard Dev.,0.039159,0.046653,0.039383
