# Exercise 4
## Lillian Mueller and Regina Hong 
Comparing logistic regression and decision tree classifiers on the iris dataset using cross-validation.

In [1]:
from sklearn import (
    linear_model, 
    preprocessing,
    model_selection,
    metrics, 
    tree)
from sklearn.datasets import load_iris
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the iris dataset that ships with scikit-learn
iris_data = load_iris()

# Put the measurements into a DataFrame for easier reading
df_iris = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)

# Add the target as a readable 'class' column.
# 0 = setosa, 1 = versicolor, 2 = virginica
#
# The mapped values are assigned back to the column explicitly. Calling
# .replace(..., inplace=True) on df_iris['class'] is chained assignment on an
# intermediate object: pandas warns about it, and with Copy-on-Write enabled
# the DataFrame is left unchanged.
df_iris['class'] = pd.Series(iris_data.target, index=df_iris.index).map(
    {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
)

# Create a separate integer-encoded column for the target (ready for processing)
label_obj = preprocessing.LabelEncoder()
df_iris['class_level'] = label_obj.fit_transform(df_iris['class'])

print("Full Dataset:")
df_iris.sample(4)

Full Dataset:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class,class_level
102,7.1,3.0,5.9,2.1,virginica,2
44,5.1,3.8,1.9,0.4,setosa,0
40,5.0,3.5,1.3,0.3,setosa,0
8,4.4,2.9,1.4,0.2,setosa,0


In [3]:
# Cross validation of the logistic model without a penalty term.
# The unpenalized model is used because it gave the highest accuracy in report 3.
predictors = iris_data.data
target = iris_data.target

logR_cross_val = model_selection.cross_val_score(
    estimator=linear_model.LogisticRegression(penalty=None),
    X=predictors,
    y=target,
    scoring='accuracy',
    cv=None,  # default -> 5-fold cross validation
)

# Keep the per-fold scores in a DataFrame so they can be summarised later
logR = pd.DataFrame(logR_cross_val)
print("Cross Validation of Logistic Regression without Penalty", logR, sep="\n")

Cross Validation of Logistic Regression without Penalty
          0
0  1.000000
1  1.000000
2  0.933333
3  0.933333
4  1.000000


In [4]:
# Cross validation of the decision tree using the entropy criterion.
# Entropy is used because it gave the highest accuracy in report 2.
dtree_cross_val = model_selection.cross_val_score(
    estimator=tree.DecisionTreeClassifier(criterion='entropy'),
    X=predictors,
    y=target,
    scoring='accuracy',
    cv=None,  # default -> 5-fold cross validation
)

# Keep the per-fold scores in a DataFrame so they can be summarised later
dtree = pd.DataFrame(dtree_cross_val)
print("Cross Validation using Decision Tree with Entropy", dtree, sep="\n")

Cross Validation using Decision Tree with Entropy
          0
0  0.966667
1  0.966667
2  0.900000
3  0.966667
4  1.000000


In [5]:
# Comparison of score means and standard deviation 
# create table comparing average and std of multiple dataframes
def comparison_table(dfs, columns):
    """Summarise several score DataFrames as one mean / standard-deviation table.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        One frame of scores per model. Each frame is reduced column-wise with
        ``mean()`` and ``std()``; the two-row index assigned at the end means
        each frame is expected to hold a single column.
    columns : sequence of str
        Label for each frame in ``dfs``, in the same order.

    Returns
    -------
    pandas.DataFrame
        Two rows ('Mean' and 'Stardard Dev.') and one column per label.
    """
    def _stat_row(stat_name):
        # Apply the named reduction to every frame and lay the results out
        # side by side, one labelled column per frame.
        row = pd.concat(
            [getattr(frame, stat_name)() for frame in dfs],
            axis=1,
        )
        row.columns = list(columns)
        return row

    # Stack the mean row on top of the standard deviation row
    summary = pd.concat([_stat_row('mean'), _stat_row('std')], ignore_index=True)
    summary.index = ['Mean', 'Stardard Dev.']
    return summary

# Summarise the per-fold accuracy scores of both models (mean and standard deviation)
comparison_table([logR, dtree], ['Logistic Regression', 'Decision Tree'])

In [6]:
# create function to perform manual cross validation
def perform_manual_cross_validation(dataset, feature_cols, target_col, model, numFolds,
                                    random_state=None):
    """Run a manual k-fold cross validation and return the accuracy of each fold.

    The rows of ``dataset`` are split into ``numFolds`` random, non-overlapping
    test folds. For each fold the model is fitted on all rows outside the fold
    and scored on the fold itself.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data containing both the feature columns and the target column. It is
        not modified. Rows are selected and dropped by index label, so the
        index labels must be unique.
    feature_cols : list of str
        Names of the predictor columns.
    target_col : str
        Name of the target column.
    model : estimator
        Object with ``fit(X, y)`` returning the fitted estimator and
        ``predict(X)``. The same object is refitted for every fold.
    numFolds : int
        Number of folds.
    random_state : int, numpy random state or None, optional
        Seed or random state used to draw the folds. ``None`` (the default)
        keeps the previous behaviour of using NumPy's global random state.

    Returns
    -------
    list of float
        Accuracy on the held-out fold, one entry per fold.
    """
    accuracy = []  # collect accuracy of each fold

    # An integer seed is turned into a single random stream shared by all folds;
    # passing the bare integer to every sample() call would reseed each draw.
    if isinstance(random_state, (int, np.integer)):
        sampler_state = np.random.RandomState(random_state)
    else:
        sampler_state = random_state

    # Rows not yet used as test data. This name is rebound to new frames rather
    # than mutated, so the caller's DataFrame is left untouched.
    remaining = dataset

    for i in range(numFolds):
        # Draw the test fold from the unused rows. Because the pool shrinks
        # every iteration, the sampled fraction grows so that each fold gets
        # an equal share of the original data.
        test_fold = remaining.sample(frac=(1 / (numFolds - i)), random_state=sampler_state)

        # Everything outside the test fold is used for training
        train_folds = dataset.drop(test_fold.index)

        # fit model
        mod = model.fit(train_folds[feature_cols], train_folds[target_col])

        # get accuracy of model on the held-out fold
        accuracy.append(
            metrics.accuracy_score(
                test_fold[target_col],
                mod.predict(test_fold[feature_cols]))
        )

        # keep track of unused data entries
        remaining = remaining.drop(test_fold.index)

    return accuracy


In [7]:
# Both models are evaluated on the same four measurement columns
iris_feature_cols = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# Cross Validation of Logistic Regression without Penalty
# (a copy of the data is passed so df_iris itself is never altered)
logR_accuracy = perform_manual_cross_validation(
    df_iris.copy(),
    iris_feature_cols,
    'class_level',
    linear_model.LogisticRegression(penalty=None),
    5,
)

# Cross Validation using Decision Tree with Entropy
dtree_accuracy = perform_manual_cross_validation(
    df_iris.copy(),
    iris_feature_cols,
    'class_level',
    tree.DecisionTreeClassifier(criterion='entropy'),
    5,
)

# One column of per-fold accuracies per model
accuracy = pd.DataFrame({
    'Logistic Regression': logR_accuracy,
    'Decision Tree': dtree_accuracy,
})

print("Experimenting with Manual Cross Validation")
print(accuracy)
for heading, summary in (
    ('\nAverage', accuracy.mean()),
    ('\nStandard Deviation', accuracy.std()),
):
    print(heading)
    print(summary)

Experimenting with Manual Cross Validation
   Logistic Regression  Decision Tree
0             1.000000       0.966667
1             0.966667       0.966667
2             0.966667       0.900000
3             1.000000       0.933333
4             0.966667       0.966667

Average
Logistic Regression    0.980000
Decision Tree          0.946667
dtype: float64

Standard Deviation
Logistic Regression    0.018257
Decision Tree          0.029814
dtype: float64


In [3]:
# function that creates comparison table for log and tree model for iris data
def cross_val(numfolds):
    """Compare logistic regression and a decision tree on iris with k-fold CV.

    Parameters
    ----------
    numfolds : int
        Passed as ``cv`` to ``model_selection.cross_val_score`` for both models.

    Returns
    -------
    pandas.DataFrame
        The table built by ``comparison_table`` from the per-fold accuracy
        scores, with one column per model.
    """
    features, labels = iris_data.data, iris_data.target

    # Same estimators as the earlier cells: unpenalized logistic regression
    # first, then an entropy-based decision tree.
    estimators = (
        linear_model.LogisticRegression(penalty=None),
        tree.DecisionTreeClassifier(criterion='entropy'),
    )

    # One single-column DataFrame of fold accuracies per estimator
    fold_scores = [
        pd.DataFrame(
            model_selection.cross_val_score(
                estimator,
                features,
                labels,
                cv=numfolds,
                scoring='accuracy',
            )
        )
        for estimator in estimators
    ]

    return comparison_table(fold_scores, ['Logistic Regression', 'Decision Tree'])

In [9]:
# compare both models using cross validation with 3 folds
cross_val(numfolds=3)

Unnamed: 0,Logistic Regression,Decision Tree
Mean,0.973333,0.953333
Stardard Dev.,0.023094,0.030551


In [10]:
# compare both models using cross validation with 7 folds
cross_val(numfolds=7)

Unnamed: 0,Logistic Regression,Decision Tree
Mean,0.959802,0.946506
Stardard Dev.,0.074425,0.050403


In [11]:
# compare both models using cross validation with 10 folds
cross_val(numfolds=10)

Unnamed: 0,Logistic Regression,Decision Tree
Mean,0.98,0.96
Stardard Dev.,0.044997,0.046614


In [6]:
# test cross validation with 15 folds
cross_val(15)

Unnamed: 0,Logistic Regression,Decision Tree
Mean,0.98,0.96
Stardard Dev.,0.041404,0.063246


In [7]:
# test cross validation with 50 folds
cross_val(50)

Unnamed: 0,Logistic Regression,Decision Tree
Mean,0.98,0.946667
Stardard Dev.,0.104545,0.123443


In [11]:
# test cross validation with 14 folds
cross_val(14)

Unnamed: 0,Logistic Regression,Decision Tree
Mean,0.980519,0.953896
Stardard Dev.,0.03871,0.047901
