# Exercise 3
## Lillian Mueller and Regina Hong 
Investigating linear regression for Iris Dataset

In [1]:
from sklearn import (
    linear_model, 
    preprocessing, 
    model_selection, 
    metrics, 
    tree)
from sklearn.datasets import load_iris
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# loading in the iris dataset from sklearn
iris_data = load_iris()

# turning dataset into dataframe format for easier reading
df_iris = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)

df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# adding the target as a class column and renaming numbers to class names
# 0 = setosa, 1 = versicolor, 2 = virginica

df_iris['class'] = iris_data.target.tolist()
df_iris['class'].replace({0: 'setosa', 1: 'versicolor', 2: 'virginica'},
                inplace=True)

#Create a new column for the target (to be ready for processing)
label_obj = preprocessing.LabelEncoder()
df_iris['class_level'] = label_obj.fit_transform(df_iris['class'])

df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class,class_level
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [4]:
# Creating Train and Test datasets
predictors, target = iris_data.data, iris_data.target
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    predictors, 
    target, 
    test_size = 0.33)

# create logistic regression to classify iris
logR = linear_model.LogisticRegression().fit(x_train, y_train)

# find coefficients 
B0 = logR.intercept_[0]
B1 = logR.coef_[0][0]
print('B0: ', B0)
print('B1: ', B1)


mu = -B0/B1
s = 1/B1
print('mu: ', mu)
print('s: ', s)

B0:  10.02054032505232
B1:  -0.5238154000211254
mu:  19.12990783518047
s:  -1.909069492725243


In [5]:
# Test model with testing dataset and find accuracy 
logR_pred = logR.predict(x_test)
accuracy = {
    'LogReg' : [
        metrics.accuracy_score(y_train, logR.predict(x_train)), 
        metrics.accuracy_score(y_test, logR_pred),
        metrics.r2_score(y_test, logR_pred)
    ]
}
print(accuracy['LogReg'])

[0.96, 0.92, 0.8945147679324894]


In [6]:
# create decision tree model using same training and test datasets 
dTree = tree.DecisionTreeClassifier(criterion='gini').fit(x_train, y_train)

# Test model with testing dataset and find accuracy 
dtree_pred = dTree.predict(x_test)
accuracy['DTree'] = [
        metrics.accuracy_score(y_train, dTree.predict(x_train)), 
        metrics.accuracy_score(y_test, dtree_pred),
        metrics.r2_score(y_test, dtree_pred)
    ]

print(accuracy['DTree'])

[1.0, 0.88, 0.8417721518987342]


In [7]:
# Compare the accuracy of each model 
pd.DataFrame.from_dict(
    accuracy,
    orient='index', 
    columns=[
        'Train Data Accuracy', 
        'Test Data Accuracy', 
        'r2 Score'
    ])

Unnamed: 0,Train Data Accuracy,Test Data Accuracy,r2 Score
LogReg,0.96,0.92,0.894515
DTree,1.0,0.88,0.841772


In [8]:
def run_dt(criterion, X_train, X_test, Y_train, Y_test):

    x_train, x_test, y_train, y_test = X_train, X_test, Y_train, Y_test

    dTree = tree.DecisionTreeClassifier(criterion=criterion).fit(x_train, y_train)
    dtree_pred = dTree.predict(x_test)
    
    return [ 
        metrics.accuracy_score(y_train, dTree.predict(x_train)), 
        metrics.accuracy_score(y_test, dtree_pred),
        metrics.r2_score(y_test, dtree_pred)
    ]

In [9]:
# set max_iter parameter to 1000 so that model woulod converge
def run_LR(penalty, X_train, X_test, Y_train, Y_test):

    x_train, x_test, y_train, y_test = X_train, X_test, Y_train, Y_test
    
    logR = linear_model.LogisticRegression(penalty=penalty,
                                          max_iter=1000).fit(x_train, y_train)
    
    # find coefficients 
    B0 = logR.intercept_[0]
    B1 = logR.coef_[0][0]    
    
    mu = -B0/B1
    s = 1/B1

    return [
        metrics.accuracy_score(y_train, logR.predict(x_train)), 
        metrics.accuracy_score(y_test, logR_pred),
        metrics.r2_score(y_test, logR_pred)
    ]

In [10]:
print('Run dt model with gini inpurity 1000 times')
dt_scores = []
for i in range(1000):
    dt_scores.append(run_dt('gini', x_train, x_test, y_train, y_test))

dt_score_df = pd.DataFrame(dt_scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(dt_score_df.describe().round(6))

Run dt model with gini inpurity 1000 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count               1000.0         1000.000000  1000.000000
mean                   1.0            0.897320     0.864610
std                    0.0            0.012303     0.016223
min                    1.0            0.880000     0.841772
25%                    1.0            0.880000     0.841772
50%                    1.0            0.900000     0.868143
75%                    1.0            0.900000     0.868143
max                    1.0            0.920000     0.894515


In [11]:
print('Run LR model with L2 penalty 1000 times')
LR_L2_scores = []
for i in range(1000):
    LR_L2_scores.append(run_LR('l2', x_train, x_test, y_train, y_test))

LR_L2_score_df = pd.DataFrame(LR_L2_scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(LR_L2_score_df.describe().round(6))

Run LR model with L2 penalty 1000 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count              1000.00             1000.00  1000.000000
mean                  0.96                0.92     0.894515
std                   0.00                0.00     0.000000
min                   0.96                0.92     0.894515
25%                   0.96                0.92     0.894515
50%                   0.96                0.92     0.894515
75%                   0.96                0.92     0.894515
max                   0.96                0.92     0.894515


In [12]:
print('Run LR model with No penalty 1000 times')
LR_N_scores = []
for i in range(1000):
    LR_N_scores.append(run_LR(None, x_train, x_test, y_train, y_test))

LR_N_score_df = pd.DataFrame(LR_N_scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(LR_N_score_df.describe().round(6))

Run LR model with No penalty 1000 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count              1000.00             1000.00  1000.000000
mean                  0.98                0.92     0.894515
std                   0.00                0.00     0.000000
min                   0.98                0.92     0.894515
25%                   0.98                0.92     0.894515
50%                   0.98                0.92     0.894515
75%                   0.98                0.92     0.894515
max                   0.98                0.92     0.894515


In [13]:
# create ranking of results of logistic regression model 
probRanks = pd.DataFrame(
    logR.predict_proba(predictors), 
    columns=[
        'Prob of setosa (0)', 
        'Prob of versicolor (1)', 
        'Prob of virginica (2)'
    ]
)
probRanks.sample(10).round(3)

Unnamed: 0,Prob of setosa (0),Prob of versicolor (1),Prob of virginica (2)
123,0.001,0.52,0.479
12,0.965,0.035,0.0
1,0.961,0.039,0.0
53,0.02,0.951,0.029
128,0.0,0.081,0.919
59,0.047,0.925,0.028
77,0.001,0.517,0.482
58,0.004,0.877,0.119
34,0.957,0.043,0.0
61,0.019,0.911,0.071


In [14]:
# rank two new records 
records = [
    [5.8,2.8,5.1,2.4],
    [6.0,2.2,4.0,1.0]
]
newRecords = pd.DataFrame(
    records, 
    columns=[
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)'
    ]
)

# making prediction with model 
newRecords['Prediction'] = logR.predict(records)

# adding rankings to dataframe
rankProbs = pd.DataFrame(
    logR.predict_proba(records),
    columns=[
        'Prob of setosa (0)', 
        'Prob of versicolor (1)', 
        'Prob of virginica (2)'
        ]
    )
newRecords[rankProbs.columns] = rankProbs

newRecords

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Prediction,Prob of setosa (0),Prob of versicolor (1),Prob of virginica (2)
0,5.8,2.8,5.1,2.4,2,0.000287,0.140913,0.8588
1,6.0,2.2,4.0,1.0,1,0.013091,0.97222,0.014688
