# Exercise 3
## Lillian Mueller and Regina Hong 
Investigating linear regression for Iris Dataset

In [15]:
from sklearn import (
    linear_model, 
    preprocessing, 
    model_selection, 
    metrics, 
    tree)
from sklearn.datasets import load_iris
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [16]:
# loading in the iris dataset from sklearn
iris_data = load_iris()

# turning dataset into dataframe format for easier reading
df_iris = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)

df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [17]:
# adding the target as a class column and renaming numbers to class names
# 0 = setosa, 1 = versicolor, 2 = virginica

df_iris['class'] = iris_data.target.tolist()
df_iris['class'].replace({0: 'setosa', 1: 'versicolor', 2: 'virginica'},
                inplace=True)

#Create a new column for the target (to be ready for processing)
label_obj = preprocessing.LabelEncoder()
df_iris['class_level'] = label_obj.fit_transform(df_iris['class'])

df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class,class_level
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [32]:
# Creating Train and Test datasets
predictors, target = iris_data.data, iris_data.target
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    predictors, 
    target, 
    test_size = 0.33)

# create logistic regression to classify iris
logR = linear_model.LogisticRegression().fit(x_train, y_train)

# find coefficients 
B0 = logR.intercept_[0]
B1 = logR.coef_[0][0]
B2 = logR.coef_[0][1]
B3 = logR.coef_[0][2]
B4 = logR.coef_[0][3]
print('B0: ', B0)
print('B1: ', B1)
print('B2: ', B2)
print('B3: ', B3)
print('B4: ', B4)

#mu = -B0/B1
#s = 1/B1
#print('mu: ', mu)
#print('s: ', s)

B0:  9.25103654908234
B1:  -0.43987188985753245
B2:  0.791757898112843
B3:  -2.253980881949735
B4:  -0.9750798513425657


In [33]:
# Test model with testing dataset and find accuracy 
logR_pred = logR.predict(x_test)
accuracy = {
    'LogReg' : [
        metrics.accuracy_score(y_train, logR.predict(x_train)), 
        metrics.accuracy_score(y_test, logR_pred),
        metrics.r2_score(y_test, logR_pred)
    ]
}
print(accuracy['LogReg'])

[0.98, 0.94, 0.9053030303030303]


In [34]:
# create decision tree model using same training and test datasets 
dTree = tree.DecisionTreeClassifier(criterion='gini').fit(x_train, y_train)

# Test model with testing dataset and find accuracy 
dtree_pred = dTree.predict(x_test)
accuracy['DTree'] = [
        metrics.accuracy_score(y_train, dTree.predict(x_train)), 
        metrics.accuracy_score(y_test, dtree_pred),
        metrics.r2_score(y_test, dtree_pred)
    ]

print(accuracy['DTree'])

[1.0, 0.92, 0.8737373737373737]


In [35]:
# Compare the accuracy of each model 
pd.DataFrame.from_dict(
    accuracy,
    orient='index', 
    columns=[
        'Train Data Accuracy', 
        'Test Data Accuracy', 
        'r2 Score'
    ])

Unnamed: 0,Train Data Accuracy,Test Data Accuracy,r2 Score
LogReg,0.98,0.94,0.905303
DTree,1.0,0.92,0.873737


In [22]:
def run_dt(criterion, X_train, X_test, Y_train, Y_test):

    x_train, x_test, y_train, y_test = X_train, X_test, Y_train, Y_test

    dTree = tree.DecisionTreeClassifier(criterion=criterion).fit(x_train, y_train)
    dtree_pred = dTree.predict(x_test)
    
    return [ 
        metrics.accuracy_score(y_train, dTree.predict(x_train)), 
        metrics.accuracy_score(y_test, dtree_pred),
        metrics.r2_score(y_test, dtree_pred)
    ]

In [36]:
# set max_iter parameter to 1000 so that model woulod converge
def run_LR(penalty, X_train, X_test, Y_train, Y_test):

    x_train, x_test, y_train, y_test = X_train, X_test, Y_train, Y_test
    
    logR = linear_model.LogisticRegression(penalty=penalty,
                                          max_iter=1000).fit(x_train, y_train)
    logR_pred = logR.predict(x_test)
    
    # find coefficients 
    B0 = logR.intercept_[0]
    B1 = logR.coef_[0][0]    
    
    mu = -B0/B1
    s = 1/B1

    return [
        metrics.accuracy_score(y_train, logR.predict(x_train)), 
        metrics.accuracy_score(y_test, logR_pred),
        metrics.r2_score(y_test, logR_pred)
    ]

In [37]:
print('Run dt model with gini inpurity 1000 times')
dt_scores = []
for i in range(1000):
    dt_scores.append(run_dt('gini', x_train, x_test, y_train, y_test))

dt_score_df = pd.DataFrame(dt_scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(dt_score_df.describe().round(6))

Run dt model with gini inpurity 1000 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count               1000.0         1000.000000  1000.000000
mean                   1.0            0.930180     0.889804
std                    0.0            0.015447     0.024380
min                    1.0            0.920000     0.873737
25%                    1.0            0.920000     0.873737
50%                    1.0            0.920000     0.873737
75%                    1.0            0.940000     0.905303
max                    1.0            0.960000     0.936869


In [38]:
print('Run LR model with L2 penalty 1000 times')
LR_L2_scores = []
for i in range(1000):
    LR_L2_scores.append(run_LR('l2', x_train, x_test, y_train, y_test))

LR_L2_score_df = pd.DataFrame(LR_L2_scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(LR_L2_score_df.describe().round(6))

Run LR model with L2 penalty 1000 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count              1000.00             1000.00  1000.000000
mean                  0.98                0.94     0.905303
std                   0.00                0.00     0.000000
min                   0.98                0.94     0.905303
25%                   0.98                0.94     0.905303
50%                   0.98                0.94     0.905303
75%                   0.98                0.94     0.905303
max                   0.98                0.94     0.905303


In [39]:
print('Run LR model with No penalty 1000 times')
LR_N_scores = []
for i in range(1000):
    LR_N_scores.append(run_LR(None, x_train, x_test, y_train, y_test))

LR_N_score_df = pd.DataFrame(LR_N_scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(LR_N_score_df.describe().round(6))

Run LR model with No penalty 1000 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count               1000.0             1000.00  1000.000000
mean                   1.0                0.96     0.936869
std                    0.0                0.00     0.000000
min                    1.0                0.96     0.936869
25%                    1.0                0.96     0.936869
50%                    1.0                0.96     0.936869
75%                    1.0                0.96     0.936869
max                    1.0                0.96     0.936869


In [27]:
# create ranking of results of logistic regression model 
probRanks = pd.DataFrame(
    logR.predict_proba(predictors), 
    columns=[
        'Prob of setosa (0)', 
        'Prob of versicolor (1)', 
        'Prob of virginica (2)'
    ]
)
probRanks.sample(10).round(3)

Unnamed: 0,Prob of setosa (0),Prob of versicolor (1),Prob of virginica (2)
30,0.965,0.035,0.0
71,0.03,0.922,0.048
73,0.006,0.79,0.203
81,0.072,0.906,0.022
143,0.0,0.027,0.973
133,0.001,0.44,0.559
83,0.001,0.313,0.686
129,0.0,0.189,0.811
42,0.988,0.012,0.0
79,0.106,0.885,0.009


In [40]:
# rank two new records 
records = [
    [5.8,2.8,5.1,2.4],
    [6.0,2.2,4.0,1.0]
]
newRecords = pd.DataFrame(
    records, 
    columns=[
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)'
    ]
)

# making prediction with model 
newRecords['Prediction'] = logR.predict(records)

# adding rankings to dataframe
rankProbs = pd.DataFrame(
    logR.predict_proba(records),
    columns=[
        'Prob of setosa (0)', 
        'Prob of versicolor (1)', 
        'Prob of virginica (2)'
        ]
    )
newRecords[rankProbs.columns] = rankProbs

newRecords.T

Unnamed: 0,0,1
sepal length (cm),5.8,6.0
sepal width (cm),2.8,2.2
petal length (cm),5.1,4.0
petal width (cm),2.4,1.0
Prediction,2.0,1.0
Prob of setosa (0),0.000166,0.012343
Prob of versicolor (1),0.07679,0.966225
Prob of virginica (2),0.923044,0.021431
