## Iris Classification with Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from models import linear_model, logistic_model, log_cost, log_cost_dev, gd_update
from models import binary_confusion_matrix, std_normalize, binary_accuracy, create_parameters, data_normalize
from sklearn.model_selection import train_test_split

%matplotlib inline

#### 1) Prepare data

In [2]:
# Read the Iris table from the CSV that sits next to this notebook,
# then print a summary of its columns, dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer='./iris.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
Id               150 non-null int64
SepalLengthCm    150 non-null float64
SepalWidthCm     150 non-null float64
PetalLengthCm    150 non-null float64
PetalWidthCm     150 non-null float64
Species          150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.1+ KB


In [3]:
# Binary target: 1.0 for rows whose Species is 'Iris-setosa', 0.0 for every other row.
# The vectorized comparison produces the same float column as a per-row lambda
# without calling Python code once per row.
df['IsSetosa'] = df['Species'].eq('Iris-setosa').astype(float)

# Keep only the four measurements and the target; Id and Species are not selected.
data = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'IsSetosa']]
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,IsSetosa
0,5.1,3.5,1.4,0.2,1.0
1,4.9,3.0,1.4,0.2,1.0
2,4.7,3.2,1.3,0.2,1.0
3,4.6,3.1,1.5,0.2,1.0
4,5.0,3.6,1.4,0.2,1.0


In [4]:
# Hold out 20% of the rows for testing. The seed is fixed so the partition is the
# same on every run; without it each Restart & Run All shuffles differently and the
# metrics printed below cannot be reproduced.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Split the training frame into a feature matrix and a label column.
# Selecting ['IsSetosa'] with a list keeps the labels two-dimensional (one column).
train_X = np.array(train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']])
train_y = np.array(train[['IsSetosa']])

In [5]:
# Obtain the normalization statistics of the training features; the same two values
# are passed to data_normalize later when the test features are prepared.
# NOTE(review): train_X is not reassigned here, yet it is fed to the model afterwards,
# so std_normalize presumably rescales train_X in place — confirm against models.std_normalize.
train_stds, train_means = std_normalize(train_X)

#### 2) Train

In [6]:
# Dimensions of the training matrix: columns are features, rows are samples.
feature_size = train_X.shape[1]
sample_count = train_X.shape[0]

# Initial weights and bias for a model with `feature_size` inputs.
W, b = create_parameters(feature_size)

# Hyper-parameters, gathered in one place so they are easy to tune.
threshold = 0.5   # cut-off handed to binary_confusion_matrix
lr = 0.01         # gradient-descent step size
n_epochs = 1000   # number of full-batch update steps
log_every = 100   # report progress once per this many epochs

# Full-batch gradient descent. h, dW and db are recomputed from the current
# parameters at the top of every iteration, so no forward or gradient pass is
# needed before the loop starts.
for epoch in range(n_epochs):
    h = logistic_model(train_X, W, b)
    dW, db = log_cost_dev(train_X, train_y, h)
    W = W - lr * dW
    b = b - lr * db
    if (epoch + 1) % log_every == 0:
        # h was produced before this iteration's update, so the figures logged
        # here describe the parameters as they were one step earlier.
        cur_cost = log_cost(h, train_y)
        conf = binary_confusion_matrix(h, train_y, threshold=threshold)
        print('epoch: {0}, cost: {1}, conf: {2}'.format(epoch + 1, cur_cost, conf))

# Re-evaluate on the training set with the final parameters.
predictions = logistic_model(train_X, W, b)
final_cost = log_cost(predictions, train_y)
conf = binary_confusion_matrix(predictions, train_y, threshold=threshold)
print('training finished!')
print('final cost: {0}, conf: {1}'.format(final_cost, conf))

epoch: 100, cost: 0.9730710905666152, conf: (0.22857142857142856, 0.42105263157894735, 0.2962962962962963)
epoch: 200, cost: 0.45095829165851076, conf: (0.6491228070175439, 0.9736842105263158, 0.7789473684210527)
epoch: 300, cost: 0.2744513319960485, conf: (0.9743589743589743, 1.0, 0.9870129870129869)
epoch: 400, cost: 0.1976678088773042, conf: (1.0, 1.0, 1.0)
epoch: 500, cost: 0.15604811493152132, conf: (1.0, 1.0, 1.0)
epoch: 600, cost: 0.1299998103020092, conf: (1.0, 1.0, 1.0)
epoch: 700, cost: 0.11208651805451761, conf: (1.0, 1.0, 1.0)
epoch: 800, cost: 0.09894570767064578, conf: (1.0, 1.0, 1.0)
epoch: 900, cost: 0.0888485498939796, conf: (1.0, 1.0, 1.0)
epoch: 1000, cost: 0.08081701480550692, conf: (1.0, 1.0, 1.0)
training finished!
final cost: 0.08074489374800274, conf: (1.0, 1.0, 1.0)


#### 3) Evaluate on test data

In [7]:
# Pull the held-out features and labels out of the test frame as NumPy arrays,
# using the same columns as the training split.
test_X = np.array(test[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']])
test_y = np.array(test[['IsSetosa']])
# Normalize the test features with the statistics obtained from the training split.
# NOTE(review): the return value is discarded, so this line only has an effect if
# data_normalize modifies test_X in place — confirm against models.data_normalize.
data_normalize(test_X, train_stds, train_means)

In [8]:
# Forward pass over the held-out samples with the trained parameters.
test_h = logistic_model(test_X, W, b)

# Score the held-out predictions with the same loss and decision threshold
# that were used while training, then report both figures.
test_cost, test_conf = (
    log_cost(test_h, test_y),
    binary_confusion_matrix(test_h, test_y, threshold=threshold),
)
print('test cost: {0}, conf: {1}'.format(test_cost, test_conf))

test cost: 0.08399940482340754, conf: (1.0, 1.0, 1.0)
