# SECTION 10, Machine Learning

In [1]:
import random

import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('default')

import seaborn as sns
import numpy as np
import re
from matplotlib.animation import FuncAnimation
from sklearn.datasets import fetch_openml

dataset = fetch_openml(name='boston', version=1, as_frame=True, return_X_y=False, parser='pandas')

In [2]:
data = dataset['data']
target = dataset['target']
columns = dataset['feature_names']

dataframe = pd.DataFrame(data)
dataframe.columns = columns
dataframe['price'] = dataset['target']

rm = dataframe['RM']
lstat = dataframe['LSTAT']

In [3]:
greater_then_most = np.percentile(dataframe['price'], 66)
dataframe['expensive'] = dataframe['price'].apply(lambda p: int(p > greater_then_most))
expensive = dataframe['expensive']

In [4]:
def logistic(x):
    return 1 / (1 + np.exp(-x))

def model(x, w, b):
    return logistic(np.dot(x, w.T) + b)

def loss(yhat, y):
    return -np.sum(y * np.log(yhat) + (1 - y) * np.log(1 - yhat))

def partial_w(x, y, yhat):
    return np.array([np.sum((yhat - y) * x[0]), np.sum((yhat - y) * x[1])])

def partial_b(x, y, yhat):
    return np.sum((yhat - y))

In [5]:
import pickle

In [6]:
def train(model_to_be_train, target, loss, pw, pb):
    w = np.random.random_sample((1, 2))
    b = 0

    learning_rate = 1e-5
    epoch = 200
    losses = []

    history_k_b_loss = []

    for i in range(epoch):
        batch_loss = []
        for batch in range(len(rm)):
            index = random.choice(range(len(rm)))

            x = np.array([rm[index], lstat[index]])
            y = expensive[index]

            yhat = model_to_be_train(x, w, b)
            loss_v = loss(yhat, y)

            w = w + -1 * partial_w(x, y, yhat) * learning_rate
            b = b + -1 * partial_b(x, y, yhat) * learning_rate

            batch_loss.append(loss_v)
            history_k_b_loss.append((w, b, loss_v))

        #     if batch % 100 == 0:
        #         print('Epoch: {}, Batch: {}, loss:{}'.format(i, batch, loss_v))
        losses.append(np.mean(batch_loss))
    return model_to_be_train, w, b, losses

In [7]:
model, w, b, losses = train(model, target, loss, partial_w, partial_b)

with open('logistic_regression.model', 'wb') as f:
    pickle.dump(model, f)

with open('w.model', 'wb') as f:
    pickle.dump(w, f)

with open('b.model', 'wb') as f:
    pickle.dump(b, f)

print('pickle write finished')

pickle write finished


In [8]:
import pickle

with open('logistic_regression.model', 'rb') as f:
    model_r = pickle.load(f)

with open('w.model', 'rb') as f:
    w_r = pickle.load(f)

with open('b.model', 'rb') as f:
    b_r = pickle.load(f)

print('pickle read finished')

pickle read finished


In [9]:
true_labels, predicated_labels, loss_labels = [], [], []

In [10]:
random_test_indices = np.random.choice(range(len(rm)), size=100)
decision_boundary = 0.5

for i in random_test_indices:
    x1, x2, y = rm[i], lstat[i], expensive[i]
    predicate = model_r(np.array([x1, x2]), w_r, b_r)
    loss_labels.append(predicate)
    predicate_label = int(predicate > decision_boundary)

    print('RM:{}, LSTAT:{}, EXPENSIVE:{}, Predicated:{}, loss_labels'.format(x1, x2, y, predicate_label), predicate)

    true_labels.append(y)
    predicated_labels.append(predicate_label)

RM:6.121, LSTAT:8.44, EXPENSIVE:0, Predicated:0, loss_labels [0.45837434]
RM:5.613, LSTAT:27.26, EXPENSIVE:0, Predicated:0, loss_labels [0.00124604]
RM:7.014, LSTAT:14.79, EXPENSIVE:1, Predicated:0, loss_labels [0.13064883]
RM:6.31, LSTAT:6.75, EXPENSIVE:0, Predicated:1, loss_labels [0.61823611]
RM:6.619, LSTAT:7.22, EXPENSIVE:1, Predicated:1, loss_labels [0.61342378]
RM:6.461, LSTAT:18.05, EXPENSIVE:0, Predicated:0, loss_labels [0.03803319]
RM:5.454, LSTAT:18.06, EXPENSIVE:0, Predicated:0, loss_labels [0.02460551]
RM:6.254, LSTAT:10.45, EXPENSIVE:0, Predicated:0, loss_labels [0.31426398]
RM:5.52, LSTAT:24.56, EXPENSIVE:0, Predicated:0, loss_labels [0.00294508]
RM:6.193, LSTAT:15.17, EXPENSIVE:0, Predicated:0, loss_labels [0.08424984]
RM:6.137, LSTAT:13.44, EXPENSIVE:0, Predicated:0, loss_labels [0.13798434]
RM:5.349, LSTAT:19.77, EXPENSIVE:0, Predicated:0, loss_labels [0.01340921]
RM:6.43, LSTAT:5.21, EXPENSIVE:1, Predicated:1, loss_labels [0.74083149]
RM:5.57, LSTAT:21.02, EXPENSIVE:

In [11]:
def accuracy(ytrues, yhats):
    return sum(1 for yt, y1 in zip(ytrues, yhats) if yt == y1) / len(ytrues)

In [12]:
accuracy(true_labels, predicated_labels)

0.85

In [13]:
def precision(ytrues, yhats):
    # What is the correct percentage of predictions where the label is 1

    positives_pred = [y for y in yhats if y == 1]
    return sum(1 for yt, y in zip(ytrues, yhats) if yt == y and y == 1) / len(positives_pred)

In [14]:
precision(true_labels, predicated_labels)

0.8888888888888888

In [15]:
def recall(ytrues, yhats):
    
    true_positive = [y for y in ytrues if y == 1]     
    return sum(1 for yt, y in zip(ytrues, yhats) if yt == y and yt == 1) / len(true_positive)

In [16]:
recall(true_labels, predicated_labels)

0.7441860465116279

In [17]:
people = [0] * 90 + [1] * 10

In [18]:
import random
random.shuffle(people)

In [19]:
a = [0] * 100
b = [1] * 100

In [20]:
accuracy(people, a)

0.9

In [21]:
precision(people, a)

ZeroDivisionError: division by zero

In [22]:
recall(people, a)

0.0

In [23]:
accuracy(people, b)

0.1

In [24]:
precision(people, b)

0.1

In [25]:
recall(people, b)

1.0

In [26]:
boston_model = [true_labels, predicated_labels, loss_labels]

In [27]:
with open('boston_labels', 'wb') as f:
    pickle.dump(boston_model, f)

In [28]:
len(loss_labels)

100

In [29]:
print(len(true_labels), len(predicated_labels), len(loss_labels))

100 100 100
