# 10. 机器学习

In [33]:
import random

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from matplotlib.animation import FuncAnimation
from sklearn.datasets import fetch_openml

dataset = fetch_openml(name='boston', version=1, as_frame=True, return_X_y=False, parser='pandas')

In [34]:
data = dataset['data']
target = dataset['target']
columns = dataset['feature_names']

dataframe = pd.DataFrame(data)
dataframe.columns = columns
dataframe['price'] = dataset['target']

rm = dataframe['RM']
lstat = dataframe['LSTAT']

In [35]:
greater_then_most = np.percentile(dataframe['price'], 66)
dataframe['expensive'] = dataframe['price'].apply(lambda p: int(p > greater_then_most))
expensive = dataframe['expensive']

In [23]:
def logistic(x):
    return 1 / (1 + np.exp(-x))

def model(x, w, b):
    return logistic(np.dot(x, w.T) + b)

def loss(yhat, y):
    return -np.sum(y * np.log(yhat) + (1 - y) * np.log(1 - yhat))

def partial_w(x, y, yhat):
    return np.array([np.sum((yhat - y) * x[0]), np.sum((yhat - y) * x[1])])

def partial_b(x, y, yhat):
    return np.sum((yhat - y))

In [24]:
import pickle

In [25]:
def train(model_to_be_train, target, loss, pw, pb):
    w = np.random.random_sample((1, 2))
    b = 0

    learning_rate = 1e-5
    epoch = 200
    losses = []

    history_k_b_loss = []

    for i in range(epoch):
        batch_loss = []
        for batch in range(len(rm)):
            index = random.choice(range(len(rm)))

            x = np.array([rm[index], lstat[index]])
            y = expensive[index]

            yhat = model_to_be_train(x, w, b)
            loss_v = loss(yhat, y)

            w = w + -1 * partial_w(x, y, yhat) * learning_rate
            b = b + -1 * partial_b(x, y, yhat) * learning_rate

            batch_loss.append(loss_v)
            history_k_b_loss.append((w, b, loss_v))

        #     if batch % 100 == 0:
        #         print('Epoch: {}, Batch: {}, loss:{}'.format(i, batch, loss_v))
        # losses.append(np.mean(batch_loss))
    return model_to_be_train, w, b, losses

In [18]:
model, w, b, losses = train(model, target, loss, partial_w, partial_b)

with open('logistic_regression.model', 'wb') as f:
    pickle.dump(model, f)

with open('w.model', 'wb') as f:
    pickle.dump(w, f)

with open('b.model', 'wb') as f:
    pickle.dump(b, f)

print('pickle write finished')

pickle write finished


In [31]:
import pickle

with open('logistic_regression.model', 'rb') as f:
    model_r = pickle.load(f)

with open('w.model', 'rb') as f:
    w_r = pickle.load(f)

with open('b.model', 'rb') as f:
    b_r = pickle.load(f)

print('pickle read finished')

pickle read finished


In [40]:
true_labels, predicated_labels = [], []

In [41]:
random_test_indices = np.random.choice(range(len(rm)), size=100)
decision_boundary = 0.5

for i in random_test_indices:
    x1, x2, y = rm[i], lstat[i], expensive[i]
    predicate = model_r(np.array([x1, x2]), w_r, b_r)
    predicate_label = int(predicate > decision_boundary)

    print('RM:{}, LSTAT:{}, EXPENSIVE:{}, Predicated:{}'.format(x1, x2, y, predicate_label))

    true_labels.append(y)
    predicated_labels.append(predicate_label)

RM:6.975, LSTAT:4.56, EXPENSIVE:1, Predicated:1
RM:6.968, LSTAT:17.21, EXPENSIVE:0, Predicated:0
RM:6.164, LSTAT:21.46, EXPENSIVE:0, Predicated:0
RM:5.594, LSTAT:13.09, EXPENSIVE:0, Predicated:0
RM:6.072, LSTAT:13.04, EXPENSIVE:0, Predicated:0
RM:7.52, LSTAT:7.26, EXPENSIVE:1, Predicated:1
RM:5.966, LSTAT:14.44, EXPENSIVE:0, Predicated:0
RM:6.405, LSTAT:8.2, EXPENSIVE:0, Predicated:0
RM:6.854, LSTAT:2.98, EXPENSIVE:1, Predicated:1
RM:5.782, LSTAT:15.94, EXPENSIVE:0, Predicated:0
RM:6.75, LSTAT:7.74, EXPENSIVE:1, Predicated:1
RM:6.826, LSTAT:4.16, EXPENSIVE:1, Predicated:1
RM:6.563, LSTAT:5.68, EXPENSIVE:1, Predicated:1
RM:7.82, LSTAT:3.57, EXPENSIVE:1, Predicated:1
RM:6.459, LSTAT:23.98, EXPENSIVE:0, Predicated:0
RM:7.875, LSTAT:2.97, EXPENSIVE:1, Predicated:1
RM:5.708, LSTAT:11.74, EXPENSIVE:0, Predicated:0
RM:6.072, LSTAT:13.04, EXPENSIVE:0, Predicated:0
RM:4.138, LSTAT:37.97, EXPENSIVE:0, Predicated:0
RM:6.212, LSTAT:17.6, EXPENSIVE:0, Predicated:0
RM:6.162, LSTAT:7.43, EXPENSIVE:1,

  predicate_label = int(predicate > decision_boundary)


In [82]:
def accuracy(ytrues, yhats):
    return sum(1 for yt, y1 in zip(ytrues, yhats) if yt == y1) / len(ytrues)

In [83]:
accuracy(true_labels, predicated_labels)

0.89

In [61]:
def precision(ytrues, yhats):
    # 预测标签是1的里面，正确的比例是多少

    positives_pred = [y for y in yhats if y == 1]
    return sum(1 for yt, y in zip(ytrues, yhats) if yt == y and y == 1) / len(positives_pred)

In [62]:
precision(true_labels, predicated_labels)

0.8333333333333334

In [63]:
def recall(ytrues, yhats):
    
    true_positive = [y for y in ytrues if y == 1]     
    return sum(1 for yt, y in zip(ytrues, yhats) if yt == y and yt == 1) / len(true_positive)

In [64]:
recall(true_labels, predicated_labels)

0.8064516129032258

In [84]:
people = [0] * 90 + [1] * 10

In [85]:
import random
random.shuffle(people)

In [86]:
a = [0] * 100
b = [1] * 100

In [87]:
accuracy(people, a)

0.9

In [89]:
precision(people, a)

ZeroDivisionError: division by zero

In [90]:
recall(people, a)

0.0

In [91]:
accuracy(people, b)

0.1

In [92]:
precision(people, b)

0.1

In [93]:
recall(people, b)

1.0