# 10. 机器学习

In [1]:
import random

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from matplotlib.animation import FuncAnimation
from sklearn.datasets import fetch_openml

dataset = fetch_openml(name='boston', version=1, as_frame=True, return_X_y=False, parser='pandas')

In [2]:
data = dataset['data']
target = dataset['target']
columns = dataset['feature_names']

dataframe = pd.DataFrame(data)
dataframe.columns = columns
dataframe['price'] = dataset['target']

rm = dataframe['RM']
lstat = dataframe['LSTAT']

In [3]:
greater_then_most = np.percentile(dataframe['price'], 66)
dataframe['expensive'] = dataframe['price'].apply(lambda p: int(p > greater_then_most))
expensive = dataframe['expensive']

In [4]:
def logistic(x):
    return 1 / (1 + np.exp(-x))

def model(x, w, b):
    return logistic(np.dot(x, w.T) + b)

def loss(yhat, y):
    return -np.sum(y * np.log(yhat) + (1 - y) * np.log(1 - yhat))

def partial_w(x, y, yhat):
    return np.array([np.sum((yhat - y) * x[0]), np.sum((yhat - y) * x[1])])

def partial_b(x, y, yhat):
    return np.sum((yhat - y))

In [5]:
import pickle

In [6]:
def train(model_to_be_train, target, loss, pw, pb):
    w = np.random.random_sample((1, 2))
    b = 0

    learning_rate = 1e-5
    epoch = 200
    losses = []

    history_k_b_loss = []

    for i in range(epoch):
        batch_loss = []
        for batch in range(len(rm)):
            index = random.choice(range(len(rm)))

            x = np.array([rm[index], lstat[index]])
            y = expensive[index]

            yhat = model_to_be_train(x, w, b)
            loss_v = loss(yhat, y)

            w = w + -1 * partial_w(x, y, yhat) * learning_rate
            b = b + -1 * partial_b(x, y, yhat) * learning_rate

            batch_loss.append(loss_v)
            history_k_b_loss.append((w, b, loss_v))

        #     if batch % 100 == 0:
        #         print('Epoch: {}, Batch: {}, loss:{}'.format(i, batch, loss_v))
        losses.append(np.mean(batch_loss))
    return model_to_be_train, w, b, losses

In [7]:
model, w, b, losses = train(model, target, loss, partial_w, partial_b)

with open('logistic_regression.model', 'wb') as f:
    pickle.dump(model, f)

with open('w.model', 'wb') as f:
    pickle.dump(w, f)

with open('b.model', 'wb') as f:
    pickle.dump(b, f)

print('pickle write finished')

pickle write finished


In [8]:
import pickle

with open('logistic_regression.model', 'rb') as f:
    model_r = pickle.load(f)

with open('w.model', 'rb') as f:
    w_r = pickle.load(f)

with open('b.model', 'rb') as f:
    b_r = pickle.load(f)

print('pickle read finished')

pickle read finished


In [9]:
true_labels, predicated_labels, loss_labels = [], [], []

In [10]:
random_test_indices = np.random.choice(range(len(rm)), size=100)
decision_boundary = 0.5

for i in random_test_indices:
    x1, x2, y = rm[i], lstat[i], expensive[i]
    predicate = model_r(np.array([x1, x2]), w_r, b_r)
    loss_labels.append(predicate)
    predicate_label = int(predicate > decision_boundary)

    print('RM:{}, LSTAT:{}, EXPENSIVE:{}, Predicated:{}, loss_labels'.format(x1, x2, y, predicate_label), predicate)

    true_labels.append(y)
    predicated_labels.append(predicate_label)

RM:4.97, LSTAT:3.26, EXPENSIVE:1, Predicated:1, loss_labels [0.71280115]
RM:4.903, LSTAT:29.29, EXPENSIVE:0, Predicated:0, loss_labels [0.00089803]
RM:6.635, LSTAT:5.99, EXPENSIVE:1, Predicated:1, loss_labels [0.67309195]
RM:6.635, LSTAT:5.99, EXPENSIVE:1, Predicated:1, loss_labels [0.67309195]
RM:8.704, LSTAT:5.12, EXPENSIVE:1, Predicated:1, loss_labels [0.85610186]
RM:6.38, LSTAT:23.69, EXPENSIVE:0, Predicated:0, loss_labels [0.00860896]
RM:6.482, LSTAT:7.19, EXPENSIVE:1, Predicated:1, loss_labels [0.57423992]
RM:6.153, LSTAT:13.15, EXPENSIVE:1, Predicated:0, loss_labels [0.16302723]
RM:5.85, LSTAT:8.77, EXPENSIVE:0, Predicated:0, loss_labels [0.39563235]
RM:7.007, LSTAT:5.5, EXPENSIVE:1, Predicated:1, loss_labels [0.73383951]
RM:6.593, LSTAT:9.67, EXPENSIVE:0, Predicated:0, loss_labels [0.39879034]
RM:5.961, LSTAT:9.88, EXPENSIVE:0, Predicated:0, loss_labels [0.32789702]
RM:6.108, LSTAT:6.57, EXPENSIVE:0, Predicated:1, loss_labels [0.58496574]
RM:5.822, LSTAT:15.03, EXPENSIVE:0, Pre

  predicate_label = int(predicate > decision_boundary)


In [11]:
def accuracy(ytrues, yhats):
    return sum(1 for yt, y1 in zip(ytrues, yhats) if yt == y1) / len(ytrues)

In [12]:
accuracy(true_labels, predicated_labels)

0.84

In [24]:
def precision(ytrues, yhats):
    # 预测标签是 1 的里面，正确的比例是多少

    positives_pred = [y for y in yhats if y == 1]
    if len(positives_pred) == 0:
        return 0.0
    return sum(1 for yt, y in zip(ytrues, yhats) if yt == y and y == 1) / len(positives_pred)

In [14]:
precision(true_labels, predicated_labels)

0.8108108108108109

In [15]:
def recall(ytrues, yhats):
    
    true_positive = [y for y in ytrues if y == 1]     
    return sum(1 for yt, y in zip(ytrues, yhats) if yt == y and yt == 1) / len(true_positive)

In [16]:
recall(true_labels, predicated_labels)

0.7692307692307693

In [17]:
people = [0] * 90 + [1] * 10

In [18]:
import random
random.shuffle(people)

In [19]:
a = [0] * 100
b = [1] * 100

In [20]:
accuracy(people, a)

0.9

In [25]:
precision(people, a)

0.0

In [26]:
recall(people, a)

0.0

In [27]:
accuracy(people, b)

0.1

In [28]:
precision(people, b)

0.1

In [29]:
recall(people, b)

1.0

In [30]:
boston_model = [true_labels, predicated_labels, loss_labels]

In [31]:
with open('boston_labels', 'wb') as f:
    pickle.dump(boston_model, f)

In [32]:
len(loss_labels)

100

In [33]:
print(len(true_labels), len(predicated_labels), len(loss_labels))

100 100 100


![欢迎订阅：坍缩的奇点](../assets/Capture-2023-11-02-164446.png)