In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
%matplotlib inline

In [7]:
def sigmoid(z):
    s = 1/(1+np.exp(-z))
    return s

def model(theta, X):
    z = np.sum(theta.T * X, axis=1)
    return sigmoid(z)

def cost_function(theta, X, y):
    y_hat = model(theta, X)
    n_samples = y.shape[0]
    cost = sum(-y*np.log(y_hat)-(1-y)*np.log(1-y_hat))/n_samples # y_hat是我们求出来的预测值
    return cost

def optimize(theta,X,y,reg):
    n_samples = X.shape[0]  # n即样本数
    n_paras = X.shape[1]
    alpha = 1e-1
    y_hat = model(theta,X)  # 即0-1的值,象征概率
    y_diff = (y_hat-y).reshape(n_samples,1)
    dtheta = (1.0/n_samples) * (y_diff*X)
    dtheta = np.sum(dtheta, axis=0)
    dtheta = dtheta.reshape((n_paras,1))
    theta = theta - alpha * (dtheta+(reg/n_samples)*theta) # (1.0/n)*theta is the regularization term,lambda is 1.0
    return theta

def accuracy(theta, X, y):
    y_hat=model(theta, X)
    y_pre=(y_hat > 0.5).astype('int')
    count_right=sum(y_pre == y)
    return count_right*1.0/len(y)

def add_ones(X):
    ones=np.ones((X.shape[0],1))
    X_with_ones=np.hstack((ones, X))
    return X_with_ones   # 用于与theta_0作对应,左侧

def iterate(theta,X,y,times,reg): # 初始参数以及循环次数
    costs = []
    accs = []
    for i in range(times):
        theta = optimize(theta,X,y,reg)
        costs.append(cost_function(theta, X, y))
        accs.append(accuracy(theta, X, y))
    return theta, costs, accs

dataset=load_breast_cancer()
X=dataset.data
y=dataset.target
std = X.std(axis=0)
mean = X.mean(axis=0)
X_norm = (X-mean)/std
X_with_ones = add_ones(X_norm)
# 初始化参数
theta = np.ones((X.shape[1]+1,1))
X_train, X_test, y_train, y_test = train_test_split(X_with_ones, y, test_size = 0.3, random_state=12345)
iter_times = 1500
reg_term = 1.0
theta, costs, accs = iterate(theta, X_train, y_train, iter_times,reg_term)
print(costs[-1], accs[-1])   # 训练集的损失和精确度
accuracy(theta, X_test, y_test)

  cost = sum(-y*np.log(y_hat)-(1-y)*np.log(1-y_hat))/n_samples # y_hat是我们求出来的预测值


0.05322014508616222 0.992462311557789


0.9824561403508771

In [5]:
clf = LogisticRegression(C=1).fit(X_train,y_train)

In [6]:
clf.score(X_test,y_test)

0.9766081871345029