In [1]:
import numpy as np
import pandas as pd
from cvxopt import matrix, solvers

def kernel(x1, x2):
    return np.dot(x1, x2.T)

def svm_qp(X, y):
    m, n = X.shape
    y = y.astype(float)
    
    # 创建QP问题的P, q, G, h, A, b矩阵
    K = np.dot(X, X.T) * np.outer(y, y)
    P = matrix(K)
    q = matrix(-np.ones((m, 1)))
    G = matrix(-np.eye(m))
    h = matrix(np.zeros(m))
    A = matrix(y.reshape(1, -1).astype(float))
    b = matrix(np.zeros(1))

    # 求解QP问题
    sol = solvers.qp(P, q, G, h, A, b)
    alphas = np.array(sol['x']).flatten()
    
    # 计算权重向量w
    w = np.sum(alphas * y[:, None] * X, axis=0)
    
    # 计算偏置b
    support_vectors = (alphas > 1e-5)
    b = np.mean(y[support_vectors] - np.dot(X[support_vectors], w))
    
    return w, b, alphas

# 加载数据
X_train = np.loadtxt('breast_cancer_Xtrain.csv', delimiter=',')
y_train = np.loadtxt('breast_cancer_Ytrain.csv', delimiter=',')

# 使用训练数据集进行QP求解
w, b, alphas = svm_qp(X_train, y_train)
print("Weights:", w)
print("Bias:", b)

def predict(X, w, b):
    return np.sign(np.dot(X, w) + b)

# 加载测试数据
X_test = np.loadtxt('breast_cancer_Xtest.csv', delimiter=',')
y_test = np.loadtxt('breast_cancer_Ytest.csv', delimiter=',')

# 对测试集进行预测
predictions = predict(X_test, w, b)

# 计算准确率
accuracy = np.mean(predictions == y_test)
print("Accuracy on test data:", accuracy)



     pcost       dcost       gap    pres   dres
 0: -1.1191e+02 -3.3075e+02  3e+03  5e+01  3e+00
 1: -2.7273e+02 -6.0641e+02  2e+03  3e+01  2e+00
 2: -5.6743e+02 -1.0026e+03  2e+03  2e+01  1e+00
 3: -1.1165e+03 -1.7567e+03  2e+03  2e+01  1e+00
 4: -2.5357e+03 -3.5897e+03  2e+03  2e+01  1e+00
 5: -3.6387e+03 -4.8619e+03  2e+03  2e+01  1e+00
 6: -8.2502e+03 -9.9521e+03  3e+03  2e+01  1e+00
 7: -2.0022e+04 -2.3052e+04  5e+03  2e+01  1e+00
 8: -2.9037e+04 -3.4060e+04  7e+03  2e+01  9e-01
 9: -4.4519e+04 -5.3878e+04  1e+04  1e+01  7e-01
10: -5.7800e+04 -6.7921e+04  1e+04  6e+00  3e-01
11: -5.9791e+04 -6.1332e+04  2e+03  5e-01  3e-02
12: -5.9723e+04 -5.9740e+04  2e+01  6e-03  4e-04
13: -5.9722e+04 -5.9723e+04  2e-01  6e-05  4e-06
14: -5.9722e+04 -5.9722e+04  2e-03  6e-07  4e-08
15: -5.9722e+04 -5.9722e+04  2e-05  6e-09  4e-10
Optimal solution found.


ValueError: operands could not be broadcast together with shapes (455,455) (455,30) 

In [None]:
import numpy as np

def kernel(x1, x2):
    return np.dot(x1, x2.T)

def calculate_b(X, y, alphas, b, C, tol):
    m = len(y)
    b_new = 0
    b1 = []
    b2 = []

    for i in range(m):
        y_pred = np.sum(alphas * y * kernel(X, X[i])) + b
        if y[i] * y_pred - 1 < -tol:
            b1.append(b + y[i] - y_pred)
        elif y[i] * y_pred - 1 > tol:
            b2.append(b + y[i] - y_pred)

    if len(b1) > 0:
        b_new = np.mean(b1)
    elif len(b2) > 0:
        b_new = np.mean(b2)

    return b_new

def smo_svm(X, y, C, tol, max_passes):
    m, n = X.shape
    alphas = np.zeros(m)
    b = 0
    passes = 0

    while passes < max_passes:
        alpha_pairs_changed = 0
        for i in range(m):
            E_i = np.sum(alphas * y * kernel(X, X[i])) + b - y[i]
            if (y[i] * E_i < -tol and alphas[i] < C) or (y[i] * E_i > tol and alphas[i] > 0):
                j = np.random.randint(0, m)
                while j == i:
                    j = np.random.randint(0, m)

                E_j = np.sum(alphas * y * kernel(X, X[j])) + b - y[j]
                alpha_i_old, alpha_j_old = alphas[i], alphas[j]

                if y[i] != y[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[i] + alphas[j] - C)
                    H = min(C, alphas[i] + alphas[j])

                if L == H:
                    continue

                eta = 2 * kernel(X[i], X[j]) - kernel(X[i], X[i]) - kernel(X[j], X[j])
                if eta >= 0:
                    continue

                alphas[j] -= y[j] * (E_i - E_j) / eta
                alphas[j] = np.clip(alphas[j], L, H)

                if abs(alphas[j] - alpha_j_old) < 1e-5:
                    continue

                alphas[i] += y[i] * y[j] * (alpha_j_old - alphas[j])
                b1 = b - E_i - y[i] * (alphas[i] - alpha_i_old) * kernel(X[i], X[i]) - y[j] * (alphas[j] - alpha_j_old) * kernel(X[i], X[j])
                b2 = b - E_j - y[i] * (alphas[i] - alpha_i_old) * kernel(X[i], X[j]) - y[j] * (alphas[j] - alpha_j_old) * kernel(X[j], X[j])
                if 0 < alphas[i] < C:
                    b = b1
                elif 0 < alphas[j] < C:
                    b = b2
                else:
                    b = (b1 + b2) / 2

                alpha_pairs_changed += 1

        if alpha_pairs_changed == 0:
            passes += 1
        else:
            passes = 0

    return alphas, b

# Load the data
X_train = np.loadtxt('breast_cancer_Xtrain.csv', delimiter=',')
y_train = np.loadtxt('breast_cancer_Ytrain.csv', delimiter=',')

C = 1.0
tol = 1e-3
max_passes = 5

alphas, b = smo_svm(X_train, y_train, C, tol, max_passes)
print("Alphas:", alphas)
print("b:", b)


Alphas: [3.11626038e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 8.91504933e-01 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.97123177e-01
 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.45949480e-01
 0.00000000e+00 

In [None]:
def predict(X, alphas, b, X_train, y_train):
    return np.sign(np.dot((alphas * y_train).T, kernel(X_train, X)) + b)

# Load test data
X_test = np.loadtxt('breast_cancer_Xtest.csv', delimiter=',')
y_test = np.loadtxt('breast_cancer_Ytest.csv', delimiter=',')

predictions = predict(X_test, alphas, b, X_train, y_train)

accuracy = np.mean(predictions == y_test)
print("Accuracy on test data:", accuracy)


Accuracy on test data: 0.9824561403508771


In [None]:
# 预测函数
def predict(X, alphas, b, X_train, y_train):
    return np.sign(np.dot((alphas * y_train).T, kernel(X_train, X)) + b)

# 加载测试数据
X_test = np.loadtxt('breast_cancer_Xtest.csv', delimiter=',')
y_test = np.loadtxt('breast_cancer_Ytest.csv', delimiter=',')

# 对测试集进行预测
predictions = predict(X_test, alphas, b, X_train, y_train)

# 计算准确率
accuracy = np.mean(predictions == y_test)
print("Accuracy on test data:", accuracy)

# 将预测结果写入 result.csv 文件
np.savetxt('result.csv', predictions, delimiter=',', fmt='%d')

# 加载原始标签和预测标签
original_labels = y_test
predicted_labels = predictions

# 创建 DataFrame 来存储比较结果
comparison_df = pd.DataFrame({
    'Original': original_labels,
    'Predicted': predicted_labels
})

# 找出不同的标签
differences = comparison_df[comparison_df['Original'] != comparison_df['Predicted']]

# 输出不同标签的行数
print(f"Number of differences: {len(differences)}")

# 输出所有不同的标签
print("Differences between original and predicted labels:")
print(differences)



Accuracy on test data: 0.9824561403508771
Number of differences: 2
Differences between original and predicted labels:
    Original  Predicted
20       1.0       -1.0
77       1.0       -1.0


In [None]:
import numpy as np

def kernel(x1, x2):
    return np.dot(x1, x2.T)

def calculate_b(X, y, alphas, b, C, tol):
    m = len(y)
    b_new = 0
    b1 = []
    b2 = []

    for i in range(m):
        y_pred = np.sum(alphas * y * kernel(X, X[i])) + b
        if y[i] * y_pred - 1 < -tol:
            b1.append(b + y[i] - y_pred)
        elif y[i] * y_pred - 1 > tol:
            b2.append(b + y[i] - y_pred)

    if len(b1) > 0:
        b_new = np.mean(b1)
    elif len(b2) > 0:
        b_new = np.mean(b2)

    return b_new

def smo_svm(X, y, C, tol, max_passes):
    m, n = X.shape
    alphas = np.zeros(m)
    b = 0
    passes = 0

    while passes < max_passes:
        alpha_pairs_changed = 0
        for i in range(m):
            E_i = np.sum(alphas * y * kernel(X, X[i])) + b - y[i]
            if (y[i] * E_i < -tol and alphas[i] < C) or (y[i] * E_i > tol and alphas[i] > 0):
                j = np.random.randint(0, m)
                while j == i:
                    j = np.random.randint(0, m)

                E_j = np.sum(alphas * y * kernel(X, X[j])) + b - y[j]
                alpha_i_old, alpha_j_old = alphas[i], alphas[j]

                if y[i] != y[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[i] + alphas[j] - C)
                    H = min(C, alphas[i] + alphas[j])

                if L == H:
                    continue

                eta = 2 * kernel(X[i], X[j]) - kernel(X[i], X[i]) - kernel(X[j], X[j])
                if eta >= 0:
                    continue

                alphas[j] -= y[j] * (E_i - E_j) / eta
                alphas[j] = np.clip(alphas[j], L, H)

                if abs(alphas[j] - alpha_j_old) < 1e-5:
                    continue

                alphas[i] += y[i] * y[j] * (alpha_j_old - alphas[j])
                b1 = b - E_i - y[i] * (alphas[i] - alpha_i_old) * kernel(X[i], X[i]) - y[j] * (alphas[j] - alpha_j_old) * kernel(X[i], X[j])
                b2 = b - E_j - y[i] * (alphas[i] - alpha_i_old) * kernel(X[i], X[j]) - y[j] * (alphas[j] - alpha_j_old) * kernel(X[j], X[j])
                if 0 < alphas[i] < C:
                    b = b1
                elif 0 < alphas[j] < C:
                    b = b2
                else:
                    b = (b1 + b2) / 2

                alpha_pairs_changed += 1

        if alpha_pairs_changed == 0:
            passes += 1
        else:
            passes = 0

    return alphas, b

# Load the data
X_train = np.loadtxt('breast_cancer_Xtrain.csv', delimiter=',')
y_train = np.loadtxt('breast_cancer_Ytrain.csv', delimiter=',')

C = 1.0
tol = 1e-3
max_passes = 5

alphas, b = smo_svm(X_train, y_train, C, tol, max_passes)
print("Alphas:", alphas)
print("b:", b)


def predict(X, alphas, b, X_train, y_train):
    return np.sign(np.dot((alphas * y_train).T, kernel(X_train, X)) + b)

# Load test data
X_test = np.loadtxt('breast_cancer_Xtest.csv', delimiter=',')
y_test = np.loadtxt('breast_cancer_Ytest.csv', delimiter=',')

predictions = predict(X_test, alphas, b, X_train, y_train)

accuracy = np.mean(predictions == y_test)
print("Accuracy on test data:", accuracy)

# 预测函数
def predict(X, alphas, b, X_train, y_train):
    return np.sign(np.dot((alphas * y_train).T, kernel(X_train, X)) + b)

# 加载测试数据
X_test = np.loadtxt('breast_cancer_Xtest.csv', delimiter=',')
y_test = np.loadtxt('breast_cancer_Ytest.csv', delimiter=',')

# 对测试集进行预测
predictions = predict(X_test, alphas, b, X_train, y_train)

# 计算准确率
accuracy = np.mean(predictions == y_test)
print("Accuracy on test data:", accuracy)

# 将预测结果写入 result.csv 文件
np.savetxt('result.csv', predictions, delimiter=',', fmt='%d')

# 加载原始标签和预测标签
original_labels = y_test
predicted_labels = predictions

# 创建 DataFrame 来存储比较结果
comparison_df = pd.DataFrame({
    'Original': original_labels,
    'Predicted': predicted_labels
})

# 找出不同的标签
differences = comparison_df[comparison_df['Original'] != comparison_df['Predicted']]

# 输出不同标签的行数
print(f"Number of differences: {len(differences)}")

# 输出所有不同的标签
print("Differences between original and predicted labels:")
print(differences)




Alphas: [ 3.32804619e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  3.71517355e-01  0.00000000e+00
  2.77555756e-17  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  2.77555756e-17  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.77555756e-17  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.74336859e-01
  0.00000000e+00 