1

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from cvxopt import matrix, solvers
import matplotlib.pyplot as plt

2

In [2]:
# ===== 1. Đọc dữ liệu =====
df_sample = pd.read_csv("online_shoppers_intention.csv")

In [3]:
df = df_sample.sample(frac = 0.1, random_state = 42)

3


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1233 entries, 8916 to 7054
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           1233 non-null   int64  
 1   Administrative_Duration  1233 non-null   float64
 2   Informational            1233 non-null   int64  
 3   Informational_Duration   1233 non-null   float64
 4   ProductRelated           1233 non-null   int64  
 5   ProductRelated_Duration  1233 non-null   float64
 6   BounceRates              1233 non-null   float64
 7   ExitRates                1233 non-null   float64
 8   PageValues               1233 non-null   float64
 9   SpecialDay               1233 non-null   float64
 10  Month                    1233 non-null   object 
 11  OperatingSystems         1233 non-null   int64  
 12  Browser                  1233 non-null   int64  
 13  Region                   1233 non-null   int64  
 14  TrafficType              1

In [5]:
df['Revenue'] = df['Revenue'].map({True: 1, False: -1})

# ===== 3. One-hot encoding =====
df = pd.get_dummies(df, columns=['Month', 'VisitorType', 'Weekend'], drop_first=True)

# ===== 4. Xử lý missing value =====
df = df.dropna()

# ===== 5. Tách đặc trưng và nhãn =====
X = df.drop('Revenue', axis=1).values
y = df['Revenue'].values.astype(float)


In [6]:
print(df['Revenue'].value_counts())
df.describe()

Revenue
-1    1030
 1     203
Name: count, dtype: int64


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType,Revenue
count,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0
mean,2.344688,78.120823,0.447689,30.9164,31.875912,1194.643361,0.024519,0.045412,6.146652,0.062774,2.152474,2.384428,3.152474,4.046229,-0.670722
std,3.359,157.02763,1.196883,132.227434,46.96346,1958.019302,0.052065,0.051403,19.211589,0.199637,0.886101,1.702696,2.322136,3.925066,0.74201
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,-1.0
25%,0.0,0.0,0.0,0.0,7.0,163.0,0.0,0.013672,0.0,0.0,2.0,2.0,1.0,2.0,-1.0
50%,1.0,4.0,0.0,0.0,18.0,566.566667,0.00381,0.026923,0.0,0.0,2.0,2.0,3.0,2.0,-1.0
75%,4.0,98.0,0.0,0.0,38.0,1433.634921,0.018182,0.05,0.0,0.0,3.0,2.0,4.0,4.0,-1.0
max,20.0,1640.590909,14.0,1830.5,517.0,27009.85943,0.2,0.2,254.607158,1.0,8.0,13.0,9.0,20.0,1.0


In [7]:
# ===== 6. Train/Test split =====
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ===== 7. Chuẩn hóa =====
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# ===== 8. RBF Kernel =====
# def rbf_kernel(x1, x2, sigma=1.0):
#     return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (2 * sigma ** 2))

def rbf_kernel(x1, x2, sigma=1.0):
    gamma = 1 / (2 * sigma ** 2)
    
      # Kiểm tra và chuyển đổi input thành 2D nếu cần
    x1 = np.array(x1)
    x2 = np.array(x2)
    
    if x1.ndim == 1:
        x1 = x1.reshape(1, -1)  # Chuyển (d,) -> (1, d)
    if x2.ndim == 1:
        x2 = x2.reshape(1, -1)  # Chuyển (d,) -> (1, d)
    
    # Bước 1: Tính ||x_i||^2 và ||x_j||^2
    x1_norm = np.sum(x1 ** 2, axis=1)[:, np.newaxis]  # shape (m, 1)
    x2_norm = np.sum(x2 ** 2, axis=1)                # shape (n,)
    
    # Bước 2: Tính ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2⟨x_i, x_j⟩
    pairwise_dist = x1_norm + x2_norm - 2 * np.dot(x1, x2.T)  # shape (m, n)
    
    # Bước 3: Áp dụng hàm exp
    return np.exp(-gamma * pairwise_dist)

# Chia dữ liệu
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Áp dụng SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train) 

# Chuyển nhãn 0 thành -1
print(pd.Series(y_resampled).value_counts())
# y_resampled = np.where(y_resampled == 0, -1, 1)
# y_test = np.where(y_test == 0, -1, 1)
print("Các nhãn sau khi SMOTE:", np.unique(y_resampled))
print("Các nhãn sau khi SMOTE:", np.unique(y_test))

-1.0    826
 1.0    826
Name: count, dtype: int64
Các nhãn sau khi SMOTE: [-1.  1.]
Các nhãn sau khi SMOTE: [-1.  1.]


In [None]:




# ===== 9. Tính ma trận Gram (train) =====
n_samples = X_resampled.shape[0]
K = np.zeros((n_samples, n_samples))
for i in range(n_samples):
    for j in range(n_samples):
        K[i, j] = rbf_kernel(X_resampled[i], X_resampled[j])

# ===== 10. QP input =====
C = 1.0 # Regularization parameter
P = matrix(np.outer(y_resampled, y_resampled) * K) #ok check roi
q = matrix(-np.ones(n_samples)) #ok check roi
G = matrix(np.vstack([-np.eye(n_samples), np.eye(n_samples)])) #ok check roi
h = matrix(np.hstack([np.zeros(n_samples), np.ones(n_samples) * C])) #ok check roi
A = matrix(y_resampled.reshape(1, -1)) #ok check roi
b = matrix(0.0) #0.0 hay la 0
#b = matrix(np.zeros(1)) #ok check roi

# ===== 11. Solve QP =====
solvers.options['show_progress'] = False #chua biet
solvers.options['abstol'] = 1e-10
solvers.options['reltol'] = 1e-10 #chua biet
solvers.options['feastol'] = 1e-10 #chua biet
sol = solvers.qp(P, q, G, h, A, b)
alphas = np.ravel(sol['x']) #ok check roi

# ===== 12. Support vectors =====
threshold = 1e-5
sv = alphas > threshold
sv_X = X_resampled[sv] #support vectors
sv_y = y_resampled[sv]
sv_alpha = alphas[sv]

# 2. Xác định các Margin Vectors (0 < alpha_i < C)
margin_mask = (alphas > threshold) & (alphas < C - threshold)
margin_X = X_resampled[margin_mask]
margin_y = y_resampled[margin_mask]

if len(margin_X) > 0:
    K_ms = rbf_kernel(margin_X, sv_X, sigma=1.0)
    # Tính b cho các Margin Vectors
    a_s = sv_alpha * sv_y
    b = np.dot((margin_y - np.dot(K_ms, a_s)).T, np.ones(len(margin_X))) / len(margin_X)
else:
    # Fallback nếu không có margin vectors #can xem lai
    b = sv_y[0] - np.dot(rbf_kernel(sv_X[0:1], sv_X, sigma=sigma), a_s)[0]

# ===== 13. Dự đoán =====
# hàm projext này bị sai, chạy ra kết quả có = 0
def project(x):
    # # Chuyển x thành ma trận 2D nếu cần
    # x = np.atleast_2d(x)
    
    # Tính K_bs: kernel giữa x và các support vectors
    K_bs = rbf_kernel(x, sv_X, sigma=1.0)
    
    # Tính y = K_bs * a_s + b
    y = np.dot(K_bs, sv_alpha * sv_y) + b
    print("Các nhãn sau khi SMOTE:", np.unique(y_test))
    print("1. Shape của y:", y.shape())
    return y
    # return y[0]  # Trả về giá trị scalar nếu input là 1 điểm

# def predict(X_test):
#     return np.sign([project(x) for x in X_test])
#     return np.sign([])

# # ===== 14. Dự đoán và đánh giá =====
# y_pred = predict(X_test)
# print("Các nhãn sau khi SMOTE:", np.unique(y_pred))
# acc = accuracy_score(y_test, y_pred)
# prec = precision_score(y_test, y_pred, pos_label=1)
# rec = recall_score(y_test, y_pred, pos_label=1)
# f1 = f1_score(y_test, y_pred, pos_label=1)

# print(f"\n🎯 Evaluation on Test Set:")
# print(f"Accuracy : {acc:.4f}")
# print(f"Precision: {prec:.4f}")
# print(f"Recall   : {rec:.4f}")
# print(f"F1 Score : {f1:.4f}")

# # ===== 15. Confusion Matrix =====
# cm = confusion_matrix(y_test, y_pred, labels=[1, -1])
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Revenue=True", "Revenue=False"])
# disp.plot()
# plt.title("Confusion Matrix - SVM with RBF Kernel")
# plt.show()


  K[i, j] = rbf_kernel(X_resampled[i], X_resampled[j])
