In [55]:
import pandas as pd
import torch
from torch import nn, optim
from scipy.io import arff
import numpy as np

In [57]:
def read_arrf(file):
    with open(file, encoding="utf-8") as f:
        header = []
        for line in f:
            if line.startswith("@attribute"):
                header.append(line.split(sep='\'')[1])
            elif line.startswith("@data"):
                break
        df = pd.read_csv(f, header=None)
        df.columns = header
    return df

In [58]:
train_df = read_arrf("../NSL-KDD/KDDTrain+.arff")
test_df = read_arrf('../NSL-KDD/KDDTest+.arff')
train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [59]:
# 取出 index 为 10000-11000 的 1000 条数据
subset_df = train_df.loc[10000:11000, :]

# 将所有数值型属性的值修改为 5000
numeric_cols = subset_df.select_dtypes(include='number').columns
subset_df[numeric_cols] = 5000

# 将处理后的子数据集添加到原来的 dataframe 里
pos_df = pd.concat([train_df, subset_df], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df[numeric_cols] = 5000


In [60]:
train_x = train_df.iloc[:, :-1]
test_x = test_df.iloc[:, :-1]
pos_train_x = pos_df.iloc[:, :-1]
col_feature = train_x.dtypes[train_x.dtypes == 'object'].index
num_feature = train_x.dtypes[train_x.dtypes != 'object'].index

#df[num_feature] = df[num_feature].apply(lambda x:((x-x.mean()) / (x.std() + 1)))
def min_max_normalization(data):
    #对原始数据进行min-max归一化处理

        max_val = max(data)
        min_val = min(data)
        if min_val == max_val:
            return data
        else:
            norm_data = [(x - min_val) / (max_val - min_val) for x in data]
            return norm_data
train_num_feature = train_x[num_feature].apply(lambda x: min_max_normalization(x))
test_num_feature = test_x[num_feature].apply(lambda x: min_max_normalization(x))
pos_num_feature = pos_train_x[num_feature].apply(lambda x: min_max_normalization(x))

train_one_hot = pd.get_dummies(train_x[col_feature], dummy_na=True)
pos_one_hot = pd.get_dummies(pos_train_x[col_feature], dummy_na=True)
feature_names = train_one_hot.columns
test_one_hot = pd.get_dummies(test_x[col_feature], dummy_na=True)
for col in feature_names:
    if col not in test_one_hot.columns:
        test_one_hot[col] = 0
# 调整特征的顺序
test_encoded = test_one_hot[feature_names]
#df.head()
train_y = train_df.iloc[:, -1]
test_y = test_df.iloc[:, -1]
pos_train_y = pos_df.iloc[:, -1]

mapping = {'normal':0, 'anomaly':1}
train_y = train_y.map(mapping).T
test_y = test_y.map(mapping).T
pos_train_y = pos_train_y.map(mapping).T

train_x = pd.concat([train_one_hot, train_num_feature], axis=1).values
test_x = pd.concat([test_one_hot, test_num_feature], axis=1).values
pos_train_x = pd.concat([pos_one_hot, pos_num_feature], axis=1).values

train_x = torch.tensor(train_x,dtype=torch.float32)
test_x = torch.tensor(test_x, dtype=torch.float32)
pos_train_x = torch.tensor(pos_train_x, dtype=torch.float32)

train_y = torch.tensor(train_y, dtype=torch.long)
test_y = torch.tensor(test_y, dtype=torch.long)
pos_train_y = torch.tensor(pos_train_y, dtype=torch.long)

In [61]:
def sigmoid(a):
    """
    利用tanh与sigmoid的关系: 1-2 * Sigmoid(x) = - tanh(x/2)
    """
    return np.tanh(a * 0.5) * 0.5 + 0.5

In [62]:
def fit_grad(
    x_train: np.ndarray,
    y_train: np.ndarray,
    learning_rate: float = 0.1,
    max_iter: int = 100,
):
    """梯度下降法迭代：逻辑回归模型的最大似然估计
    x_train: (N,D)
    y_train: (N,)
    learning_rate: 默认0.1
    max_iter: 默认100
    """
    w = np.zeros(np.size(x_train, 1))
    for _ in range(max_iter):
        w_prev = np.copy(w)
        y = sigmoid(x_train @ w)
        grad = x_train.T @ (y - y_train)
        w -= learning_rate*grad
        if np.allclose(w, w_prev):
            break
    return w

In [63]:
def fit_Newton(
    x_train: np.ndarray,
    y_train: np.ndarray,
    max_iter: int = 100,
):
    """Newton法迭代：逻辑回归模型的最大似然估计
    x_train: (N,D)
    y_train: (N,)
    max_iter: 默认100
    """
    w = np.zeros(np.size(x_train, 1))
    for _ in range(max_iter):
        w_prev = np.copy(w)
        y = sigmoid(x_train @ w)
        grad = x_train.T @ (y - y_train)
        hessian = (x_train.T * y * (1 - y)) @ x_train
        try:
            w -= np.linalg.solve(hessian, grad)
        except np.linalg.LinAlgError:
            break
        if np.allclose(w, w_prev):
            break
    return w

In [64]:
class LogisticRegression():
    
    @staticmethod
    def _sigmoid(a):
        return np.tanh(a * 0.5) * 0.5 + 0.5

    def fit(
        self,
        x_train: np.ndarray,
        y_train: np.ndarray,
        max_iter: int = 100,
    ):
        """
        x_train : (N, D) np.ndarray
        y_train : (N,) np.ndarray
        max_iter : int, optional
        """
        w = np.zeros(np.size(x_train, 1))
        for _ in range(max_iter):
            w_prev = np.copy(w)
            y = self._sigmoid(x_train @ w)
            grad = x_train.T @ (y - y_train)
            hessian = (x_train.T * y * (1 - y)) @ x_train
            try:
                w -= np.linalg.solve(hessian, grad)
            except np.linalg.LinAlgError:
                break
            if np.allclose(w, w_prev):
                break
        self.w = w

    def proba(self, x: np.ndarray):
        """
        x : (N, D) np.ndarray
        """
        return self._sigmoid(x @ self.w)

    def classify(self, x: np.ndarray, threshold: float = 0.5):
        """
        x : (N, D) np.ndarray
            Input independent variable to be classified
        threshold : float, optional
            threshold of binary classification (default is 0.5)
        """
        return (self.proba(x) > threshold).astype(int)

In [65]:
train_x = np.concatenate((np.ones((train_x.shape[0],1)),train_x),axis=1)
train_y = np.ones(train_y.shape)
train_x.head(5)

RuntimeError: Numpy is not available