In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
np.random.seed(3413)

In [3]:
# path=r'C:\Users\imim555\iCloudDrive\KNU\DM\file\train_titanic.csv'
df=pd.read_csv('train_titanic.csv')

# 데이터 확인-----------------------------------
 # 종속변수 확인
y_data=df['Survived']


 # 결측치 확인
df.isnull().sum()
df.value_counts

df=df.fillna(value={'Embarked':'S', 'Age':df['Age'].mean()}) 
df=df.round()


# 데이터 변환------------------------------------
# 카테고리형변수 수치화
df['Sex']=df['Sex'].map({'male':0, 'female':1}) 

df['PortS'] = df['Embarked'].apply(lambda x: 1 if x == 'S' else 0)
df['PortC'] = df['Embarked'].apply(lambda x: 1 if x == 'C' else 0)

# 파생변수 생성
df['Family']=df['SibSp']+df['Parch'] 

In [4]:
# 데이터 정규화--------------------------------

df0=df.copy() # 정규화X
df1=df.copy() # 정규화(Min-Max)


# Min-Max Normalization = (X - MIN) / (MAX-MIN) 
def Min_Max(x):
    min=x.min()
    max=x.max()
    normal=(x-min)/(max-min)
    return normal

age1=Min_Max(df['Age'])
fare1=Min_Max(df['Fare'])
family1=Min_Max(df['Family'])
pclass1=Min_Max(df['Pclass'])



# 컬럼 수정-----------------------------------
df1['age']=age1
df1['fare']=fare1
df1['family']=family1
df1['pclass']=pclass1


# 불필요한 변수 삭제
df1.drop(columns=['Survived','Name', 'PassengerId', 'Ticket','Cabin','Age','SibSp','Parch','Fare','Embarked','Family'], axis=1, inplace=True)
df0.drop(columns=['Survived','Name', 'PassengerId', 'Ticket','Cabin','SibSp','Parch','Embarked'], axis=1, inplace=True)


# 컬럼순서 변경
df0=df0[['Sex', 'Age', 'Family', 'Fare', 'Pclass', 'PortS','PortC' ]]
df1=df1[['Sex', 'age', 'family', 'fare', 'pclass', 'PortS','PortC' ]]


In [5]:
# 데이터 분할--------------------------------

# 종속/독립변수 나누기
y_data=y_data
x_data=df1

y_data=np.array(y_data)
x_data=np.array(x_data)



# 학습/검증용 나누기
size = y_data.shape[0] # 891

idx=np.arange(size)    # 일련번호 생성 후 재배열
np.random.shuffle(idx) 

num=int(size*0.7)      # 분리 비율(7:3)

x_train = x_data[idx[:num]] 
x_test = x_data[idx[num:]]
y_train = y_data[idx[:num]]
y_test = y_data[idx[num:]]


In [6]:
import tqdm

In [7]:
class Custom_SVM:
    def __init__(self, kernel='linear', C=100, max_iter=10000, degree=3, gamma=10):
        self.kernel = {'poly'  : lambda x, y: np.dot(x, y.T)**degree,
                       'rbf'   : lambda x, y: np.exp(-gamma*np.sum((y - x[:, np.newaxis])**2, axis=-1)),
                       'linear': lambda x, y: np.dot(x, y.T)}[kernel]
        self.C = C
        self.max_iter = max_iter

    def restrict_to_square(self, t, v0, u):
        t = (np.clip(v0 + t*u, 0, self.C) - v0)[1]/u[1]
        return (np.clip(v0 + t*u, 0, self.C) - v0)[0]/u[0]

    def fit(self, X, y):
        self.X = X.copy()
        self.y = y * 2 - 1 # 클래스 범위를 0/1에서 -1/1로 조정
        self.lambdas = np.zeros_like(self.y, dtype=float) # 제약 조건 중 0 이상 범위를 만족하는 lambda 생성
        self.K = self.kernel(self.X, self.X) * self.y[:, np.newaxis] * self.y # kernel matrix 생성
    
        for _ in tqdm.tqdm(range(self.max_iter)):
            for idxM in range(len(self.lambdas)): # lambda_M 반복
                idxL = np.random.randint(0, len(self.lambdas)) # lambda_L 랜덤 선택
                Q = self.K[[[idxM, idxM], [idxL, idxL]], [[idxM, idxL], [idxM, idxL]]] # lambda_M 및 Lambda_L에 대한 Kernel matrix(Q) 설정
                v0 = self.lambdas[[idxM, idxL]] # v_0 설정
                k0 = 1 - np.sum(self.lambdas * self.K[[idxM, idxL]], axis=1) # k_0 설정
                u = np.array([-self.y[idxL], self.y[idxM]]) # u 벡터 설정
                t_max = np.dot(k0, u) / (np.dot(np.dot(Q, u), u) + 1E-15) # argmax lambda*(t)를 계산하고, lambda_M 및 Lambda_L이 같은 경우에 업데이트를 위하여 작은 상수를 더함
                self.lambdas[[idxM, idxL]] = v0 + u * self.restrict_to_square(t_max, v0, u) # Cost를 활용하여 범위를 제한하여 제약 조건을 충족시킴
    
        idx = np.nonzero(self.lambdas > 1E-15) # select indexes of support vectors
        self.b = np.sum((1.0 - np.sum(self.K[idx] * self.lambdas, axis=1)) * self.y[idx]) / len(idx) # eq .(3)
  
    def predict(self, X):
        result = np.sum(self.kernel(X, self.X) * self.y * self.lambdas, axis=1) + self.b # f from eq .(2)
        return np.where(result > 0, 1, 0)
    
    def accuracy(self, y_pred, y_test):
        acc = np.array([1 if pred == test else 0 for pred, test in zip(y_pred, y_test)])
        return acc.sum() / len(acc)

In [8]:
model = Custom_SVM()
model.fit(x_train, y_train)
predict = model.predict(x_test)
model.accuracy(predict, y_test)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [08:26<00:00, 19.74it/s]


0.585820895522388

In [9]:
model = Custom_SVM(kernel='rbf')
model.fit(x_train, y_train)
predict = model.predict(x_test)
model.accuracy(predict, y_test)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [08:27<00:00, 19.72it/s]


0.4141791044776119

In [10]:
model = Custom_SVM(kernel='poly')
model.fit(x_train, y_train)
predict = model.predict(x_test)
model.accuracy(predict, y_test)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [08:22<00:00, 19.91it/s]


0.4141791044776119