# K-Nearest Neighbors

새로운 데이터(x)와 가장 가까운 k개의 데이터를 통해 x를 분류하는 방법
- euclidean distance
$$\sqrt{(x_1-\mu_1)^2 + (x_2-\mu_2)^2 + \cdots + (x_p-\mu_p)^2}$$

- manhattan distance
$$|x_1-\mu_1| + |x_2-\mu_2| + \cdots + |x_p-\mu_p|$$

- 변수의 값이 가지는 스케일의 차이가 모델 학습에 영향을 미치는 것을 막기 위해 스케일링 수행이 필요
- 각 변수의 값 차이가 가졌던 정보는 남아있도록 스케일링

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import sklearn as sc

In [4]:
# Build a model that classifies a sample from the data points nearest to it.

mobile_price = pd.read_csv('train.csv')
print(mobile_price.shape)
mobile_price.head()
# 2000 rows, 21 columns.

(2000, 21)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [19]:
# List the distinct values found in the price_range column (the target).
mobile_price['price_range'].unique()

array([1, 2, 3, 0])

In [5]:
# Show the column labels of the loaded DataFrame.
mobile_price.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [11]:
columns = mobile_price.columns

# Every column except the last is a feature; the last column is the target.
features = mobile_price[columns[:-1]]
y = mobile_price[columns[-1]]

# Split BEFORE scaling so the held-out rows do not influence the scaler's
# mean/std (fitting on all rows first leaks test statistics into training).
# 10% of the 2000 rows -> 200 rows held out for testing.
raw_train, raw_test, y_train, y_test = train_test_split(
    features, y, test_size=0.1, random_state=0
)

# Fit the scaler on the training rows only, then reuse those statistics
# for the test rows.
SC = StandardScaler()
x_train = SC.fit_transform(raw_train)
x_test = SC.transform(raw_test)

# Keep X available as the scaled feature matrix for any later cell that
# reads it; it is scaled with the training-set statistics.
X = SC.transform(features)

# Number of features per sample.
len(x_test[0])

20

In [12]:
# Number of samples in the training split and in the test split.
print(len(x_train), len(x_test))

1800 200


In [16]:
# Try k = 1..11 with Manhattan distance and report, for each k, the accuracy
# on the training split followed by the accuracy on the test split.
for k in range(1, 12):
    knn_model = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    knn_model.fit(x_train, y_train)

    train_accuracy = knn_model.score(x_train, y_train)
    test_accuracy = knn_model.score(x_test, y_test)

    print(train_accuracy)
    print(test_accuracy)
    print("==========================================")

1.0
0.37
0.7388888888888889
0.425
0.7316666666666667
0.435
0.7422222222222222
0.53
0.7311111111111112
0.545
0.7277777777777777
0.605
0.7233333333333334
0.61
0.7255555555555555
0.625
0.73
0.595
0.7216666666666667
0.595
0.7166666666666667
0.62


In [18]:
# Refit with k=8 and Manhattan distance.
knn_model = KNeighborsClassifier(n_neighbors=8, metric='manhattan')
knn_model.fit(x_train, y_train)

# Per-class probabilities for the first test sample.
knn_model.predict_proba(x_test)[0]

array([0.   , 0.125, 0.375, 0.5  ])

In [20]:
# Predicted class label for the first test sample.
knn_model.predict(x_test)[0]

3

실습2 삼각형 분류


In [23]:
# Distinct label strings in the 'class' column of the triangle dataset.
pd.read_csv('dataset.csv')['class'].unique()

array(['obtuse triangle', 'acute triangle', 'right triangle'],
      dtype=object)

In [29]:
import torch
import pandas as pd
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Triangle dataset loaded from a CSV file.

    The first three columns are read as the three side values and the fourth
    column as a text label, which is converted to an integer class index.
    """

    # Text label -> integer class index. Built once instead of on every call.
    _LABEL_TO_INDEX = {'obtuse triangle': 2, 'acute triangle': 1, 'right triangle': 0}

    def __init__(self, file_path):
        """Read ``file_path`` with pandas and cache the columns in memory."""
        df = pd.read_csv(file_path)
        self.a = df.iloc[:, 0].values
        self.b = df.iloc[:, 1].values
        self.c = df.iloc[:, 2].values
        # The labels are stored as text, so convert them to class indices.
        self.y = list(map(self.string_to_vector, df.iloc[:, 3].values))
        self.length = len(df)

    def string_to_vector(self, value):
        """Return the class index for a label string.

        Raises:
            KeyError: if ``value`` is not one of the three known labels.
        """
        return self._LABEL_TO_INDEX[value]

    def __getitem__(self, index):
        """Return ``(x, y)`` for one sample.

        ``x`` is a float tensor holding the three side values sorted in
        ascending order; ``y`` is the class index as a long tensor.
        """
        x = torch.FloatTensor(sorted([self.a[index], self.b[index], self.c[index]]))
        # Convert only the requested label; building a tensor from the whole
        # label list on every access costs O(n) per sample.
        y = torch.tensor(self.y[index], dtype=torch.long)
        return x, y

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return self.length

class CustomModel(nn.Module):
    """Single linear layer mapping 3 input features to 3 output scores."""

    def __init__(self):
        super().__init__()

        # Kept inside nn.Sequential so parameter names stay 'layer.0.*'.
        stages = [nn.Linear(3, 3)]
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the raw scores."""
        scores = self.layer(x)
        return scores

# Load the triangle dataset and serve it in shuffled mini-batches of 128;
# drop_last=True discards a final batch that has fewer than 128 samples.
train_dataset = CustomDataset('dataset.csv')
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = 'cpu'

model = CustomModel().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10000):
    cost = 0.0
    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float: adding the loss tensor itself
        # keeps every batch's autograd history alive until the epoch ends.
        cost += loss.item()

    # Mean loss over the batches of this epoch.
    cost = cost / len(train_dataloader)

    # Report progress every 1000 epochs.
    if (epoch + 1) % 1000 == 0:
        print(f'Epoch : {epoch+1:4d}, Cost : {cost:.3f}')


Epoch : 1000, Cost : 0.220
Epoch : 2000, Cost : 0.141
Epoch : 3000, Cost : 0.101
Epoch : 4000, Cost : 0.081
Epoch : 5000, Cost : 0.066
Epoch : 6000, Cost : 0.056
Epoch : 7000, Cost : 0.049
Epoch : 8000, Cost : 0.043
Epoch : 9000, Cost : 0.039
Epoch : 10000, Cost : 0.035


In [31]:
with torch.no_grad():  # gradients are not tracked inside this block
    model.eval()  # switch the model to evaluation mode

    # Class index -> label string, used to decode predictions.
    classes = {2: 'obtuse triangle', 1: 'acute triangle', 0: 'right triangle'}

    # Two sample triangles, one per row.
    sample_sides = [
        [3.0, 4.0, 5.0],
        [3, 3, 3],
    ]
    inputs = torch.FloatTensor(sample_sides).to(device)

In [32]:
# Run the forward pass with autograd disabled. A no_grad() context that has
# already exited does not cover this call, so without this wrapper the
# result would carry a grad_fn and build an autograd graph during inference.
with torch.no_grad():
    outputs = model(inputs)

In [33]:
import torch.nn.functional as F

# Softmax over the class dimension, rounded to two decimals for display.
probabilities = F.softmax(outputs, dim=1)
print(torch.round(probabilities, decimals=2))

# Index of the highest-scoring class for each input row.
predicted = outputs.argmax(1)
print(predicted)

# Decode the class indices into their label strings.
print([classes.get(class_index) for class_index in predicted.tolist()])

tensor([[0.8800, 0.0500, 0.0600],
        [0.1500, 0.8500, 0.0000]], grad_fn=<RoundBackward1>)
tensor([0, 1])
['right triangle', 'acute triangle']
