In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale,minmax_scale,robust_scale
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
from statsmodels.api import qqplot, add_constant
import statsmodels.formula.api as smf
from statsmodels.tools.eval_measures import rmse
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import statsmodels.api as sm
def root_mean_squared_error(y_true,y_pred):
    return np.sqrt(mean_squared_error(y_true,y_pred))
def mean_absolute_percentage_error(y_true,y_pred):
    return np.mean(np.abs((y_true-y_pred)/y_true))*100
from sklearn.neighbors import KNeighborsClassifier
from statsmodels.api import Logit
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [46]:
df_raw = pd.read_csv("2BD/유방암.csv", engine = 'python', encoding = 'EUC-KR')
df_raw.head()

Unnamed: 0,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,음성,15.12,16.68,98.78,716.6,0.08876,0.09588,0.0755,0.04079,0.1594,...,17.77,20.24,117.7,989.5,0.1491,0.3331,0.3327,0.1252,0.3415,0.0974
1,양성,10.8,9.71,68.77,357.6,0.09594,0.05736,0.02531,0.01698,0.1381,...,11.6,12.02,73.66,414.0,0.1436,0.1257,0.1047,0.04603,0.209,0.07699
2,음성,18.31,18.58,118.6,1041.0,0.08588,0.08468,0.08169,0.05814,0.1621,...,21.31,26.36,139.2,1410.0,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938
3,양성,14.92,14.93,96.45,686.9,0.08098,0.08549,0.05539,0.03221,0.1687,...,17.18,18.22,112.0,906.6,0.1065,0.2791,0.3151,0.1147,0.2688,0.08273
4,양성,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,...,13.62,15.54,87.4,577.0,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915


## 인공 신경망

In [47]:
df_raw.isnull().sum(axis=0)

diagnosis                  0
mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
mean_compactness           0
mean_concavity             0
mean_concave_points        0
mean_symmetry              0
mean_fractal_dimension     0
radius_error               0
texture_error              0
perimeter_error            0
area_error                 0
smoothness_error           0
compactness_error          0
concavity_error            0
concave_points_error       0
symmetry_error             0
fractal_dimension_error    0
worst_radius               0
worst_texture              0
worst_perimeter            0
worst_area                 0
worst_smoothness           0
worst_compactness          0
worst_concavity            0
worst_concave_points       0
worst_symmetry             0
worst_fractal_dimension    0
dtype: int64

In [48]:
df_raw.boxplot()

<Axes: xlabel='gamma', ylabel='accuracy'>

In [7]:
df_raw = df_raw[df_raw["worst_area"]<4000]

In [49]:
df_raw_x=df_raw.drop("diagnosis", axis=1, inplace=False) #설명변수 설정
df_raw_y=df_raw["diagnosis"] #목표변수 설정
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_raw_x, df_raw_y, test_size = 0.3, random_state =1234)
df_train_x, df_valid_x, df_train_y, df_valid_y = train_test_split(df_train_x, df_train_y, test_size = 3/7, random_state =1234)
print(df_train_x.shape)
print(df_train_y.shape)
print(df_valid_x.shape)
print(df_valid_y.shape)
print(df_test_x.shape)
print(df_test_y.shape)

(128, 30)
(128,)
(96, 30)
(96,)
(96, 30)
(96,)


In [51]:
nn_uncustomized = MLPClassifier(random_state = 1234)
nn_uncustomized.fit(df_train_x, df_train_y)
#train set 정확도
print("Accuracy on traning set: {:.3f}".format(nn_uncustomized.score(df_train_x, df_train_y)))
#validation set 정확도
print("Accuracy on validation set: {:.3f}".format(nn_uncustomized.score(df_valid_x, df_valid_y)))

Accuracy on traning set: 0.914
Accuracy on validation set: 0.812


In [52]:
#train 및 validation 정확도 결과 저장용
train_accuracy = []; validation_accuracy =[]
#hidden_layer_sizes: 은닉층(20, 40, 60, 80, 100, 120, 140, 160)
para_hidden = [20 * hidden for hidden in range(1,9)]

for v_hidden in para_hidden:
    nn = MLPClassifier(hidden_layer_sizes = v_hidden, random_state = 1234) 
    nn.fit(df_train_x, df_train_y)
    train_accuracy.append(nn.score(df_train_x,df_train_y))
    validation_accuracy.append(nn.score(df_valid_x, df_valid_y))
#데이터 테이블로 저장    
df_accuracy_hidden = pd.DataFrame()
df_accuracy_hidden["HiddenLayer"] = para_hidden
df_accuracy_hidden["TrainAccuracy"] = train_accuracy
df_accuracy_hidden["validationAccuracy"] = validation_accuracy
#은닉층별 정확도 테이블
df_accuracy_hidden.round(3)

Unnamed: 0,HiddenLayer,TrainAccuracy,validationAccuracy
0,20,0.344,0.417
1,40,0.656,0.573
2,60,0.875,0.833
3,80,0.938,0.875
4,100,0.914,0.812
5,120,0.656,0.583
6,140,0.656,0.583
7,160,0.656,0.583


In [53]:
plt.plot(para_hidden, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_hidden, validation_accuracy, linestyle = "-", label = "validation Accuracy")
plt.ylabel("accuracy"); plt.xlabel("Hidden Layer")
plt.legend()

<matplotlib.legend.Legend at 0x176f0ba00>

In [54]:
train_accuracy = []; validation_accuracy =[]
#activation 변경
para_function = ["logistic", "tanh", "relu"]

for v_function in para_function:
    nn = MLPClassifier(activation = v_function, random_state = 1234)
    nn.fit(df_train_x, df_train_y)
    train_accuracy.append(nn.score(df_train_x,df_train_y))
    validation_accuracy.append(nn.score(df_valid_x, df_valid_y))
#데이터 테이블로 저장    
df_accuracy_function = pd.DataFrame()
df_accuracy_function["Activation Function"] = para_function
df_accuracy_function["TrainAccuracy"] = train_accuracy
df_accuracy_function["validationAccuracy"] = validation_accuracy
#Activaion Function별 정확도 테이블
df_accuracy_function.round(3)



Unnamed: 0,Activation Function,TrainAccuracy,validationAccuracy
0,logistic,0.945,0.885
1,tanh,0.938,0.896
2,relu,0.914,0.812


In [55]:
plt.plot(para_function, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_function, validation_accuracy, linestyle = "-", label = "validation Accuracy")
plt.ylabel("accuracy"); plt.xlabel("Activation Function")
plt.legend()

<matplotlib.legend.Legend at 0x176f0b610>

In [56]:
v_feature_name = df_train_x.columns
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_raw_x)
df_scaled = pd.DataFrame(df_scaled, columns=v_feature_name)
df_scaled.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,-0.157678,-0.879112,-0.16555,-0.232872,-0.744948,-0.47922,-0.550769,-0.628488,-0.997431,-0.392627,...,-0.149283,-1.156442,-0.153789,-0.224951,0.490637,0.139396,-0.09447,-0.301949,0.525611,0.511081
1,-1.312779,-2.560965,-1.328928,-1.150194,-0.240274,-1.153621,-1.143404,-1.199934,-1.773612,0.184077,...,-1.341759,-2.517236,-1.380396,-1.124201,0.254373,-1.046076,-1.159323,-1.480312,-1.387506,-0.507149
2,0.695279,-0.420644,0.602798,0.59604,-0.94738,-0.675307,-0.477679,-0.212084,-0.899042,-1.174101,...,0.534893,-0.143296,0.445031,0.432104,-0.61336,-0.367031,0.004076,0.17285,0.223844,-0.886803
3,-0.211154,-1.301385,-0.255876,-0.308762,-1.291796,-0.661126,-0.788224,-0.83441,-0.658535,-0.834209,...,-0.263312,-1.490846,-0.312546,-0.354487,-1.339335,-0.169261,-0.176669,-0.45823,-0.524077,-0.220788
4,-0.753945,-1.738136,-0.820314,-0.745704,-2.095198,-1.505004,-1.175403,-1.326415,-1.93395,-0.956793,...,-0.951354,-1.934512,-0.997708,-0.869505,-1.783511,-1.108951,-1.094404,-1.366747,-1.071301,-0.898278


In [57]:
df_scaled_train_x,df_scaled_test_x=train_test_split(df_scaled,test_size=0.3,random_state=1234)
df_scaled_train_x,df_scaled_validation_x=train_test_split(df_scaled_train_x,test_size=3/7,random_state=1234)
print("train data X size:{}".format(df_scaled_train_x.shape))
print("valid data X size:{}".format(df_scaled_validation_x.shape))
print("test data X size:{}".format(df_scaled_test_x.shape))

train data X size:(128, 30)
valid data X size:(96, 30)
test data X size:(96, 30)


In [60]:
train_accuracy = []; validation_accuracy =[]
para_hidden = [20 * hidden for hidden in range(1,11)]

for v_hidden in para_hidden:
#nn=MLPClassifier(hidden_layer_sizes = v_hidden, random_state = 1234)
    nn = MLPClassifier(hidden_layer_sizes = (v_hidden, v_hidden), random_state = 1234)
    nn.fit(df_scaled_train_x, df_train_y)
    train_accuracy.append(nn.score(df_scaled_train_x,df_train_y))
    validation_accuracy.append(nn.score(df_scaled_valid_x, df_valid_y))
#데이터 레이블로 저장    
df_accuracy_hidden = pd.DataFrame()
df_accuracy_hidden["HiddenLayer"] = para_hidden
df_accuracy_hidden["TrainAccuracy"] = train_accuracy
df_accuracy_hidden["validationAccuracy"] = validation_accuracy
df_accuracy_hidden.round(3)



NameError: name 'df_scaled_valid_x' is not defined

In [63]:
#train 및 validation 정확도 결과 저장용
train_accuracy = []; validation_accuracy =[]
#activation 변경
para_function = ["logistic", "tanh", "relu"]

for v_function in para_function:
    nn = MLPClassifier(activation = v_function, hidden_layer_sizes = (80,80),
                       random_state = 1234)
    nn.fit(df_scaled_train_x, df_train_y)
    train_accuracy.append(nn.score(df_scaled_train_x,df_train_y))
    validation_accuracy.append(nn.score(df_scaled_validation_x, df_valid_y))
#데이터 테이블로 저장
df_accuracy_function = pd.DataFrame()
df_accuracy_function["ActivationFunction"] = para_function
df_accuracy_function["TrainAccuracy"] = train_accuracy
df_accuracy_function["validationAccuracy"] = validation_accuracy
#Activation Function별 정확도 테이블
df_accuracy_function.round(3)



Unnamed: 0,ActivationFunction,TrainAccuracy,validationAccuracy
0,logistic,0.992,0.927
1,tanh,1.0,0.917
2,relu,1.0,0.938


In [65]:
plt.plot(para_function, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_function, validation_accuracy, linestyle = "-", label = "validation Accuracy")
plt.ylabel("accuracy"); plt.xlabel("function Layer")
plt.legend()
#Activation Function 지정(임의)
activation = "relu"

In [66]:
train_accuracy = []; validation_accuracy =[]
#solver 변경
para_solver = ["lbfgs", "sgd", "adam"]

for v_solver in para_solver:
    nn=MLPClassifier(solver = v_solver, activation = "relu", hidden_layer_sizes = (80,80), random_state=1234)
    nn.fit(df_scaled_train_x, df_train_y)
    train_accuracy.append(nn.score(df_scaled_train_x, df_train_y))
    validation_accuracy.append(nn.score(df_scaled_validation_x, df_valid_y))
    
#데이터 테이블로 저장
df_accuracy_solver = pd.DataFrame()
df_accuracy_solver["Solver"] = para_solver
df_accuracy_solver["TrainAccuracy"] = train_accuracy
df_accuracy_solver["validationAccuracy"] = validation_accuracy
#Solver별 정확도 테이블
df_accuracy_solver.round(3)




Unnamed: 0,Solver,TrainAccuracy,validationAccuracy
0,lbfgs,1.0,0.938
1,sgd,0.977,0.969
2,adam,1.0,0.938


In [67]:
plt.plot(para_solver, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_solver, validation_accuracy, linestyle = "-", label = "validation Accuracy")
plt.ylabel("accuracy"); plt.xlabel("Solver")
plt.legend()
#Solver 지정(임의)

<matplotlib.legend.Legend at 0x283e2c940>

In [68]:
train_accuracy = []; validation_accuracy =[]
#Mini Batch 변경:
para_batch = [20 * batch for batch in range(1, 10)]

for v_batch in para_batch:
    nn = MLPClassifier(batch_size = v_batch, solver = "adam", 
                       activation = "relu", hidden_layer_sizes = (80,80),
                       random_state = 1234)
    nn.fit(df_scaled_train_x, df_train_y)
    train_accuracy.append(nn.score(df_scaled_train_x,df_train_y))
    validation_accuracy.append(nn.score(df_scaled_validation_x, df_valid_y))

#데이터 테이블로 저장
df_accuracy_batch = pd.DataFrame()
df_accuracy_batch["Mini Batch"] = para_batch
df_accuracy_batch["TrainAccuracy"] = train_accuracy
df_accuracy_batch["validationAccuracy"] = validation_accuracy
#Mini Batch Size별 정확도 테이블
df_accuracy_batch.round(3)



Unnamed: 0,Mini Batch,TrainAccuracy,validationAccuracy
0,20,1.0,0.938
1,40,1.0,0.938
2,60,1.0,0.938
3,80,1.0,0.938
4,100,1.0,0.938
5,120,1.0,0.938
6,140,1.0,0.938
7,160,1.0,0.938
8,180,1.0,0.938


In [69]:
plt.plot(para_batch, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_batch, validation_accuracy, linestyle = "-", label = "validation Accuracy")
plt.ylabel("accuracy"); plt.xlabel("Mini Batch Size")
plt.legend()

<matplotlib.legend.Legend at 0x17b1e98e0>

## KNN

In [74]:
#데이터 구성하기
df=pd.read_csv("2BD/유방암.csv", engine = 'python', encoding = 'EUC-KR')
df

Unnamed: 0,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,음성,15.12,16.68,98.78,716.6,0.08876,0.09588,0.07550,0.04079,0.1594,...,17.77,20.24,117.70,989.5,0.14910,0.33310,0.33270,0.12520,0.3415,0.09740
1,양성,10.80,9.71,68.77,357.6,0.09594,0.05736,0.02531,0.01698,0.1381,...,11.60,12.02,73.66,414.0,0.14360,0.12570,0.10470,0.04603,0.2090,0.07699
2,음성,18.31,18.58,118.60,1041.0,0.08588,0.08468,0.08169,0.05814,0.1621,...,21.31,26.36,139.20,1410.0,0.12340,0.24450,0.35380,0.15710,0.3206,0.06938
3,양성,14.92,14.93,96.45,686.9,0.08098,0.08549,0.05539,0.03221,0.1687,...,17.18,18.22,112.00,906.6,0.10650,0.27910,0.31510,0.11470,0.2688,0.08273
4,양성,12.89,13.12,81.89,515.9,0.06955,0.03729,0.02260,0.01171,0.1337,...,13.62,15.54,87.40,577.0,0.09616,0.11470,0.11860,0.05366,0.2309,0.06915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,음성,19.55,23.21,128.90,1174.0,0.10100,0.13180,0.18560,0.10210,0.1989,...,20.82,30.44,142.00,1313.0,0.12510,0.24140,0.38290,0.18250,0.2576,0.07602
316,음성,19.10,26.29,129.10,1132.0,0.12150,0.17910,0.19370,0.14690,0.1634,...,20.33,32.72,141.30,1298.0,0.13920,0.28170,0.24320,0.18410,0.2311,0.09203
317,음성,24.25,20.20,166.20,1761.0,0.14470,0.28670,0.42680,0.20120,0.2655,...,26.02,23.99,180.90,2073.0,0.16960,0.42440,0.58030,0.22480,0.3222,0.08009
318,음성,19.17,24.80,132.40,1123.0,0.09740,0.24580,0.20650,0.11180,0.2397,...,20.96,29.94,151.70,1332.0,0.10370,0.39030,0.36390,0.17670,0.3176,0.10230


In [76]:
df.isnull().sum(axis=0)

diagnosis                  0
mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
mean_compactness           0
mean_concavity             0
mean_concave_points        0
mean_symmetry              0
mean_fractal_dimension     0
radius_error               0
texture_error              0
perimeter_error            0
area_error                 0
smoothness_error           0
compactness_error          0
concavity_error            0
concave_points_error       0
symmetry_error             0
fractal_dimension_error    0
worst_radius               0
worst_texture              0
worst_perimeter            0
worst_area                 0
worst_smoothness           0
worst_compactness          0
worst_concavity            0
worst_concave_points       0
worst_symmetry             0
worst_fractal_dimension    0
dtype: int64

In [77]:
df.boxplot()

<Axes: xlabel='Mini Batch Size', ylabel='accuracy'>

In [79]:
df= df[df["worst_area"]<4000] #이상치 제거

In [80]:
df_y=df['diagnosis'] #목표변수
df_x=df.drop('diagnosis',axis=1,inplace=False) #설명변수
df_train_x_before,df_test_x,df_train_y_before,df_test_y=train_test_split(df_x,df_y,test_size=0.3,random_state=1234)
print('train data X size : {}'.format(df_train_x_before.shape))
print('train data Y size : {}'.format(df_train_y_before.shape))
print('test data X size : {}'.format(df_test_x.shape))
print('test data X size : {}'.format(df_test_y.shape))

train data X size : (223, 30)
train data Y size : (223,)
test data X size : (96, 30)
test data X size : (96,)


In [81]:
df_train_x, df_val_x , df_train_y,df_val_y = train_test_split(df_train_x_before,df_train_y_before,\
test_size = 3/7, random_state = 1234)

print("train x:",df_train_x.shape)
print("train y:",df_train_y.shape)
print("test x:",df_val_x.shape)
print("test y:",df_val_y.shape)

train x: (127, 30)
train y: (127,)
test x: (96, 30)
test y: (96,)


In [82]:
knn_uncustomized=KNeighborsClassifier()
knn_uncustomized.fit(df_train_x,df_train_y)
print("Accuracy on training set : {:.3f}".format(knn_uncustomized.score(df_train_x,df_train_y)))
print("Accuracy on test set : {:.3f}".format(knn_uncustomized.score(df_test_x,df_test_y)))


Accuracy on training set : 0.945
Accuracy on test set : 0.906


In [83]:
train_accuracy = []
test_accuracy = []

para_n_neighbors = [i for i in range(1,16)]

for v_n_neighbors in para_n_neighbors :
    knn = KNeighborsClassifier(n_neighbors=v_n_neighbors)
    knn.fit(df_train_x,df_train_y)
    train_accuracy.append(knn.score(df_train_x, df_train_y))
    test_accuracy.append(knn.score(df_val_x,df_val_y))

df_accuracy_neighbors = pd.DataFrame()
df_accuracy_neighbors["Neighbors"] = para_n_neighbors
df_accuracy_neighbors["Train Accuracy"] = train_accuracy
df_accuracy_neighbors["Test Accuracy"] = test_accuracy
df_accuracy_neighbors.round(3)

Unnamed: 0,Neighbors,Train Accuracy,Test Accuracy
0,1,1.0,0.906
1,2,0.961,0.885
2,3,0.953,0.906
3,4,0.945,0.906
4,5,0.945,0.906
5,6,0.945,0.896
6,7,0.953,0.885
7,8,0.945,0.875
8,9,0.953,0.906
9,10,0.921,0.875


In [84]:
plt.plot(para_n_neighbors, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_n_neighbors, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.legend()

<matplotlib.legend.Legend at 0x286351bb0>

In [85]:
train_accuracy = []
test_accuracy = []
para_n_neighbors = [i for i in range(1,11)]*2
para_weights = (["uniform"])*10 + (["distance"]*10)
for (v_n_neighbors,v_weights) in zip(para_n_neighbors, para_weights) :
    knn = KNeighborsClassifier(n_neighbors=v_n_neighbors, weights = v_weights)
    knn.fit(df_train_x,df_train_y)
    train_accuracy.append(knn.score(df_train_x, df_train_y))
    test_accuracy.append(knn.score(df_val_x,df_val_y))
df_accuracy_weights = pd.DataFrame()
df_accuracy_weights["Neighbors"] = para_n_neighbors
df_accuracy_weights["Weights"] = para_weights
df_accuracy_weights["Train Accuracy"] = train_accuracy
df_accuracy_weights["Test Accuracy"] = test_accuracy
df_accuracy_weights.round(3)

Unnamed: 0,Neighbors,Weights,Train Accuracy,Test Accuracy
0,1,uniform,1.0,0.906
1,2,uniform,0.961,0.885
2,3,uniform,0.953,0.906
3,4,uniform,0.945,0.906
4,5,uniform,0.945,0.906
5,6,uniform,0.945,0.896
6,7,uniform,0.953,0.885
7,8,uniform,0.945,0.875
8,9,uniform,0.953,0.906
9,10,uniform,0.921,0.875


In [87]:
# 데이터 형태 변환
df_accuracy_weights_pivot = df_accuracy_weights.pivot(index = "Neighbors",columns = "Weights",
                                                      values = ["Train Accuracy","Test Accuracy"])
df_accuracy_weights_pivot

Unnamed: 0_level_0,Train Accuracy,Train Accuracy,Test Accuracy,Test Accuracy
Weights,distance,uniform,distance,uniform
Neighbors,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,1.0,1.0,0.90625,0.90625
2,1.0,0.96063,0.90625,0.885417
3,1.0,0.952756,0.90625,0.90625
4,1.0,0.944882,0.90625,0.90625
5,1.0,0.944882,0.90625,0.90625
6,1.0,0.944882,0.90625,0.895833
7,1.0,0.952756,0.895833,0.885417
8,1.0,0.944882,0.90625,0.875
9,1.0,0.952756,0.90625,0.90625
10,1.0,0.92126,0.90625,0.875


In [88]:
level0 = df_accuracy_weights_pivot.columns.get_level_values(0)
level1 = df_accuracy_weights_pivot.columns.get_level_values(1)
df_accuracy_weights_pivot.columns = level0 + "_" + level1
df_accuracy_weights_pivot.head()

Unnamed: 0_level_0,Train Accuracy_distance,Train Accuracy_uniform,Test Accuracy_distance,Test Accuracy_uniform
Neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,1.0,0.90625,0.90625
2,1.0,0.96063,0.90625,0.885417
3,1.0,0.952756,0.90625,0.90625
4,1.0,0.944882,0.90625,0.90625
5,1.0,0.944882,0.90625,0.90625


In [89]:
sns.lineplot(data = df_accuracy_weights_pivot)

<Axes: xlabel='Mini Batch Size', ylabel='accuracy'>

In [91]:
train_accuracy = []
test_accuracy = []
para_n_neighbors = [i for i in range(1,11)]*3
para_metric = ["minkowski"]*10 + ["euclidean"]*10 + ["manhattan"]*10

for (v_n_neighbors,v_metric) in zip(para_n_neighbors, para_metric) :
    knn = KNeighborsClassifier(n_neighbors=v_n_neighbors, metric = v_metric,weights = "uniform")
    knn.fit(df_train_x,df_train_y)
    train_accuracy.append(knn.score(df_train_x, df_train_y))
    test_accuracy.append(knn.score(df_val_x,df_val_y))
df_accuracy_metric = pd.DataFrame()
df_accuracy_metric["Neighbors"] = para_n_neighbors
df_accuracy_metric["Metric"] = para_metric
df_accuracy_metric["Train Accuracy"] = train_accuracy
df_accuracy_metric["Test Accuracy"] = test_accuracy
df_accuracy_metric.round(3)

Unnamed: 0,Neighbors,Metric,Train Accuracy,Test Accuracy
0,1,minkowski,1.0,0.906
1,2,minkowski,0.961,0.885
2,3,minkowski,0.953,0.906
3,4,minkowski,0.945,0.906
4,5,minkowski,0.945,0.906
5,6,minkowski,0.945,0.896
6,7,minkowski,0.953,0.885
7,8,minkowski,0.945,0.875
8,9,minkowski,0.953,0.906
9,10,minkowski,0.921,0.875


In [92]:
df_accuracy_metric_pivot = df_accuracy_metric.pivot(index = "Neighbors",columns = "Metric",
                                                      values = ["Train Accuracy","Test Accuracy"])


level0 = df_accuracy_metric_pivot.columns.get_level_values(0)
level1 = df_accuracy_metric_pivot.columns.get_level_values(1)
df_accuracy_metric_pivot.columns = level0 + "_" + level1
sns.lineplot(data = df_accuracy_metric_pivot)

<Axes: xlabel='Mini Batch Size', ylabel='accuracy'>

In [95]:
knn_model = KNeighborsClassifier(n_neighbors=v_n_neighbors, weights = "uniform", metric = "manhattan")
knn_model.fit(df_train_x,df_train_y)

y_pred = knn_model.predict(df_test_x)
print("train data accuracy:",knn_model.score(df_train_x,df_train_y))
print("test data accuracy:",knn_model.score(df_test_x,df_test_y))
print("Confusion matrix:\n",confusion_matrix(df_test_y,y_pred))

train data accuracy: 0.9212598425196851
test data accuracy: 0.8958333333333334
Confusion matrix:
 [[25  2]
 [ 8 61]]


인공지능망은 데이터가 커지면 학습이 오래걸리며, 하이퍼파라미터 튜닝이 민감하지만, 복잡한 여러 요인들을 처리하기 위해서는 적합하다.  
KNN은 대략적인 값(범주)를 유추하는 특성을 갖고있기 때문에 일정한 정도의 사건을 예측하는 것이라면 KNN을 사용하는 게 나을 것 같다.