In [23]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2, venn2_circles

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [24]:
def seed_everything(seed):
    """Seed the random number sources used in this notebook.

    Seeds Python's built-in ``random`` module and NumPy's global generator,
    and stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable. PyTorch is not seeded here.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    hash_seed = str(seed)

    # Python's own `random` module.
    random.seed(seed)
    # NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at
    # startup, so this may affect child processes only -- confirm.
    os.environ['PYTHONHASHSEED'] = hash_seed
    # NumPy's global random state.
    np.random.seed(seed)


# Fix the seed once for the whole notebook.
seed_everything(37)

In [None]:
# Raise the pandas display limits so frames of up to 500 rows / 500 columns
# are rendered without being truncated.
display_limit = 500
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, display_limit)

In [25]:
import matplotlib.pyplot as plt
import platform

from matplotlib import font_manager, rc

# Pick the matplotlib font family per operating system (presumably so that
# Korean labels render correctly -- the chosen families are Korean fonts).

# Font file consulted on Windows only.
path = "c:/Windows/Fonts/malgun.ttf"

system_name = platform.system()
if system_name == 'Windows':
    # Read the family name out of the font file itself.
    font_name = font_manager.FontProperties(fname=path).get_name()
    plt.rcParams["font.family"] = font_name
else:
    # Font families referenced by name on the other platforms; a system that
    # is not listed keeps whatever font matplotlib already uses.
    family_by_system = {'Darwin': 'AppleGothic', 'Linux': 'NanumGothic'}
    if system_name in family_by_system:
        plt.rcParams["font.family"] = family_by_system[system_name]

# Use the plain hyphen for negative numbers instead of the Unicode minus sign
# (presumably because the fonts above lack that glyph -- confirm).
plt.rcParams['axes.unicode_minus'] = False

## 데이터 로드

In [26]:
import pandas as pd

# Read the input table; the path is relative to the notebook's working directory.
to_df_path = './data/to_df.csv'
to_df = pd.read_csv(to_df_path)

In [27]:
# Preview the first five rows of the loaded table.
to_df.head(5)

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,...,X_905,X_906,X_907,X_908,X_909,X_910,X_912,X_931,X_932,X_933
0,TRAIN_022,0,0.517719,2022-06-14 8:53,T100304,T_31,2.0,102.0,11.0,45.0,...,0.0,17.28,17.277333,17.27,0.01,0.7,0.6,13.7,13.443333,13.2
1,TRAIN_023,0,0.51909,2022-06-14 9:01,T100304,T_31,2.0,102.0,11.0,45.0,...,,,,,,,,,,
2,TRAIN_025,1,0.529362,2022-06-19 9:11,T100304,T_31,2.0,97.0,11.0,45.0,...,1.0,17.28,17.276452,17.27,0.01,0.7,0.6,13.7,13.454839,13.2
3,TRAIN_026,1,0.531992,2022-06-19 9:20,T100306,T_31,2.0,95.0,10.0,54.0,...,1.0,17.28,17.276452,17.27,0.01,0.7,0.6,13.7,13.46129,13.3
4,TRAIN_029,1,0.532405,2022-06-19 23:31,T100304,T_31,2.0,100.0,11.0,45.0,...,1.0,17.28,17.277419,17.27,0.01,0.7,0.6,13.7,13.422581,13.2


## KNN 보간법

In [41]:
# Keep only the columns from position 6 onward as the frame to impute.
# NOTE(review): judging by the head() output, the six skipped columns look like
# PRODUCT_ID, Y_Class, Y_Quality, TIMESTAMP, LINE and PRODUCT_CODE -- confirm.
n_leading_cols = 6
df = to_df.iloc[:, n_leading_cols:]

In [42]:
from sklearn.impute import KNNImputer

# Fill the missing feature values with a 5-neighbour KNN imputer.
imputer = KNNImputer(n_neighbors=5)

# Wrap the imputed values in a DataFrame that carries the feature column
# labels of `df` again, so the result can be inspected and joined by name.
imp_to1 = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [33]:
# Put the six leading columns of `to_df` (the ones left out of imputation)
# back in front of the imputed feature columns, side by side.
# The imputed frame is `imp_to1`; the name `imp_to` used here before is not
# defined anywhere in the visible notebook, so the cell only worked with stale
# kernel state and failed with NameError on a fresh Restart & Run All.
# pd.concat(axis=1) aligns rows on the index; both frames are expected to
# share the same default integer index -- confirm if `to_df` is ever re-indexed.
imp_df = pd.concat([to_df.iloc[:, :6], imp_to1], axis=1)

In [34]:
# Show the combined frame (leading columns + imputed features) as the cell output.
imp_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,...,X_905,X_906,X_907,X_908,X_909,X_910,X_912,X_931,X_932,X_933
0,TRAIN_022,0,0.517719,2022-06-14 8:53,T100304,T_31,2.0,102.0,11.0,45.0,...,0.0,17.28,17.277333,17.27,0.01,0.7,0.6,13.70,13.443333,13.20
1,TRAIN_023,0,0.519090,2022-06-14 9:01,T100304,T_31,2.0,102.0,11.0,45.0,...,0.0,17.28,17.276886,17.27,0.01,0.7,0.6,13.68,13.438344,13.22
2,TRAIN_025,1,0.529362,2022-06-19 9:11,T100304,T_31,2.0,97.0,11.0,45.0,...,1.0,17.28,17.276452,17.27,0.01,0.7,0.6,13.70,13.454839,13.20
3,TRAIN_026,1,0.531992,2022-06-19 9:20,T100306,T_31,2.0,95.0,10.0,54.0,...,1.0,17.28,17.276452,17.27,0.01,0.7,0.6,13.70,13.461290,13.30
4,TRAIN_029,1,0.532405,2022-06-19 23:31,T100304,T_31,2.0,100.0,11.0,45.0,...,1.0,17.28,17.277419,17.27,0.01,0.7,0.6,13.70,13.422581,13.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,TRAIN_591,1,0.529308,2022-09-07 1:09,T100306,T_31,1.0,87.0,10.0,50.0,...,0.0,17.28,17.276129,17.27,0.01,0.7,0.6,13.60,13.400000,13.20
345,TRAIN_592,1,0.528349,2022-09-08 14:22,T100304,T_31,2.0,98.0,10.0,45.0,...,0.0,17.28,17.276774,17.27,0.01,0.7,0.6,13.60,13.361290,13.20
346,TRAIN_593,1,0.526546,2022-09-08 14:30,T100306,T_31,2.0,95.0,10.0,50.0,...,0.0,17.28,17.276452,17.27,0.01,0.7,0.6,13.60,13.351613,13.20
347,TRAIN_596,1,0.531375,2022-09-08 14:38,T100304,O_31,40.0,94.0,11.0,45.0,...,0.0,17.28,17.277000,17.27,0.01,0.7,0.6,13.60,13.406667,13.20


In [37]:
# Persist the combined frame as CSV; index=False keeps the row index out of the file.
imp_df.to_csv(path_or_buf='./data/imp_to_df10.csv', index=False)

In [38]:
# Summary statistics of the numeric columns after imputation, with the
# quartiles (pandas' default percentiles) spelled out explicitly.
imp_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Y_Class,Y_Quality,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_15,...,X_905,X_906,X_907,X_908,X_909,X_910,X_912,X_931,X_932,X_933
count,349.0,349.0,349.0,349.0,349.0,349.0,349.0,349.0,349.0,349.0,...,349.0,349.0,349.0,349.0,349.0,349.0,349.0,349.0,349.0,349.0
mean,1.011461,0.530325,2.409742,95.123209,10.39255,48.802292,10.048711,41.469914,497.050716,0.005731,...,0.057307,17.276017,17.266318,17.251777,0.024241,0.7,0.6,13.641203,13.409801,13.212092
std,0.415069,0.004735,5.895256,4.10764,0.489019,4.373824,0.215571,10.515032,17.26468,0.107058,...,0.232761,0.017663,0.023937,0.033481,0.030415,4.336086e-15,3.668996e-15,0.055809,0.051849,0.051678
min,0.0,0.502517,1.0,87.0,10.0,45.0,10.0,31.0,433.9,0.0,...,0.0,17.2,17.194516,17.19,0.01,0.7,0.6,13.5,13.254839,13.1
25%,1.0,0.528154,2.0,93.0,10.0,45.0,10.0,31.0,487.4,0.0,...,0.0,17.28,17.275484,17.27,0.01,0.7,0.6,13.6,13.374194,13.2
50%,1.0,0.530321,2.0,95.0,10.0,45.0,10.0,31.0,499.9,0.0,...,0.0,17.28,17.276774,17.27,0.01,0.7,0.6,13.6,13.4,13.2
75%,1.0,0.532446,2.0,98.0,11.0,51.0,10.0,52.0,509.6,0.0,...,0.0,17.28,17.277419,17.27,0.01,0.7,0.6,13.7,13.448387,13.2
max,2.0,0.551279,103.0,102.0,11.0,62.0,11.0,52.0,527.2,2.0,...,1.0,17.29,17.279,17.27,0.09,0.7,0.6,13.8,13.563333,13.3


In [17]:
# NOTE(review): disabled experiment -- grid search over k for a KNN classifier.
# The output saved under this cell shows tracebacks raised inside the scorer
# and a best score of nan, so the reported "best k = 1" is not meaningful.
# Things to fix before re-enabling:
#   * scoring='f1' -- Y_Class takes the values 0..2 in describe(), and the
#     plain 'f1' scorer presumably only supports binary targets; an averaged
#     variant such as 'f1_macro' is likely needed -- TODO confirm.
#   * X is built from `imp_to`, which is not defined in the visible notebook
#     (the imputed frame is `imp_to1`).
#   * y is built as a one-column DataFrame; estimators generally expect a
#     1-D target -- TODO confirm.
#   * the last print labels the score as accuracy although f1 is requested.

# from sklearn.model_selection import GridSearchCV, cross_val_score
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.datasets import load_iris
# import numpy as np

# X = pd.DataFrame(imp_to)
# y = pd.DataFrame(to_df.Y_Class)

# # Define the KNN model
# knn = KNeighborsClassifier()

# # Candidate k values to search
# param_grid = {'n_neighbors': np.arange(1, 21)}

# # Search for the best k with GridSearchCV
# grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='f1')
# grid_search.fit(X, y)

# # Print the best k
# print("최적의 k값:", grid_search.best_params_['n_neighbors'])

# # Print the score achieved by the best k
# print("최적의 k값에 대한 성능(정확도):", grid_search.best_score_)


  return self._fit(X, y)
Traceback (most recent call last):
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/utils/_response.py", line 85, in _get_response_values
    y_pred = prediction_method(X)
  File 

최적의 k값: 1
최적의 k값에 대한 성능(정확도): nan


  return self._fit(X, y)
Traceback (most recent call last):
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/Users/kimminyoung/opt/anaconda3/envs/danal/lib/python3.8/site-packages/sklearn/utils/_response.py", line 85, in _get_response_values
    y_pred = prediction_method(X)
  File 