<a href="https://colab.research.google.com/github/sungjk1999/smart-factory-quality-classification/blob/main/xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from xgboost import XGBRegressor

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive so the files
# under MyDrive can be read and written with ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

"""
PRODUCT_ID : 제품의 고유 ID

Y_Class : 제품 품질 상태(Target) 
    0 : 적정 기준 미달 (부적합) 
    1 : 적합 
    2 : 적정 기준 초과 (부적합)

Y_Quality : 제품 품질 관련 정량적 수치 

TIMESTAMP : 제품이 공정에 들어간 시각 

LINE : 제품이 들어간 공정 LINE 종류 ('T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305' 존재) 

PRODUCT_CODE : 제품의 CODE 번호 ('A_31', 'T_31', 'O_31' 존재) 

X_1 ~ X_2875 : 공정 과정에서 추출되어 비식별화된 변수
"""

# Folder on the mounted Drive that holds the competition CSV files.
dir_path = "/content/drive/MyDrive/Colab Notebooks/open"

# Read the training and test tables (train.csv / test.csv) from that folder.
train_df, test_df = (
    pd.read_csv(f"{dir_path}/{split}.csv") for split in ("train", "test")
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Targets: the 3-way quality class and the continuous quality score.
train_y = train_df['Y_Class']
train_y_quality = train_df['Y_Quality']

# Model inputs: drop the identifier/timestamp columns, and for the training
# frame also drop both target columns.
train = train_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)
test = test_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [None]:
# Y_Quality values of the training rows, split by their Y_Class label
# (0 = under, 1 = normal, 2 = over).
under = train_df.loc[train_df['Y_Class'] == 0, 'Y_Quality']
normal = train_df.loc[train_df['Y_Class'] == 1, 'Y_Quality']
over = train_df.loc[train_df['Y_Class'] == 2, 'Y_Quality']


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the LINE column.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, so try the new spelling first and fall back for older installs.
try:
    ohe_line = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe_line = OneHotEncoder(sparse=False)

# Fit the encoder on train only; test is merely transformed with the
# already-fitted encoder so both frames get the same columns.
train_line = ohe_line.fit_transform(train[['LINE']])
line_columns = ['line_' + col for col in ohe_line.categories_[0]]
# Reuse the source frame's index so concat(axis=1) pairs rows up exactly.
train_line = pd.DataFrame(train_line, columns=line_columns, index=train.index)
train = pd.concat((train.drop(columns=['LINE']), train_line), axis=1)

test_line = ohe_line.transform(test[['LINE']])
test_line = pd.DataFrame(test_line, columns=line_columns, index=test.index)
test = pd.concat((test.drop(columns=['LINE']), test_line), axis=1)

In [None]:
# One-hot encode the PRODUCT_CODE column.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, so try the new spelling first and fall back for older installs.
try:
    ohe_pc = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe_pc = OneHotEncoder(sparse=False)

# Fit the encoder on train only; test is merely transformed with the
# already-fitted encoder so both frames get the same columns.
train_pc = ohe_pc.fit_transform(train[['PRODUCT_CODE']])
pc_columns = ['pc_' + col for col in ohe_pc.categories_[0]]
# Reuse the source frame's index so concat(axis=1) pairs rows up exactly.
train_pc = pd.DataFrame(train_pc, columns=pc_columns, index=train.index)
train = pd.concat((train.drop(columns=['PRODUCT_CODE']), train_pc), axis=1)

test_pc = ohe_pc.transform(test[['PRODUCT_CODE']])
test_pc = pd.DataFrame(test_pc, columns=pc_columns, index=test.index)
test = pd.concat((test.drop(columns=['PRODUCT_CODE']), test_pc), axis=1)

In [None]:
# Columns that are NaN in every training row; they are removed from both
# the training and the test frame.
invalid = train.columns[train.isna().all()].tolist()

train = train.drop(invalid, axis=1)
test = test.drop(invalid, axis=1)

In [None]:
# from sklearn.preprocessing import RobustScaler

# scaler = RobustScaler()
# scaler.fit(train)
# train = scaler.transform(train)
# test = scaler.transform(test)

In [None]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=310)
# train = pca.fit_transform(train)
# test = pca.fit_transform(test)

# print(train.shape)
# print(test.shape)

In [None]:
# Despite the variable name, this is an XGBoost regressor. It is trained to
# predict the continuous Y_Quality score from the prepared feature frame.
RF = XGBRegressor(
    n_estimators=5000,
    max_depth=5,
    learning_rate=0.2,
    objective='reg:squarederror',
)
RF.fit(train, train_y_quality)

print('Done.')

Done.


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier over the single Y_Quality feature: it maps a
# quality value to the Y_Class of the closest training rows.
kn = KNeighborsClassifier(n_neighbors=5)
# scikit-learn expects a 2-D feature matrix, hence the (n_samples, 1) reshape.
# np.asarray accepts both the original Series and an already-reshaped array,
# so re-running this cell no longer fails the way `.values` did on an ndarray.
train_y_quality = np.asarray(train_y_quality).reshape(-1, 1)
kn.fit(train_y_quality, train_y)

KNeighborsClassifier()

In [None]:
# Continuous Y_Quality predictions for the test set.
quality_preds = RF.predict(test)

print(quality_preds)

# Class boundaries observed in the training data, computed once up front
# instead of on every row.
normal_lo, normal_hi = normal.min(), normal.max()
under_hi = under.max()
over_lo = over.min()

# Compare in float64 so the vectorised checks behave like scalar comparisons
# against the float64 thresholds.
quality64 = np.asarray(quality_preds, dtype=np.float64)

# np.select takes the first matching condition, which keeps the priority
# order: strictly inside the normal range -> 1, at or below the largest
# "under" value -> 0, at or above the smallest "over" value -> 2.
# Integer choices keep the labels as ints rather than 0.0 / 1.0 / 2.0.
preds = np.select(
    [
        (normal_lo < quality64) & (quality64 < normal_hi),
        quality64 <= under_hi,
        quality64 >= over_lo,
    ],
    [1, 0, 2],
    default=-1,
)

# Predictions that fall into none of the ranges are resolved by the KNN
# classifier fitted on (Y_Quality -> Y_Class), in a single batched call.
undecided = preds == -1
if undecided.any():
    preds[undecided] = kn.predict(quality64[undecided].reshape(-1, 1))

print('Done.')

[0.5316567  0.5360644  0.5365064  0.5231423  0.53023833 0.53040725
 0.5292166  0.5247611  0.52345717 0.5336585  0.52974033 0.53169334
 0.53704333 0.5279994  0.5202172  0.5224068  0.53263956 0.526737
 0.531074   0.5329223  0.52632046 0.534649   0.52503043 0.52759194
 0.52791864 0.5267364  0.5317897  0.52463424 0.52452844 0.5336094
 0.5238265  0.53274155 0.5253724  0.5294612  0.5322701  0.5202683
 0.5283803  0.51886654 0.5466265  0.5243672  0.5291796  0.52713823
 0.52397126 0.5299869  0.5293461  0.5337224  0.5335963  0.51459265
 0.5305283  0.5187588  0.53375435 0.5191189  0.52509123 0.52370167
 0.5237742  0.5175658  0.52994734 0.53005224 0.5170887  0.51873857
 0.5190728  0.52515656 0.528699   0.5319354  0.5235532  0.5214931
 0.52305645 0.5312863  0.5263067  0.52462196 0.52732366 0.5133465
 0.53050315 0.53131837 0.53358597 0.53221893 0.5334187  0.52486205
 0.54425114 0.52903533 0.5326418  0.53475416 0.53989065 0.53855187
 0.53777593 0.5343482  0.51648116 0.5268127  0.5158784  0.5245308
 0

In [None]:
# Fill the sample submission with the predicted classes and save it next to
# the input data.
submit = pd.read_csv(f'{dir_path}/sample_submission.csv')
# Store the labels as integers so the CSV contains 0/1/2 rather than
# 0.0/1.0/2.0 when `preds` is a floating-point array.
submit['Y_Class'] = np.asarray(preds).astype(int)
submit.to_csv(f'{dir_path}/xgb_submission.csv', index=False)