<a href="https://colab.research.google.com/github/sungjk1999/smart-factory-quality-classification/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Read train data**

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from google.colab import drive
drive.mount('/content/gdrive')


"""
PRODUCT_ID : 제품의 고유 ID

Y_Class : 제품 품질 상태(Target) 
    0 : 적정 기준 미달 (부적합) 
    1 : 적합 
    2 : 적정 기준 초과 (부적합)

Y_Quality : 제품 품질 관련 정량적 수치 

TIMESTAMP : 제품이 공정에 들어간 시각 

LINE : 제품이 들어간 공정 LINE 종류 ('T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305' 존재) 

PRODUCT_CODE : 제품의 CODE 번호 ('A_31', 'T_31', 'O_31' 존재) 

X_1 ~ X_2875 : 공정 과정에서 추출되어 비식별화된 변수
"""


dir_path = "/content/gdrive/MyDrive/Colab Notebooks/open"

train = pd.read_csv(f"{dir_path}/train.csv")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **Remove invalid columns**


In [93]:
# X_1 ~ X_2875 열의 std가 0 또는 nan인 경우를 제거
# test.csv 에서도 동일하게 적용해야함.

invalid = []

tmp = train.describe()
for col in tmp.columns:
    std = tmp.loc['std', col]
    if std == 0 or np.isnan(std):
        invalid.append(col)

# 아래 두 변수가 실제로 의미가 없는지 판단
PRINT = 0

if PRINT:
    plt.figure(figsize=(12, 5))

    # PRODUCT_ID
    ax = plt.subplot(1, 2, 1)
    ax.axes.xaxis.set_ticks([])
    plt.xlabel("PRODUCT_ID")
    plt.scatter(train["PRODUCT_ID"], y=train["Y_Quality"], c='red', alpha=0.2)

    # TIMESTAMP
    ax = plt.subplot(1, 2, 2)
    ax.axes.xaxis.set_ticks([])
    tmp = train.sort_values('TIMESTAMP')
    plt.xlabel("TIMESTAMP")
    plt.scatter(tmp["TIMESTAMP"], y=tmp["Y_Quality"], c='green', alpha=0.2)

# 실제로 무관하므로 제거
invalid += ["PRODUCT_ID", "TIMESTAMP"]
invalid += ['LINE']


train = train.drop(columns=invalid)
train

Unnamed: 0,Y_Class,Y_Quality,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,1,0.533433,A_31,,,,,,,,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
1,2,0.541819,A_31,,,,,,,,...,193.296552,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55
2,1,0.531267,A_31,,,,,,,,...,179.820690,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
3,2,0.537325,A_31,,,,,,,,...,181.920690,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78
4,1,0.531590,A_31,,,,,,,,...,196.393333,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,1,0.526546,T_31,2.0,95.0,10.0,50.0,10.0,52.0,468.9,...,,,,,,,,,,
594,0,0.524022,A_31,,,,,,,,...,180.810345,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49
595,0,0.521289,A_31,,,,,,,,...,176.486207,156.6,383.0,367.018868,352.0,,,,,
596,1,0.531375,O_31,40.0,94.0,11.0,45.0,10.0,31.0,505.8,...,,,,,,,,,,


# **How to classify?**



In [94]:
# Y_Quality 값을 통해서, Y_Class를 예측하므로 어느 정도에서 끊기는지 확인
# 정량 데이터이므로, 구간이 겹치지 않는 것을 확인할 수 있음.

over = train[train['Y_Class'] == 2]['Y_Quality']
normal = train[train['Y_Class'] == 1]['Y_Quality']
under = train[train['Y_Class'] == 0]['Y_Quality']

print(f"적정 기준 초과 여부 커트라인 구간: [{normal.max()}, {over.min()}]")
print(f"적정 기준 미달 여부 커트라인 구간: [{under.max()}, {normal.min()}]")

적정 기준 초과 여부 커트라인 구간: [0.534842857, 0.534950794]
적정 기준 미달 여부 커트라인 구간: [0.525066667, 0.525085714]


In [95]:
# 커트라인에 따른 분류 함수1 - 확정적인 경우

def classify(train, y_quality):
    over = train[train['Y_Class'] == 2]['Y_Quality']
    normal = train[train['Y_Class'] == 1]['Y_Quality']
    under = train[train['Y_Class'] == 0]['Y_Quality']
    
    if normal.min() <= y_quality <= normal.max():
        return 1
    elif y_quality <= under.max():
        return 0
    elif over.min() <= y_quality:
        return 2
    else:
        # 커트라인 확정이 안되는 경우
        return -1

In [96]:
# LINE에 따라서 어떻게 분포하는지 확인

# lines = set(train['LINE'])

# for line in lines:
#     print(f"LINE : {line}")
#     tmp = train[train['LINE'] == line]['Y_Class'].value_counts()
#     print(f"{tmp} \ntotal : {sum(tmp)}")
#     print('-' * 79)

In [97]:
# PRODUCT_CODE 에 따라서 어떻게 분포하는지 확인

# codes = set(train['PRODUCT_CODE'])

# for code in codes:
#     print(f"PRODUCT_CODE : {code}")
#     tmp = train[train['PRODUCT_CODE'] == code]['Y_Class'].value_counts()
#     print(f"{tmp} \ntotal : {sum(tmp)}")
#     print('-' * 79)

In [98]:
# PRODUCT_CODE 가 어디서 생산되는지 확인
# 생각해보니, 해당 라인에 대해서 고정적으로 생산되는건지 아니면 이번 데이터에서만 이런건지...
# 구분할 수가 없으므로 유의해서 처리해야할 것 같다.

# codes = set(train['PRODUCT_CODE'])

# for code in codes:
#     print(f"PRODUCT_CODE : {code}")
#     tmp = train[train['PRODUCT_CODE'] == code]['LINE'].value_counts()
#     print(f"{tmp} \ntotal : {sum(tmp)}")
#     print('-' * 79)

# **Missing values handling**
데이터프레임을 `PRODUCT_CODE`에 따라서 분리하고, 그렇게 분리된 데이터프레임에서 각 열의 결측값의 개수를 센다.  
**각 열의 값이 전부 결측값이면 해당 열은 학습하지 않는다.**  
**나머지 경우는 결측값을 보간**해주는데, 결측값이 많을수록 부정확하므로 분포를 살펴본다.

In [99]:
# Split dataframe by PRODUCT_CODE
class Product:
    def __init__(self, train, code):
        self.df = train[train['PRODUCT_CODE'] == code].drop(columns=['PRODUCT_CODE'])
        self.counter = self.nan_count()
    
    def nan_count(self):
        counter = {}
        for col in self.df.columns:
            nan = self.df[col].isnull().sum()  # 해당 열의 결측치 개수
            if nan == 0:
                continue
            if nan in counter:
                counter[nan].append(col)
            else:
                counter[nan] = [col]
        return counter
    
    def removed_lst(self, thr):
        """ 결측치가 {thr}번 이상 나타나는 열의 이름을 리스트로 반환 """
        lst = []
        for t in self.counter:
            if t >= thr:
                lst += self.counter[t]
        return lst


codes = set(train['PRODUCT_CODE'])
products = {}
processed = {}
removed_cols = {}

for code in codes:
    products[code] = Product(train, code)

for code in codes:
    print(f"{code} ({products[code].df.shape[0]}) : {sorted(products[code].counter)}")

removed_cols['T_31'] = products['T_31'].removed_lst(170)
removed_cols['O_31'] = products['O_31'].removed_lst(2)
removed_cols['A_31'] = products['A_31'].removed_lst(119)

for code in codes:
    processed[code] = products[code].df.drop(columns=removed_cols[code])

O_31 (6) : [3, 4, 6]
A_31 (249) : [1, 2, 16, 120, 129, 130, 131, 139, 145, 149, 155, 171, 179, 187, 190, 197, 207, 210, 216, 226, 243, 249]
T_31 (343) : [1, 171, 172, 256, 335, 343]


# **Modeling**

In [106]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier


test = pd.read_csv(f"{dir_path}/test.csv").drop(columns=invalid)

test_codes = set(test['PRODUCT_CODE'])
test_products = {}
test_processed = {}

for code in test_codes:
    test_products[code] = Product(test, code)

for code in test_codes:
    test_processed[code] = test_products[code].df.drop(columns=removed_cols[code]).fillna(0)

x_trains = {}
y_trains = {}

for code in codes:
    x_trains[code] = processed[code].drop(columns=['Y_Class', 'Y_Quality']).fillna(0)
    y_trains[code] = processed[code]['Y_Class']


classifiers = {}

for code in codes:
    classifiers[code] = RandomForestClassifier(random_state=37).fit(x_trains[code], y_trains[code])

res = []

for idx in range(test.shape[0]):
    curr_row = test.iloc[idx].to_frame().T.drop(columns=removed_cols[code]).fillna(0)
    code = curr_row['PRODUCT_CODE']
    curr_row = curr_row.drop(columns=['PRODUCT_CODE'])
    print(curr_row)
    #k = classifiers[code].predict(curr_row)
    #res.append(k)

res

   X_1   X_2   X_5   X_7   X_8   X_9   X_11   X_12   X_13  X_15  ...  X_924  \
0  2.0  94.0  10.0  51.0  10.0  52.0  469.6  474.4  461.0   0.0  ...  174.1   

   X_925   X_926  X_927  X_928       X_929  X_930  X_931      X_932  X_933  
0  160.1  154.37  150.0  317.0  302.909091  288.0   13.6  13.383871   13.2  

[1 rows x 536 columns]


TypeError: ignored

In [None]:
submit = pd.read_csv('./sample_submission.csv')

submit['Y_Class'] = res

submit.to_csv('./baseline_submission.csv', index=False)