In [101]:
import random
import os
import numpy as np

def seed_everything(seed):
    random.seed(seed)
    
    os.environ['PYTHONHASHSEED'] = str(seed)
    
    np.random.seed(seed)
    
# 시드 값 37로 난수 생성기 고정
seed_everything(37)

In [103]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error

def get_clf_eval(y_test, pred=None, pred_proba=None):
    
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average='macro')
    recall = recall_score(y_test, pred, average='macro')
    f1 = f1_score(y_test, pred, average='macro')

    print('오차 행렬')
    print(confusion)

    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))


def rmsle(y, pred):
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y - log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    return rmsle

def rmse(y,pred):
    return np.sqrt(mean_squared_error(y,pred))

def evaluate_regr(y,pred):
    rmsle_val = rmsle(y,pred)
    rmse_val = rmse(y,pred)
    mae_val = mean_absolute_error(y,pred)
    print('RMSLE: {0:.3f}, RMSE: {1:.3F}, MAE: {2:.3F}'.format(rmsle_val, rmse_val, mae_val))

In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


data_df = pd.read_csv("../../data/dacon_open/train.csv")
predict_df = pd.read_csv("../../data/dacon_open/test.csv")
submission_df = pd.read_csv("../../data/dacon_open/sample_submission.csv")

print(data_df['Y_Class'].value_counts())

total_cnt = data_df.Y_Class.count()
standard_cnt = data_df[data_df['Y_Class'] == 1].Y_Class.count()
under_cnt = data_df[data_df['Y_Class'] == 0].Y_Class.count()
over_cnt = data_df[data_df['Y_Class'] == 2].Y_Class.count()
print('standard 비율: {0:.2f}'.format((standard_cnt / total_cnt * 100)))
print('under 비율: {0:.2f}'.format((under_cnt / total_cnt * 100)))
print('over 비율: {0:.2f}'.format((over_cnt / total_cnt * 100)))

data_df.head()

Y_Class
1    407
2    103
0     88
Name: count, dtype: int64
standard 비율: 68.06
under 비율: 14.72
over 비율: 17.22


Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


## Imbalanced Data
> - standard 비율: 68.06
> - under 비율: 14.72
> - over 비율: 17.22


## feature가 너무 많다.

### 1. 불필요한 feature 제거
> - 모든 값이 NaN값인 columns 제거
> - 고유한 값의 수가 1인 columns 제거
> - TIMESTAMP: Month, Day, Hour, Minute으로 나누고 제거 => 그냥 제거

In [105]:
# data_df['TIMESTAMP'] = pd.to_datetime(data_df['TIMESTAMP'])
# data_df['Month'] = data_df['TIMESTAMP'].dt.month
# data_df['Day'] = data_df['TIMESTAMP'].dt.day
# data_df['Hour'] = data_df['TIMESTAMP'].dt.hour
# data_df['Minute'] = data_df['TIMESTAMP'].dt.minute


# predict_df['TIMESTAMP'] = pd.to_datetime(predict_df['TIMESTAMP'])
# predict_df['Month'] = predict_df['TIMESTAMP'].dt.month
# predict_df['Day'] = predict_df['TIMESTAMP'].dt.day
# predict_df['Hour'] = predict_df['TIMESTAMP'].dt.hour
# predict_df['Minute'] = predict_df['TIMESTAMP'].dt.minute


data_df.drop(["TIMESTAMP"], axis=1, inplace=True)
predict_df.drop(["TIMESTAMP"], axis=1, inplace=True)

In [106]:
all_nan_columns = data_df.columns[data_df.isna().all()].tolist()
print(f"모든 값이 NaN인 컬럼 개수: {len(all_nan_columns)}")

predict_all_nan_columns = predict_df.columns[predict_df.isna().all()].tolist()
print(f"모든 값이 NaN인 컬럼 개수: {len(predict_all_nan_columns)}")

data_df.drop(columns=all_nan_columns, inplace=True)
predict_df.drop(columns=all_nan_columns, inplace=True)

모든 값이 NaN인 컬럼 개수: 82
모든 값이 NaN인 컬럼 개수: 506


In [107]:
unique_one_columns = [col for col in data_df.columns if data_df[col].nunique() == 1]
print(len(unique_one_columns))

377


In [108]:
data_df.drop(columns=unique_one_columns, inplace=True, axis=1)
predict_df = predict_df.drop(columns=unique_one_columns, axis=1)
data_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,TRAIN_000,1,0.533433,T050304,A_31,,,,,,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
1,TRAIN_001,2,0.541819,T050307,A_31,,,,,,...,193.296552,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55
2,TRAIN_002,1,0.531267,T050304,A_31,,,,,,...,179.820690,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
3,TRAIN_003,2,0.537325,T050307,A_31,,,,,,...,181.920690,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78
4,TRAIN_004,1,0.531590,T050304,A_31,,,,,,...,196.393333,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,T100306,T_31,2.0,95.0,10.0,50.0,10.0,...,,,,,,,,,,
594,TRAIN_594,0,0.524022,T050304,A_31,,,,,,...,180.810345,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49
595,TRAIN_595,0,0.521289,T050304,A_31,,,,,,...,176.486207,156.6,383.0,367.018868,352.0,,,,,
596,TRAIN_596,1,0.531375,T100304,O_31,40.0,94.0,11.0,45.0,10.0,...,,,,,,,,,,


In [10]:
predict_df

Unnamed: 0,PRODUCT_ID,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,TEST_000,T100306,T_31,2.0,94.0,10.0,51.0,10.0,52.0,469.6,...,,,,,,,,,,
1,TEST_001,T100304,T_31,2.0,93.0,11.0,45.0,10.0,31.0,506.6,...,,,,,,,,,,
2,TEST_002,T100304,T_31,2.0,95.0,11.0,45.0,10.0,31.0,506.6,...,,,,,,,,,,
3,TEST_003,T010305,A_31,,,,,,,,...,191.450000,183.8,467.0,444.192308,423.0,,,,,
4,TEST_004,T010306,A_31,,,,,,,,...,193.082143,179.7,465.0,443.211539,423.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,T100306,T_31,2.0,91.0,10.0,51.0,10.0,52.0,502.5,...,,,,,,,,,,
306,TEST_306,T100304,T_31,2.0,96.0,11.0,45.0,10.0,31.0,513.7,...,,,,,,,,,,
307,TEST_307,T100306,T_31,2.0,91.0,10.0,50.0,10.0,52.0,502.8,...,,,,,,,,,,
308,TEST_308,T100306,T_31,2.0,95.0,10.0,51.0,10.0,52.0,503.2,...,,,,,,,,,,


#### 2881 column -> 2421 column

### LINE 그룹화
> - T050304
> - T050307

> - T010305
> - T010306

> - T100304
> - T100306

## Day2: T050~ 이상하다!!

> - what? = NaN 값이 어떤 것은 있고 어떤 것은 없다.
> - 다른 LINE?

# 0, 1 라인 데이터 분리

In [109]:
def get_line(line):
    line_mapping = {
        'T050304': 0,
        'T050307': 1,
        'T010305': 2,
        'T010306': 3,
        'T100304': 4,
        'T100306': 5
    }
    return line_mapping.get(line, 9999)

data_df['LINE'] = data_df['LINE'].apply(lambda x: get_line(x))
predict_df['LINE'] = predict_df['LINE'].apply(lambda x: get_line(x))

In [110]:
for line_number in data_df['LINE'].unique():
    df_name = f'line{line_number}_df'
    globals()[df_name] = data_df[data_df['LINE'] == line_number].drop(columns=['PRODUCT_ID'])

line0_df

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,1,0.533433,0,A_31,,,,,,,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
2,1,0.531267,0,A_31,,,,,,,...,179.820690,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
4,1,0.531590,0,A_31,,,,,,,...,196.393333,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97
6,1,0.533665,0,A_31,,,,,,,...,188.180000,176.9,384.0,368.425926,353.0,32.50,41.42,38.36,30.83,76.93
8,1,0.531821,0,A_31,,,,,,,...,182.296667,173.8,393.0,373.500000,353.0,42.25,43.17,55.60,33.26,78.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,0,0.523465,0,A_31,,,,,,,...,183.833333,176.0,384.0,367.333333,352.0,,,,,
582,0,0.522233,0,A_31,,,,,,,...,198.366667,174.0,384.0,367.037037,352.0,50.88,53.23,52.44,56.28,66.83
583,0,0.522340,0,A_31,,,,,,,...,191.993103,181.6,394.0,371.943396,353.0,51.71,59.64,54.61,57.05,63.18
594,0,0.524022,0,A_31,,,,,,,...,180.810345,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49


In [112]:
group_counts = {}

for line_number in range(6):
    df = globals()[f'line{line_number}_df']
    grouped = df.groupby(df.isnull().apply(tuple, axis=1))
    cnt = 0
    for name, group in grouped:
        cnt += 1
        globals()[f'line{line_number}_group{cnt}_df'] = group
    group_counts[line_number] = cnt
    

for line_number, count in group_counts.items():
    print(f'Line {line_number}: {count}')

Line 0: 31
Line 1: 17
Line 2: 3
Line 3: 3
Line 4: 5
Line 5: 3


In [113]:
train_line01_df = data_df
test_line01_df = predict_df


lines_to_include = [0, 1]
train01_df = train_line01_df[train_line01_df['LINE'].isin(lines_to_include)]
test01_df = test_line01_df[test_line01_df['LINE'].isin(lines_to_include)]


train01_Q = train01_df['Y_Quality']
train01_C = train01_df['Y_Class']


train01_df.drop(['LINE', 'PRODUCT_CODE', 'PRODUCT_ID', 'Y_Quality', 'Y_Class'], inplace=True, axis=1)
test01_df.drop(['LINE', 'PRODUCT_CODE', 'PRODUCT_ID'], inplace=True, axis=1)



all_nan_columns = train01_df.columns[train01_df.isna().all()].tolist()
print(f"모든 값이 NaN인 컬럼 개수: {len(all_nan_columns)}")

predict_all_nan_columns = test01_df.columns[test01_df.isna().all()].tolist()
print(f"모든 값이 NaN인 컬럼 개수: {len(predict_all_nan_columns)}")

train01_df.drop(columns=all_nan_columns, inplace=True)
test01_df.drop(columns=all_nan_columns, inplace=True)


train01_df.fillna(0, inplace=True)
test01_df.fillna(0, inplace=True)

train01_df

모든 값이 NaN인 컬럼 개수: 644
모든 값이 NaN인 컬럼 개수: 1136


Unnamed: 0,X_128,X_129,X_130,X_131,X_132,X_133,X_134,X_136,X_137,X_138,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,7813.0,7813.0,0.0,0.0,0.19,0.20,0.19,228.0,228.0,225.0,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
1,0.0,0.0,19854.0,19854.0,0.20,0.21,0.20,413.0,414.0,414.0,...,193.296552,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55
2,7815.0,7815.0,0.0,0.0,0.19,0.20,0.19,228.0,228.0,225.0,...,179.820690,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
3,0.0,0.0,19856.0,19856.0,0.20,0.21,0.20,414.0,414.0,414.0,...,181.920690,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78
4,7817.0,7817.0,0.0,0.0,0.19,0.20,0.18,228.0,228.0,225.0,...,196.393333,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,11864.0,11864.0,0.0,0.0,0.19,0.20,0.19,230.0,230.0,230.0,...,183.833333,176.0,384.0,367.333333,352.0,0.00,0.00,0.00,0.00,0.00
582,11898.0,11898.0,0.0,0.0,0.18,0.20,0.18,230.0,230.0,230.0,...,198.366667,174.0,384.0,367.037037,352.0,50.88,53.23,52.44,56.28,66.83
583,11920.0,11920.0,0.0,0.0,0.19,0.20,0.19,230.0,230.0,230.0,...,191.993103,181.6,394.0,371.943396,353.0,51.71,59.64,54.61,57.05,63.18
594,14810.0,14810.0,0.0,0.0,0.19,0.20,0.19,304.0,304.0,304.0,...,180.810345,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49


In [15]:
test01_df

Unnamed: 0,X_128,X_129,X_130,X_131,X_132,X_133,X_134,X_136,X_137,X_138,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
7,18031.0,18031.0,0.0,0.0,0.19,0.2,0.19,354.0,354.0,354.0,...,190.8,168.3,384.0,369.462963,354.0,57.74,52.51,54.45,57.99,63.16
8,18064.0,18064.0,0.0,0.0,0.19,0.2,0.19,355.0,354.0,355.0,...,182.273333,169.8,384.0,370.259259,353.0,53.18,56.02,41.79,47.94,60.72
13,0.0,0.0,3370.0,3370.0,0.21,0.22,0.21,230.0,230.0,231.0,...,151.166667,138.3,384.0,367.462963,353.0,51.89,55.25,50.14,57.08,65.39
14,295.0,295.0,0.0,0.0,0.19,0.21,0.19,60.0,60.0,60.0,...,151.541379,139.5,384.0,369.377359,353.0,49.77,54.93,49.85,48.62,63.18
35,8632.0,8632.0,0.0,0.0,0.2,0.21,0.2,249.0,249.0,249.0,...,130.403448,103.5,384.0,369.660377,352.0,55.22,58.14,48.93,56.16,64.25
36,0.0,0.0,4394.0,4394.0,0.21,0.22,0.21,99.0,99.0,99.0,...,142.72069,129.7,384.0,368.207547,354.0,0.0,0.0,0.0,0.0,0.0
41,10712.0,10712.0,0.0,0.0,0.2,0.21,0.2,293.0,293.0,293.0,...,176.803333,165.6,383.0,368.018519,352.0,49.67,45.0,49.81,55.19,67.17
42,10729.0,10729.0,0.0,0.0,0.2,0.21,0.19,293.0,293.0,293.0,...,183.486207,171.8,384.0,368.45283,353.0,51.3,47.66,60.0,53.75,62.58
52,12261.0,12261.0,0.0,0.0,0.19,0.21,0.19,330.0,330.0,330.0,...,173.882759,161.0,384.0,368.320755,353.0,54.06,51.18,52.17,57.75,63.94
53,12283.0,12283.0,0.0,0.0,0.2,0.21,0.2,330.0,330.0,330.0,...,187.072414,175.1,384.0,361.226415,342.0,54.04,51.25,54.98,57.22,63.33


### 0, 1 라인 분리(완)

> - 데이터의 분포: 일부 모델은 입력 데이터의 분포에 민감합니다. 스케일링이 데이터의 분포를 변경하여 모델의 성능에 영향
> - 특성 간 상관관계: 스케일링은 특성 간의 상대적 중요성을 변경 / 범주형 데이터를 수치형 데이터와 함께 스케일링할 때 이런 문제가 발생

In [114]:
data_df.drop(['PRODUCT_ID'], inplace=True, axis=1)
predict_df.drop(['PRODUCT_ID'], inplace=True, axis=1)

data_df.fillna(0, inplace=True)
predict_df.fillna(0, inplace=True)

In [115]:
from sklearn.preprocessing import LabelEncoder

features = ['PRODUCT_CODE']
le = LabelEncoder()
for i in features:
    encoder = LabelEncoder()
    encoder.fit(data_df[i])
    data_df[i] = encoder.transform(data_df[i])
    predict_df[i] = encoder.transform(predict_df[i])

data_df.head()

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,1,0.533433,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
1,2,0.541819,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.296552,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55
2,1,0.531267,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,179.82069,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
3,2,0.537325,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,181.92069,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78
4,1,0.53159,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,196.393333,182.6,383.0,367.351852,352.0,38.7,41.89,46.93,33.09,76.97


In [57]:
predict_df.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,5,2,2.0,94.0,10.0,51.0,10.0,52.0,469.6,474.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,2,2.0,93.0,11.0,45.0,10.0,31.0,506.6,511.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,2,2.0,95.0,11.0,45.0,10.0,31.0,506.6,511.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,191.45,183.8,467.0,444.192308,423.0,0.0,0.0,0.0,0.0,0.0
4,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.082143,179.7,465.0,443.211539,423.0,0.0,0.0,0.0,0.0,0.0


In [116]:
train_Q = data_df['Y_Quality']
train_Y = data_df['Y_Class']

X_train = data_df.drop(columns=['Y_Quality', "Y_Class"])

In [61]:
X_train.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.296552,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55
2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,179.82069,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
3,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,181.92069,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,196.393333,182.6,383.0,367.351852,352.0,38.7,41.89,46.93,33.09,76.97


In [62]:
predict_df.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,5,2,2.0,94.0,10.0,51.0,10.0,52.0,469.6,474.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,2,2.0,93.0,11.0,45.0,10.0,31.0,506.6,511.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,2,2.0,95.0,11.0,45.0,10.0,31.0,506.6,511.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,191.45,183.8,467.0,444.192308,423.0,0.0,0.0,0.0,0.0,0.0
4,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.082143,179.7,465.0,443.211539,423.0,0.0,0.0,0.0,0.0,0.0


## Y_Quality를 Y_Class를 기준으로 min, max값 찾기

In [117]:
minmax = data_df[['Y_Class', 'Y_Quality']].groupby('Y_Class').agg(['min', 'max'])
minmax

Unnamed: 0_level_0,Y_Quality,Y_Quality
Unnamed: 0_level_1,min,max
Y_Class,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.500856,0.525067
1,0.525086,0.534843
2,0.534951,0.578841


## train, test 분리

In [118]:
from sklearn.model_selection import train_test_split

# train_Y
# train_Q

train_data, test_data, y_train, y_test = train_test_split(X_train, train_Q, test_size=0.2, random_state=0)

In [135]:
from catboost import CatBoostRegressor

cat_reg = CatBoostRegressor(iterations=500, learning_rate=0.05, verbose=0)
cat_reg.fit(X_train, train_Q, early_stopping_rounds=50, cat_features=['PRODUCT_CODE', 'LINE'])

train_pred = cat_reg.predict(X_train)
evaluate_regr(train_Q, train_pred)



# cat_reg.fit(train_data, y_train, early_stopping_rounds=50, cat_features=['PRODUCT_CODE', 'LINE'])

# train_pred = cat_reg.predict(train_data)
# evaluate_regr(y_train, train_pred)


# test_pred = cat_reg.predict(test_data)
# evaluate_regr(y_test, test_pred)


predict_quality = cat_reg.predict(predict_df)

RMSLE: 0.001, RMSE: 0.001, MAE: 0.001


In [120]:
predict_quality[:10]

array([0.53087408, 0.53779313, 0.53656061, 0.52430552, 0.53034669,
       0.53126389, 0.53049172, 0.52525024, 0.52401561, 0.53164491])

In [121]:
minmax = data_df[['Y_Class','Y_Quality']].groupby('Y_Class').agg(['min', 'max'])

In [122]:
minmax

Unnamed: 0_level_0,Y_Quality,Y_Quality
Unnamed: 0_level_1,min,max
Y_Class,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.500856,0.525067
1,0.525086,0.534843
2,0.534951,0.578841


In [123]:
minmax[('Y_Quality', 'max')][0]

0.525066667

In [124]:
minmax[('Y_Quality', 'min')][2]

0.534950794

In [125]:
pred_Q = []

for i in predict_quality:
    if i < minmax[('Y_Quality', 'max')][0]:
        pred_Q.append(0)
    elif i < minmax[('Y_Quality', 'min')][2]:
        pred_Q.append(1)
    else:
        pred_Q.append(2)

pred_Q[:20]

[1, 2, 2, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]

In [126]:
len(pred_Q)

310

## 분리한 LINE 예측

In [30]:
len(test01_df)

39

In [31]:
len(train01_df)

120

In [32]:
len(train01_Q)

120

In [127]:
from catboost import CatBoostClassifier

clf = CatBoostClassifier(iterations=500, verbose=0, learning_rate=0.05)
clf.fit(train01_df, train01_C)
test01_quality = clf.predict(test01_df)

# cat_reg =  CatBoostRegressor(iterations=500, learning_rate=0.05, verbose=0)
# cat_reg.fit(train01_df, train01_Q, early_stopping_rounds=50)
# test01_quality=cat_reg.predict(test01_df)

In [128]:
test01_quality

array([[0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [129]:
pred01_Q = []

for i in test01_quality:
    if i <= minmax[('Y_Quality', 'max')][0]:
        pred01_Q.append(0)
    elif i <= minmax[('Y_Quality', 'min')][2]:
        pred01_Q.append(1)
    else:
        pred01_Q.append(2)

pred01_Q[:20]

[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0]

In [130]:
final = predict_df[(predict_df['LINE']==0)|(predict_df['LINE']==1)].index
total = pred_Q

In [91]:
final

Index([  7,   8,  13,  14,  35,  36,  41,  42,  52,  53,  54,  61,  62,  63,
        64,  65,  66, 130, 131, 132, 248, 249, 250, 251, 252, 253, 254, 255,
       260, 263, 280, 281, 282, 283, 284, 285, 286, 292, 293],
      dtype='int64')

In [92]:
total

[1,
 2,
 2,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 2,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 0,
 1,
 0,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,


In [131]:
for i in range(len(final)):
  total[final[i]]=pred01_Q[i]

In [132]:
len(total)

310

In [133]:
submit = submission_df
submit['Y_Class'] = total
submit
submit.to_csv('./sample_submission_sc.csv', index=False)

In [134]:
submit

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,2
2,TEST_002,2
3,TEST_003,0
4,TEST_004,1
...,...,...
305,TEST_305,1
306,TEST_306,1
307,TEST_307,1
308,TEST_308,1
