In [109]:
import random
import os
import numpy as np

def seed_everything(seed):
    random.seed(seed)
    
    os.environ['PYTHONHASHSEED'] = str(seed)
    
    np.random.seed(seed)
    
# 시드 값 37로 난수 생성기 고정
seed_everything(37)

In [110]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error

def get_clf_eval(y_test, pred=None, pred_proba=None):
    
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average='macro')
    recall = recall_score(y_test, pred, average='macro')
    f1 = f1_score(y_test, pred, average='macro')

    print('오차 행렬')
    print(confusion)

    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))


def rmsle(y, pred):
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y - log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    return rmsle

def rmse(y,pred):
    return np.sqrt(mean_squared_error(y,pred))

def evaluate_regr(y,pred):
    rmsle_val = rmsle(y,pred)
    rmse_val = rmse(y,pred)
    mae_val = mean_absolute_error(y,pred)
    print('RMSLE: {0:.3f}, RMSE: {1:.3F}, MAE: {2:.3F}'.format(rmsle_val, rmse_val, mae_val))

In [204]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


data_df = pd.read_csv("../../data/dacon_open/train.csv")
predict_df = pd.read_csv("../../data/dacon_open/test.csv")
submission_df = pd.read_csv("../../data/dacon_open/sample_submission.csv")

print(data_df['Y_Class'].value_counts())

total_cnt = data_df.Y_Class.count()
standard_cnt = data_df[data_df['Y_Class'] == 1].Y_Class.count()
under_cnt = data_df[data_df['Y_Class'] == 0].Y_Class.count()
over_cnt = data_df[data_df['Y_Class'] == 2].Y_Class.count()
print('standard 비율: {0:.2f}'.format((standard_cnt / total_cnt * 100)))
print('under 비율: {0:.2f}'.format((under_cnt / total_cnt * 100)))
print('over 비율: {0:.2f}'.format((over_cnt / total_cnt * 100)))
print("---------------------------------------------------------")
print('2023: 데이터 세트 Null 값 갯수 ',data_df.isnull().sum().sum())

data_df.head()

Y_Class
1    407
2    103
0     88
Name: count, dtype: int64
standard 비율: 68.06
under 비율: 14.72
over 비율: 17.22
---------------------------------------------------------
2023: 데이터 세트 Null 값 갯수  1172834


Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


## Imbalanced Data
> - standard 비율: 68.06
> - under 비율: 14.72
> - over 비율: 17.22


## feature가 너무 많다.

### 1. 불필요한 feature 제거
> - 모든 값이 NaN값인 columns 제거
> - 고유한 값의 수가 1인 columns 제거
> - TIMESTAMP: Month, Day, Hour, Minute으로 나누고 제거 => 그냥 제거

In [205]:
# data_df['TIMESTAMP'] = pd.to_datetime(data_df['TIMESTAMP'])
# data_df['Month'] = data_df['TIMESTAMP'].dt.month
# data_df['Day'] = data_df['TIMESTAMP'].dt.day
# data_df['Hour'] = data_df['TIMESTAMP'].dt.hour
# data_df['Minute'] = data_df['TIMESTAMP'].dt.minute


# predict_df['TIMESTAMP'] = pd.to_datetime(predict_df['TIMESTAMP'])
# predict_df['Month'] = predict_df['TIMESTAMP'].dt.month
# predict_df['Day'] = predict_df['TIMESTAMP'].dt.day
# predict_df['Hour'] = predict_df['TIMESTAMP'].dt.hour
# predict_df['Minute'] = predict_df['TIMESTAMP'].dt.minute


data_df.drop(["TIMESTAMP"], axis=1, inplace=True)
predict_df.drop(["TIMESTAMP"], axis=1, inplace=True)

In [206]:
all_nan_columns = data_df.columns[data_df.isna().all()].tolist()
print(f"모든 값이 NaN인 컬럼 개수: {len(all_nan_columns)}")

predict_all_nan_columns = predict_df.columns[predict_df.isna().all()].tolist()
print(f"모든 값이 NaN인 컬럼 개수: {len(predict_all_nan_columns)}")

data_df.drop(columns=all_nan_columns, inplace=True)
predict_df.drop(columns=all_nan_columns, inplace=True)

모든 값이 NaN인 컬럼 개수: 82
모든 값이 NaN인 컬럼 개수: 506


In [207]:
unique_one_columns = [col for col in data_df.columns if data_df[col].nunique() == 1]
print(len(unique_one_columns))

377


In [208]:
data_df.drop(columns=unique_one_columns, inplace=True, axis=1)
predict_df = predict_df.drop(columns=unique_one_columns, axis=1)
data_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,TRAIN_000,1,0.533433,T050304,A_31,,,,,,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
1,TRAIN_001,2,0.541819,T050307,A_31,,,,,,...,193.296552,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55
2,TRAIN_002,1,0.531267,T050304,A_31,,,,,,...,179.820690,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
3,TRAIN_003,2,0.537325,T050307,A_31,,,,,,...,181.920690,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78
4,TRAIN_004,1,0.531590,T050304,A_31,,,,,,...,196.393333,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,T100306,T_31,2.0,95.0,10.0,50.0,10.0,...,,,,,,,,,,
594,TRAIN_594,0,0.524022,T050304,A_31,,,,,,...,180.810345,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49
595,TRAIN_595,0,0.521289,T050304,A_31,,,,,,...,176.486207,156.6,383.0,367.018868,352.0,,,,,
596,TRAIN_596,1,0.531375,T100304,O_31,40.0,94.0,11.0,45.0,10.0,...,,,,,,,,,,


In [189]:
predict_df

Unnamed: 0,PRODUCT_ID,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,TEST_000,T100306,T_31,2.0,94.0,10.0,51.0,10.0,52.0,469.6,...,,,,,,,,,,
1,TEST_001,T100304,T_31,2.0,93.0,11.0,45.0,10.0,31.0,506.6,...,,,,,,,,,,
2,TEST_002,T100304,T_31,2.0,95.0,11.0,45.0,10.0,31.0,506.6,...,,,,,,,,,,
3,TEST_003,T010305,A_31,,,,,,,,...,191.450000,183.8,467.0,444.192308,423.0,,,,,
4,TEST_004,T010306,A_31,,,,,,,,...,193.082143,179.7,465.0,443.211539,423.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,T100306,T_31,2.0,91.0,10.0,51.0,10.0,52.0,502.5,...,,,,,,,,,,
306,TEST_306,T100304,T_31,2.0,96.0,11.0,45.0,10.0,31.0,513.7,...,,,,,,,,,,
307,TEST_307,T100306,T_31,2.0,91.0,10.0,50.0,10.0,52.0,502.8,...,,,,,,,,,,
308,TEST_308,T100306,T_31,2.0,95.0,10.0,51.0,10.0,52.0,503.2,...,,,,,,,,,,


#### 2881 column -> 2421 column

### LINE 그룹화
> - T050304
> - T050307

> - T010305
> - T010306

> - T100304
> - T100306

## Day2: T050~ 이상하다!!

> - what? = NaN 값이 어떤 것은 있고 어떤 것은 없다.
> - 다른 LINE?

# 0, 1 라인 데이터 분리

In [209]:
def get_line(line):
    line_mapping = {
        'T050304': 0,
        'T050307': 1,
        'T010305': 2,
        'T010306': 3,
        'T100304': 4,
        'T100306': 5
    }
    return line_mapping.get(line, 9999)

data_df['LINE'] = data_df['LINE'].apply(lambda x: get_line(x))
predict_df['LINE'] = predict_df['LINE'].apply(lambda x: get_line(x))

In [210]:
for line_number in data_df['LINE'].unique():
    df_name = f'line{line_number}_df'
    globals()[df_name] = data_df[data_df['LINE'] == line_number].drop(columns=['PRODUCT_ID'])

line0_df

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,1,0.533433,0,A_31,,,,,,,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
2,1,0.531267,0,A_31,,,,,,,...,179.820690,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
4,1,0.531590,0,A_31,,,,,,,...,196.393333,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97
6,1,0.533665,0,A_31,,,,,,,...,188.180000,176.9,384.0,368.425926,353.0,32.50,41.42,38.36,30.83,76.93
8,1,0.531821,0,A_31,,,,,,,...,182.296667,173.8,393.0,373.500000,353.0,42.25,43.17,55.60,33.26,78.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,0,0.523465,0,A_31,,,,,,,...,183.833333,176.0,384.0,367.333333,352.0,,,,,
582,0,0.522233,0,A_31,,,,,,,...,198.366667,174.0,384.0,367.037037,352.0,50.88,53.23,52.44,56.28,66.83
583,0,0.522340,0,A_31,,,,,,,...,191.993103,181.6,394.0,371.943396,353.0,51.71,59.64,54.61,57.05,63.18
594,0,0.524022,0,A_31,,,,,,,...,180.810345,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49


In [211]:
group_counts = {}

for line_number in range(6):
    df = globals()[f'line{line_number}_df']
    grouped = df.groupby(df.isnull().apply(tuple, axis=1))
    cnt = 0
    for name, group in grouped:
        cnt += 1
        globals()[f'line{line_number}_group{cnt}_df'] = group
    group_counts[line_number] = cnt
    

for line_number, count in group_counts.items():
    print(f'Line {line_number}: {count}')

Line 0: 31
Line 1: 17
Line 2: 3
Line 3: 3
Line 4: 5
Line 5: 3


In [212]:
train_line01_df = data_df
test_line01_df = predict_df


lines_to_include = [0, 1]
train01_df = train_line01_df[train_line01_df['LINE'].isin(lines_to_include)]
test01_df = test_line01_df[test_line01_df['LINE'].isin(lines_to_include)]


train01_Q = train01_df['Y_Quality']
train01_C = train01_df['Y_Class']


train01_df.drop(['LINE', 'PRODUCT_CODE', 'PRODUCT_ID', 'Y_Quality', 'Y_Class'], inplace=True, axis=1)
test01_df.drop(['LINE', 'PRODUCT_CODE', 'PRODUCT_ID'], inplace=True, axis=1)



all_nan_columns = train01_df.columns[train01_df.isna().all()].tolist()
print(f"모든 값이 NaN인 컬럼 개수: {len(all_nan_columns)}")

predict_all_nan_columns = test01_df.columns[test01_df.isna().all()].tolist()
print(f"모든 값이 NaN인 컬럼 개수: {len(predict_all_nan_columns)}")

train01_df.drop(columns=all_nan_columns, inplace=True)
test01_df.drop(columns=all_nan_columns, inplace=True)


train01_df.fillna(0, inplace=True)
test01_df.fillna(0, inplace=True)

train01_df

모든 값이 NaN인 컬럼 개수: 644
모든 값이 NaN인 컬럼 개수: 1136


Unnamed: 0,X_128,X_129,X_130,X_131,X_132,X_133,X_134,X_136,X_137,X_138,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,7813.0,7813.0,0.0,0.0,0.19,0.20,0.19,228.0,228.0,225.0,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
1,0.0,0.0,19854.0,19854.0,0.20,0.21,0.20,413.0,414.0,414.0,...,193.296552,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55
2,7815.0,7815.0,0.0,0.0,0.19,0.20,0.19,228.0,228.0,225.0,...,179.820690,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
3,0.0,0.0,19856.0,19856.0,0.20,0.21,0.20,414.0,414.0,414.0,...,181.920690,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78
4,7817.0,7817.0,0.0,0.0,0.19,0.20,0.18,228.0,228.0,225.0,...,196.393333,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,11864.0,11864.0,0.0,0.0,0.19,0.20,0.19,230.0,230.0,230.0,...,183.833333,176.0,384.0,367.333333,352.0,0.00,0.00,0.00,0.00,0.00
582,11898.0,11898.0,0.0,0.0,0.18,0.20,0.18,230.0,230.0,230.0,...,198.366667,174.0,384.0,367.037037,352.0,50.88,53.23,52.44,56.28,66.83
583,11920.0,11920.0,0.0,0.0,0.19,0.20,0.19,230.0,230.0,230.0,...,191.993103,181.6,394.0,371.943396,353.0,51.71,59.64,54.61,57.05,63.18
594,14810.0,14810.0,0.0,0.0,0.19,0.20,0.19,304.0,304.0,304.0,...,180.810345,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49


In [120]:
test01_df

Unnamed: 0,X_128,X_129,X_130,X_131,X_132,X_133,X_134,X_136,X_137,X_138,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
7,18031.0,18031.0,0.0,0.0,0.19,0.2,0.19,354.0,354.0,354.0,...,190.8,168.3,384.0,369.462963,354.0,57.74,52.51,54.45,57.99,63.16
8,18064.0,18064.0,0.0,0.0,0.19,0.2,0.19,355.0,354.0,355.0,...,182.273333,169.8,384.0,370.259259,353.0,53.18,56.02,41.79,47.94,60.72
13,0.0,0.0,3370.0,3370.0,0.21,0.22,0.21,230.0,230.0,231.0,...,151.166667,138.3,384.0,367.462963,353.0,51.89,55.25,50.14,57.08,65.39
14,295.0,295.0,0.0,0.0,0.19,0.21,0.19,60.0,60.0,60.0,...,151.541379,139.5,384.0,369.377359,353.0,49.77,54.93,49.85,48.62,63.18
35,8632.0,8632.0,0.0,0.0,0.2,0.21,0.2,249.0,249.0,249.0,...,130.403448,103.5,384.0,369.660377,352.0,55.22,58.14,48.93,56.16,64.25
36,0.0,0.0,4394.0,4394.0,0.21,0.22,0.21,99.0,99.0,99.0,...,142.72069,129.7,384.0,368.207547,354.0,0.0,0.0,0.0,0.0,0.0
41,10712.0,10712.0,0.0,0.0,0.2,0.21,0.2,293.0,293.0,293.0,...,176.803333,165.6,383.0,368.018519,352.0,49.67,45.0,49.81,55.19,67.17
42,10729.0,10729.0,0.0,0.0,0.2,0.21,0.19,293.0,293.0,293.0,...,183.486207,171.8,384.0,368.45283,353.0,51.3,47.66,60.0,53.75,62.58
52,12261.0,12261.0,0.0,0.0,0.19,0.21,0.19,330.0,330.0,330.0,...,173.882759,161.0,384.0,368.320755,353.0,54.06,51.18,52.17,57.75,63.94
53,12283.0,12283.0,0.0,0.0,0.2,0.21,0.2,330.0,330.0,330.0,...,187.072414,175.1,384.0,361.226415,342.0,54.04,51.25,54.98,57.22,63.33


### 0, 1 라인 분리(완)

In [213]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_scaled = sc.fit_transform(train01_df)
train01_df = pd.DataFrame(X_scaled, columns=train01_df.columns)
print('피처 데이터 shape:{0}'.format(train01_df.shape))

predict_df_scaled = sc.transform(test01_df)
test01_df = pd.DataFrame(predict_df_scaled, columns=test01_df.columns)
print('피처 데이터 shape:{0}'.format(test01_df.shape))

피처 데이터 shape:(120, 1772)
피처 데이터 shape:(39, 1772)


In [122]:
train01_df

Unnamed: 0,X_128,X_129,X_130,X_131,X_132,X_133,X_134,X_136,X_137,X_138,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,-0.180370,-0.186560,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.604357,-0.605226,-0.614824,...,2.023531,2.035623,-0.421350,0.124820,0.481750,-0.150142,-0.177590,-0.447943,-0.429001,0.882891
1,-0.989393,-0.995545,2.139491,2.139491,0.803495,0.667670,0.912871,0.076557,0.079866,0.080717,...,1.644027,1.739131,-0.421350,-0.068170,0.481750,-0.172668,-0.086990,0.131727,-0.369150,0.675559
2,-0.180163,-0.186353,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.604357,-0.605226,-0.614824,...,0.362325,-0.013662,-0.421350,-0.211108,0.481750,-0.157651,-0.376630,0.057737,-0.312173,0.905928
3,-0.989393,-0.995545,2.139758,2.139758,0.803495,0.667670,0.912871,0.080237,0.079866,0.080717,...,0.562058,0.012499,-0.120386,0.432112,0.481750,-0.230234,-0.258333,0.552703,-0.597061,0.644976
4,-0.179956,-0.186145,-0.509145,-0.509145,-1.164247,-1.497746,-2.738613,-0.604357,-0.605226,-0.614824,...,1.938565,1.477520,-0.421350,-0.200399,0.203817,-0.182179,-0.130647,0.285319,-0.476881,0.851116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.239104,0.232895,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.596995,-0.597860,-0.596423,...,0.743972,0.901976,-0.120386,-0.206776,0.203817,-2.119391,-2.097103,-2.109394,-2.061245,-2.206035
116,0.242625,0.236416,-0.509145,-0.509145,-3.131989,-1.497746,-2.738613,-0.596995,-0.597860,-0.596423,...,2.126250,0.727569,-0.120386,-0.308806,0.203817,0.427517,0.401690,0.566480,0.633466,0.448368
117,0.244903,0.238694,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.596995,-0.597860,-0.596423,...,1.520055,1.390316,2.889260,1.380698,0.481750,0.469065,0.702597,0.677209,0.670334,0.303395
118,0.544158,0.537935,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.324630,-0.325296,-0.324095,...,0.456452,0.265390,-0.120386,0.646518,0.481750,0.356937,0.394179,0.487388,0.576967,0.434864


### 스케일링 (완)

> - 데이터의 분포: 일부 모델은 입력 데이터의 분포에 민감합니다. 스케일링이 데이터의 분포를 변경하여 모델의 성능에 영향
> - 특성 간 상관관계: 스케일링은 특성 간의 상대적 중요성을 변경 / 범주형 데이터를 수치형 데이터와 함께 스케일링할 때 이런 문제가 발생

In [214]:
data_df.drop(['PRODUCT_ID'], inplace=True, axis=1)
predict_df.drop(['PRODUCT_ID'], inplace=True, axis=1)

data_df.fillna(0, inplace=True)
predict_df.fillna(0, inplace=True)

In [215]:
from sklearn.preprocessing import LabelEncoder

features = ['PRODUCT_CODE']
le = LabelEncoder()
for i in features:
    encoder = LabelEncoder()
    encoder.fit(data_df[i])
    data_df[i] = encoder.transform(data_df[i])
    predict_df[i] = encoder.transform(predict_df[i])

data_df.head()

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,1,0.533433,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,197.286667,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77
1,2,0.541819,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.296552,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55
2,1,0.531267,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,179.82069,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35
3,2,0.537325,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,181.92069,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78
4,1,0.53159,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,196.393333,182.6,383.0,367.351852,352.0,38.7,41.89,46.93,33.09,76.97


In [237]:
predict_df.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,5,2,0.127626,0.81889,0.766018,0.927147,0.834377,1.265809,0.731582,0.726694,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
1,4,2,0.127626,0.797612,0.960697,0.68011,0.834377,0.309542,0.882369,0.874359,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
2,4,2,0.127626,0.840168,0.960697,0.68011,0.834377,0.309542,0.882369,0.875566,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
3,2,0,-0.302348,-1.18126,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.381044,1.432616,1.382323,1.364452,1.356562,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
4,3,0,-0.302348,-1.18126,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.399997,1.381896,1.372811,1.359587,1.356562,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886


# 스케일링, 샘플링

In [216]:
train_Y = data_df['Y_Class']

X_train = data_df.drop(columns=["Y_Class"])
X_train_copy = X_train.copy()
X_predict_df_copy = predict_df.copy()


numeric_features = X_train.iloc[:, 3:].columns


sc = StandardScaler()

X_train[numeric_features] = sc.fit_transform(X_train[numeric_features])
X_train = pd.DataFrame(X_train, columns=X_train_copy.columns)
print('피처 데이터 shape:{0}'.format(X_train.shape))

predict_df[numeric_features] = sc.transform(predict_df[numeric_features])
predict_df = pd.DataFrame(predict_df, columns=X_predict_df_copy.columns)
print('피처 데이터 shape:{0}'.format(predict_df.shape))

피처 데이터 shape:(598, 2419)
피처 데이터 shape:(310, 2418)


In [217]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

smoteto = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
X_train, train_Y = smoteto.fit_resample(X_train, train_Y)

print('피처 데이터 shape:{0}'.format(X_train.shape))
print('피처 데이터 shape:{0}'.format(train_Y.shape))
print('피처 데이터 shape:{0}'.format(predict_df.shape))

피처 데이터 shape:(1220, 2419)
피처 데이터 shape:(1220,)
피처 데이터 shape:(310, 2418)


In [218]:
train_Q = X_train['Y_Quality']
X_train = X_train.drop(columns=["Y_Quality"])
X_train

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,0,0,-0.302348,-1.181260,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.448822,1.496943,0.982802,0.988022,0.993310,1.608668,1.574442,1.294833,1.297452,2.671373
1,1,0,-0.302348,-1.181260,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.402487,1.454883,0.982802,0.985243,0.993310,1.585198,1.669622,1.901036,1.361174,2.462073
2,0,0,-0.302348,-1.181260,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.245997,1.206233,0.982802,0.983184,0.993310,1.600845,1.365342,1.823660,1.421838,2.694629
3,1,0,-0.302348,-1.181260,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.270384,1.209944,0.987558,0.992448,0.993310,1.525219,1.489618,2.341280,1.118519,2.431199
4,0,0,-0.302348,-1.181260,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.438448,1.417771,0.982802,0.983338,0.988121,1.575288,1.623758,2.061658,1.246474,2.639297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1215,0,0,-0.302348,-1.181260,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.296435,1.252038,0.980901,0.977918,0.972577,2.327593,2.276776,2.312505,2.203446,2.077917
1216,3,0,-0.302348,-1.181260,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,0.984075,0.978308,1.344388,1.329043,1.334486,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
1217,4,2,0.127626,0.897054,0.960697,0.68011,0.834377,0.309542,0.879268,0.872557,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
1218,0,0,-0.302348,-1.181260,-1.180768,-1.17267,-1.183243,-1.102091,-1.182188,-1.182091,...,1.345840,1.393759,0.987558,0.977347,0.962103,0.821466,0.871462,0.578595,0.898022,0.755945


In [104]:
# train_Q = data_df['Y_Quality']
# train_Y = data_df['Y_Class']

# X_train = data_df.drop(columns=['Y_Quality', "Y_Class"])

In [168]:
X_train

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,0,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.448822,1.496943,0.982802,0.988022,0.993310,1.608668,1.574442,1.294833,1.297452,2.671373
1,1,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.402487,1.454883,0.982802,0.985243,0.993310,1.585198,1.669622,1.901036,1.361174,2.462073
2,0,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.245997,1.206233,0.982802,0.983184,0.993310,1.600845,1.365342,1.823660,1.421838,2.694629
3,1,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.270384,1.209944,0.987558,0.992448,0.993310,1.525219,1.489618,2.341280,1.118519,2.431199
4,0,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.438448,1.417771,0.982802,0.983338,0.988121,1.575288,1.623758,2.061658,1.246474,2.639297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,5,2,0.127626,0.840168,0.766018,0.885975,0.834377,1.265809,0.728729,0.724682,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
594,0,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.257490,1.245819,0.987558,0.995537,0.993310,2.137001,2.175113,2.272976,2.368498,2.219091
595,0,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.207276,1.096134,0.982802,0.981687,0.988121,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
596,4,1,8.297142,0.818890,0.960697,0.680110,0.834377,0.309542,0.879108,0.873554,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886


In [169]:
predict_df

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_5,X_7,X_8,X_9,X_11,X_12,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,5,2,0.127626,0.818890,0.766018,0.927147,0.834377,1.265809,0.731582,0.726694,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
1,4,2,0.127626,0.797612,0.960697,0.680110,0.834377,0.309542,0.882369,0.874359,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
2,4,2,0.127626,0.840168,0.960697,0.680110,0.834377,0.309542,0.882369,0.875566,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
3,2,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.381044,1.432616,1.382323,1.364452,1.356562,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
4,3,0,-0.302348,-1.181260,-1.180768,-1.172670,-1.183243,-1.102091,-1.182188,-1.182091,...,1.399997,1.381896,1.372811,1.359587,1.356562,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,5,2,0.127626,0.755056,0.766018,0.927147,0.834377,1.265809,0.865660,0.877578,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
306,4,2,0.127626,0.861447,0.960697,0.680110,0.834377,0.309542,0.911303,0.905743,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
307,5,2,0.127626,0.755056,0.766018,0.885975,0.834377,1.265809,0.866882,0.876371,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886
308,5,2,0.127626,0.840168,0.766018,0.927147,0.834377,1.265809,0.868513,0.880394,...,-0.842183,-0.841109,-0.838827,-0.838654,-0.838516,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886


## Y_Quality를 Y_Class를 기준으로 min, max값 찾기

In [219]:
minmax = data_df[['Y_Class', 'Y_Quality']].groupby('Y_Class').agg(['min', 'max'])
minmax

Unnamed: 0_level_0,Y_Quality,Y_Quality
Unnamed: 0_level_1,min,max
Y_Class,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.500856,0.525067
1,0.525086,0.534843
2,0.534951,0.578841


In [220]:
from catboost import CatBoostRegressor

cat_reg =  CatBoostRegressor(iterations=500, learning_rate=0.05, verbose=0)
cat_reg.fit(X_train, train_Q, early_stopping_rounds=50) # cat_features=['PRODUCT_CODE', 'LINE']

predict_quality = cat_reg.predict(predict_df)

In [221]:
predict_quality

array([0.53010346, 0.53519777, 0.5338396 , 0.52268483, 0.53606671,
       0.53527594, 0.53315094, 0.52296605, 0.52388529, 0.53687507,
       0.53081343, 0.53491797, 0.53743651, 0.52202118, 0.52282126,
       0.52528832, 0.53270639, 0.52905701, 0.53133099, 0.53311419,
       0.52903084, 0.53497393, 0.52876512, 0.52940322, 0.52807526,
       0.52782438, 0.53220599, 0.53009906, 0.52894512, 0.53438195,
       0.52829064, 0.53394316, 0.52723426, 0.5324888 , 0.53393955,
       0.52336583, 0.52600439, 0.5200987 , 0.54772427, 0.52123774,
       0.53433887, 0.52389857, 0.52340525, 0.52949911, 0.53005804,
       0.53194533, 0.53110573, 0.51515561, 0.52983229, 0.51854271,
       0.53215251, 0.51871784, 0.52332457, 0.52367166, 0.52400192,
       0.52508579, 0.5304082 , 0.53003201, 0.52532624, 0.52714598,
       0.52517709, 0.5248795 , 0.52302775, 0.52435656, 0.52252019,
       0.52069306, 0.52212956, 0.53258032, 0.53071999, 0.52935887,
       0.52955166, 0.51456231, 0.53148059, 0.53192001, 0.53347

In [223]:
minmax[('Y_Quality', 'max')][0]

0.525066667

In [224]:
minmax[('Y_Quality', 'min')][2]

0.534950794

In [225]:
pred_Q = []

for i in predict_quality:
    if i < minmax[('Y_Quality', 'max')][0]:
        pred_Q.append(0)
    elif i < minmax[('Y_Quality', 'min')][2]:
        pred_Q.append(1)
    else:
        pred_Q.append(2)

pred_Q[:20]

[1, 2, 1, 0, 2, 2, 1, 0, 0, 2, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1]

In [226]:
len(pred_Q)

310

In [178]:
len(test01_df)

39

In [179]:
len(train01_df)

120

In [180]:
len(train01_Q)

120

In [181]:
train01_df

Unnamed: 0,X_128,X_129,X_130,X_131,X_132,X_133,X_134,X_136,X_137,X_138,...,X_2861,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870
0,-0.180370,-0.186560,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.604357,-0.605226,-0.614824,...,2.023531,2.035623,-0.421350,0.124820,0.481750,-0.150142,-0.177590,-0.447943,-0.429001,0.882891
1,-0.989393,-0.995545,2.139491,2.139491,0.803495,0.667670,0.912871,0.076557,0.079866,0.080717,...,1.644027,1.739131,-0.421350,-0.068170,0.481750,-0.172668,-0.086990,0.131727,-0.369150,0.675559
2,-0.180163,-0.186353,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.604357,-0.605226,-0.614824,...,0.362325,-0.013662,-0.421350,-0.211108,0.481750,-0.157651,-0.376630,0.057737,-0.312173,0.905928
3,-0.989393,-0.995545,2.139758,2.139758,0.803495,0.667670,0.912871,0.080237,0.079866,0.080717,...,0.562058,0.012499,-0.120386,0.432112,0.481750,-0.230234,-0.258333,0.552703,-0.597061,0.644976
4,-0.179956,-0.186145,-0.509145,-0.509145,-1.164247,-1.497746,-2.738613,-0.604357,-0.605226,-0.614824,...,1.938565,1.477520,-0.421350,-0.200399,0.203817,-0.182179,-0.130647,0.285319,-0.476881,0.851116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.239104,0.232895,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.596995,-0.597860,-0.596423,...,0.743972,0.901976,-0.120386,-0.206776,0.203817,-2.119391,-2.097103,-2.109394,-2.061245,-2.206035
116,0.242625,0.236416,-0.509145,-0.509145,-3.131989,-1.497746,-2.738613,-0.596995,-0.597860,-0.596423,...,2.126250,0.727569,-0.120386,-0.308806,0.203817,0.427517,0.401690,0.566480,0.633466,0.448368
117,0.244903,0.238694,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.596995,-0.597860,-0.596423,...,1.520055,1.390316,2.889260,1.380698,0.481750,0.469065,0.702597,0.677209,0.670334,0.303395
118,0.544158,0.537935,-0.509145,-0.509145,-1.164247,-1.497746,-0.912871,-0.324630,-0.325296,-0.324095,...,0.456452,0.265390,-0.120386,0.646518,0.481750,0.356937,0.394179,0.487388,0.576967,0.434864


In [227]:
# from catboost import CatBoostClassifier

# clf = CatBoostClassifier(iterations=500, verbose=0, learning_rate=0.05)
# clf.fit(train01_df, train01_Q)
# test01_quality = clf.predict(test01_df)

cat_reg =  CatBoostRegressor(iterations=500, learning_rate=0.05, verbose=0)
cat_reg.fit(train01_df, train01_Q, early_stopping_rounds=50)
test01_quality=cat_reg.predict(test01_df)

In [228]:
test01_quality

array([0.52372819, 0.52275281, 0.52226557, 0.52215184, 0.52251029,
       0.52593954, 0.52494206, 0.52557227, 0.52396692, 0.52332989,
       0.52532826, 0.52572762, 0.52406304, 0.52547987, 0.52311792,
       0.52153113, 0.5230056 , 0.52516926, 0.52497616, 0.52459872,
       0.52597887, 0.52518974, 0.52506975, 0.52360237, 0.52455833,
       0.52543715, 0.5241528 , 0.52361236, 0.53427317, 0.52685836,
       0.52716235, 0.5265126 , 0.52681485, 0.52802414, 0.52399702,
       0.52522598, 0.52666225, 0.52595202, 0.52656724])

In [230]:
pred01_Q = []

for i in test01_quality:
    if i <= minmax[('Y_Quality', 'max')][0]:
        pred01_Q.append(0)
    elif i <= minmax[('Y_Quality', 'min')][2]:
        pred01_Q.append(1)
    else:
        pred01_Q.append(2)

pred01_Q[:20]

[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0]

In [231]:
final = predict_df[(predict_df['LINE']==0)|(predict_df['LINE']==1)].index
total = pred_Q

In [232]:
final

Index([  7,   8,  13,  14,  35,  36,  41,  42,  52,  53,  54,  61,  62,  63,
        64,  65,  66, 130, 131, 132, 248, 249, 250, 251, 252, 253, 254, 255,
       260, 263, 280, 281, 282, 283, 284, 285, 286, 292, 293],
      dtype='int64')

In [233]:
total

[1,
 2,
 1,
 0,
 2,
 2,
 1,
 0,
 0,
 2,
 1,
 1,
 2,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 2,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 0,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [234]:
for i in range(len(final)):
  total[final[i]]=pred01_Q[i]

In [235]:
len(total)

310

In [236]:
submit = submission_df
submit['Y_Class'] = total
submit
submit.to_csv('./sample_submission_sc_sampling.csv', index=False)

In [63]:
submit

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,1
2,TEST_002,1
3,TEST_003,0
4,TEST_004,1
...,...,...
305,TEST_305,1
306,TEST_306,1
307,TEST_307,1
308,TEST_308,1


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

X_train01, X_test01, y_train01, y_test01 = train_test_split(train01_df, train01_Q, random_state=0, test_size=0.2)

params = {
    'max_depth': [2, 3, 4, 8, 10],
    'min_samples_leaf' : [6, 12, 24],
    'min_samples_split' : [8, 16, 24]
}


rf_clf = RandomForestRegressor(n_estimators=500, n_jobs=-1, oob_score=True, criterion='squared_error')
grid_cv = GridSearchCV(rf_clf, param_grid=params , cv=5, n_jobs=-1)
grid_cv.fit(X_train01, y_train01)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

train_preds = grid_cv.predict(X_train01)
# train_pred_proba = grid_cv.predict_proba(X_train01)[:, 1]
ws_preds = grid_cv.predict(X_test01)
# ws_pred_proba = grid_cv.predict_proba(X_test01)[:, 1]

evaluate_regr(y_train01, train_preds)
evaluate_regr(y_test01, ws_preds)

In [None]:
test01_quality=grid_cv.predict(test01_df)
test01_quality

In [None]:
len(pred01_Q)

In [None]:
final = predict_df[(predict_df['LINE']==0)|(predict_df['LINE']==1)].index

total = pred_Q

for i in range(len(final)):
  total[final[i]]=pred01_Q[i]

In [None]:
submit = submission_df
submit['Y_Class'] = total
submit
submit.to_csv('./sample_submission.csv', index=False)