# Multi-Label Classification

- train 데이터와 val 데이터 중 겹치는 10가지 열을 사용하여 학습, test 데이터로 추론
- sklearn의 multioutputclassifier를 사용하여 S1-S4 컬럼에 대한 다중 분류 수행
- 학습 및 추론 코드가 간단하여 하나의 파일에서 모두 수행

- 입력 데이터 (경로: ../data/multi_output_classifier/)
    - train
        - train_user.csv: train 데이터 전체 (500x12)
        - train_label.csv: train 데이터 라벨 전체 (500)
    - val
        - val_01_user.csv: user01의 validation 데이터 (40x25)
        - val_02_user.csv: user02의 validation 데이터 (18x25)
        - val_03_user.csv: user03의 validation 데이터 (20x25)
        - val_04_user.csv: user04의 validation 데이터 (27x25)
    - test
        - test_05_user.csv: user05의 test 데이터 (20x19)
        - test_06_user.csv: user06의 test 데이터 (33x19)
        - test_07_user.csv: user07의 test 데이터 (35x19)
        - test_08_user.csv: user08의 test 데이터 (33x19)

- 사용한 모델 (저장 경로: ../model/multi_output_classifier/)
    - VotingClassifier
        - RandomForestClassifier
        - GradientBoostingClassifier
        - LogisticRegression
        - SupportVectorClassifier
        - DecisionTreeClassifier
        - KNeighborsClassifier

## Import Dataset

In [18]:
import pandas as pd

# 학습 데이터 로드
train_user = pd.read_csv('../data/muti_output_classifier/train_user.csv')
train_label = pd.read_csv('../data/muti_output_classifier/train_label.csv')

# 검증 데이터 로드
user01_df = pd.read_csv('../data/muti_output_classifier/val_01_user.csv')
user02_df = pd.read_csv('../data/muti_output_classifier/val_02_user.csv')
user03_df = pd.read_csv('../data/muti_output_classifier/val_03_user.csv')
user04_df = pd.read_csv('../data/muti_output_classifier/val_04_user.csv')

# 평가 데이터 로드
user05_df = pd.read_csv('../data/muti_output_classifier/test_05_user.csv') ; user05_df['subject_id'] = 5
user06_df = pd.read_csv('../data/muti_output_classifier/test_06_user.csv') ; user06_df['subject_id'] = 6
user07_df = pd.read_csv('../data/muti_output_classifier/test_07_user.csv') ; user07_df['subject_id'] = 7
user08_df = pd.read_csv('../data/muti_output_classifier/test_08_user.csv') ; user08_df['subject_id'] = 8

# 정답 샘플 데이터 로드
answer_sample = pd.read_csv('../data/answer_sample.csv')

In [24]:
# train-val 데이터 중 겹치는 열 추출
train_overlap = ['m_acc_x_mean', 'm_acc_x_var', 'm_acc_y_mean', 'm_acc_y_var', 'm_acc_z_mean', 'm_acc_z_var', 'activity', 's_hr_mean', 'm_gps_lat_mean', 'm_gps_lon_mean']
val_overlap = ['m_acc_x_mean', 'm_acc_x_var', 'm_acc_y_mean', 'm_acc_y_var', 'm_acc_z_mean', 'm_acc_z_var', 'm_activity_mode', 's_heart_rate', 'm_gps_latitude', 'm_gps_longitude']

In [25]:
# train 학습용 데이터
train_user = train_user[train_overlap]
train_user = train_user.fillna(0)

# train 데이터 컬럼명을 val 컬럼명으로 통일
train_user = train_user.rename(columns={'activity':'m_activity_mode', 's_hr_mean':'s_heart_rate', 
                                        'm_gps_lat_mean':'m_gps_latitude', 'm_gps_lon_mean':'m_gps_longitude'}) ;train_user

Unnamed: 0,m_acc_x_mean,m_acc_x_var,m_acc_y_mean,m_acc_y_var,m_acc_z_mean,m_acc_z_var,m_activity_mode,s_heart_rate,m_gps_latitude,m_gps_longitude
0,-0.707907,16.188875,-0.161292,39.236581,6.067595,44.414906,2.719828,0.000000,37.494725,127.020054
1,-0.173821,10.073242,-1.346915,54.106859,3.783996,57.746758,2.738126,87.921995,37.442792,127.007351
2,-1.493587,66.382578,-1.437502,82.489159,6.590162,24.476068,2.865330,86.935213,37.483732,127.003324
3,-0.724343,33.257282,1.744564,42.898067,6.149547,21.753621,2.846785,83.276502,37.483296,127.003187
4,-1.274421,55.080675,-1.256600,52.688470,6.969066,16.496700,2.871658,83.205781,37.484793,126.991129
...,...,...,...,...,...,...,...,...,...,...
495,-0.033176,76.851372,0.010461,44.258423,8.831255,34.493616,2.980861,82.390253,37.486774,126.882378
496,0.176962,38.759584,0.189291,53.310184,8.599122,44.695242,2.977408,80.331553,37.486807,126.882836
497,-0.290505,21.535924,0.577183,48.438511,8.179368,55.187854,2.990728,79.677352,37.486750,126.884176
498,0.078069,87.741890,0.213279,57.306259,7.929909,40.437786,2.979042,80.901443,37.486919,126.883586


In [26]:
# train 학습용 라벨
train_label = train_label[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']].fillna(0)
train_label = train_label.astype(int) ;train_label

Unnamed: 0,Q1,Q2,Q3,S1,S2,S3,S4
0,1,0,0,1,1,0,0
1,0,0,0,0,1,1,1
2,1,0,1,1,1,0,1
3,1,0,1,1,1,1,1
4,0,1,1,0,1,1,1
...,...,...,...,...,...,...,...
495,0,0,1,0,1,0,0
496,0,0,1,0,1,1,0
497,1,0,1,0,1,1,1
498,0,0,1,0,1,1,0


In [27]:
# val 학습용 데이터
val = pd.concat([user01_df, user02_df, user03_df, user04_df], axis=0).fillna(0)
val_user = val[val_overlap] ;val_user

Unnamed: 0,m_acc_x_mean,m_acc_x_var,m_acc_y_mean,m_acc_y_var,m_acc_z_mean,m_acc_z_var,m_activity_mode,s_heart_rate,m_gps_latitude,m_gps_longitude
0,0.392374,1.756724,0.833553,2.437862,6.346475,2.478189,3.226389,41.247214,0.013385,0.930735
1,0.554709,3.547447,-1.060436,4.201570,6.708900,3.236058,3.505556,48.585655,0.013316,0.930746
2,1.267402,2.193416,-0.955981,2.632809,6.024911,2.400287,3.886806,44.164511,0.013419,0.930552
3,1.950653,2.304939,-0.800915,2.727436,2.513997,2.884176,3.465972,41.837744,0.013663,0.931180
4,-0.218677,1.971424,-0.962672,2.682496,6.143505,2.488044,3.306944,42.578246,0.013209,0.930936
...,...,...,...,...,...,...,...,...,...,...
22,0.603936,0.480638,0.655137,0.450947,4.229028,0.608775,2.906359,38.274441,0.498630,0.217011
23,-0.709910,0.323667,0.452504,0.343004,0.003704,0.539942,2.924306,40.737421,0.497290,0.222552
24,-0.380443,0.577497,0.584300,0.792561,0.323266,0.936399,2.792361,50.755633,0.501668,0.215483
25,0.858719,0.290357,0.072040,0.463033,2.036119,0.821129,2.905556,46.688909,0.501510,0.212306


In [28]:
# val 학습용 라벨
val_label = val[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']].fillna(0) ;val_label

Unnamed: 0,Q1,Q2,Q3,S1,S2,S3,S4
0,1,1,1,0,0,0,0
1,1,1,1,0,0,1,0
2,0,1,1,0,1,1,0
3,0,1,1,0,0,1,0
4,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...
22,0,1,0,0,1,1,1
23,1,1,0,1,1,1,1
24,1,1,0,0,1,1,1
25,0,1,0,0,0,1,1


## MultiOutputClassifier

In [29]:
# train과 val 데이터를 합친 최종 학습 데이터
train_val = pd.concat([train_user, val_user], axis=0) ;train_val

Unnamed: 0,m_acc_x_mean,m_acc_x_var,m_acc_y_mean,m_acc_y_var,m_acc_z_mean,m_acc_z_var,m_activity_mode,s_heart_rate,m_gps_latitude,m_gps_longitude
0,-0.707907,16.188875,-0.161292,39.236581,6.067595,44.414906,2.719828,0.000000,37.494725,127.020054
1,-0.173821,10.073242,-1.346915,54.106859,3.783996,57.746758,2.738126,87.921995,37.442792,127.007351
2,-1.493587,66.382578,-1.437502,82.489159,6.590162,24.476068,2.865330,86.935213,37.483732,127.003324
3,-0.724343,33.257282,1.744564,42.898067,6.149547,21.753621,2.846785,83.276502,37.483296,127.003187
4,-1.274421,55.080675,-1.256600,52.688470,6.969066,16.496700,2.871658,83.205781,37.484793,126.991129
...,...,...,...,...,...,...,...,...,...,...
22,0.603936,0.480638,0.655137,0.450947,4.229028,0.608775,2.906359,38.274441,0.498630,0.217011
23,-0.709910,0.323667,0.452504,0.343004,0.003704,0.539942,2.924306,40.737421,0.497290,0.222552
24,-0.380443,0.577497,0.584300,0.792561,0.323266,0.936399,2.792361,50.755633,0.501668,0.215483
25,0.858719,0.290357,0.072040,0.463033,2.036119,0.821129,2.905556,46.688909,0.501510,0.212306


In [30]:
# train과 val 라벨을 합친 최종 학습 라벨
train_val_label = pd.concat([train_label, val_label], axis=0) ;train_val_label

Unnamed: 0,Q1,Q2,Q3,S1,S2,S3,S4
0,1,0,0,1,1,0,0
1,0,0,0,0,1,1,1
2,1,0,1,1,1,0,1
3,1,0,1,1,1,1,1
4,0,1,1,0,1,1,1
...,...,...,...,...,...,...,...
22,0,1,0,0,1,1,1
23,1,1,0,1,1,1,1
24,1,1,0,0,1,1,1
25,0,1,0,0,0,1,1


In [32]:
# 사용자 별 test 데이터 병합
test_user = pd.concat([user05_df, user06_df, user07_df, user08_df], axis=0).fillna(0) 

# answer_sample에 있는 데이터만 추출
answer_sample['key'] = answer_sample['subject_id'].astype(str) + answer_sample['date']
test_user['key'] = test_user['subject_id'].astype(str) + test_user['date']
test_user = test_user[test_user['key'].isin(answer_sample['key'])]

# test 추론 데이터
test_user = test_user[val_overlap]
test_user = test_user.reset_index(drop=True) ;test_user

Unnamed: 0,m_acc_x_mean,m_acc_x_var,m_acc_y_mean,m_acc_y_var,m_acc_z_mean,m_acc_z_var,m_activity_mode,s_heart_rate,m_gps_latitude,m_gps_longitude
0,-0.025354,1.407288,1.366390,1.694444,1.138492,2.134006,3.211994,51.924342,0.039281,0.026648
1,0.890448,1.795161,1.930635,2.205943,6.020560,2.213233,2.975694,55.848824,0.038817,0.020516
2,0.062869,1.637819,1.808122,1.832477,7.727069,2.249789,2.952778,64.660832,0.038441,0.026529
3,-0.049565,1.054668,-0.142927,1.285119,7.456657,1.404697,3.635540,95.184109,0.038831,0.020476
4,-0.492484,1.221004,1.529056,1.504676,7.457833,1.715990,2.936806,90.054438,0.038773,0.020482
...,...,...,...,...,...,...,...,...,...,...
110,-0.572178,1.889257,1.687971,2.347146,5.019627,1.715268,2.973611,83.390303,1.594411,0.929467
111,-3.594138,2.223034,2.371284,5.859808,3.574392,2.993113,3.421089,52.962963,1.600312,0.929183
112,-0.166466,1.141286,0.154934,2.192296,8.115459,0.950882,2.872117,63.557196,1.598157,0.927421
113,-0.147444,1.275830,-0.023144,2.368825,1.189906,2.354820,3.421712,58.965463,1.587414,0.935909


In [33]:
print("학습 데이터 크기: ", train_val.shape)
print("학습 라벨 크기: ", train_val_label.shape)
print("추론 데이터 크기: ", test_user.shape)

학습 데이터 크기:  (605, 10)
학습 라벨 크기:  (605, 7)
추론 데이터 크기:  (115, 10)


In [34]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SEED 값 설정
SEED = 42

# 모델 생성
forest = RandomForestClassifier(random_state=SEED)
boosting = GradientBoostingClassifier(random_state=SEED)
log_clf = LogisticRegression(random_state=SEED)
svm_clf = make_pipeline(StandardScaler(), SVC(probability=True, random_state=SEED))
tree_clf = DecisionTreeClassifier(random_state=SEED)
knn_clf = make_pipeline(StandardScaler(), KNeighborsClassifier())

# VotingClassifier에 여러 모델 조합
voting = VotingClassifier(
    estimators=[
        ('forest', forest),
        ('boosting', boosting),
        ('logistic', log_clf),
        ('svm', svm_clf),
        ('tree', tree_clf),
        ('knn', knn_clf)
    ],
    voting='soft'
)

# MultiOutputClassifier로 다중 출력 분류기 설정
multi_target_classifier = MultiOutputClassifier(voting, n_jobs=-1)

# 모델 학습
multi_target_classifier.fit(train_val, train_val_label)

# 모델 추론
predictions = multi_target_classifier.predict(test_user)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [35]:
# 예측 결과 확인
predictions

array([[0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 1, 1],
       [0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 1, 0],
       [1, 1, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 0],
       [1, 1, 1, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 1, 0],
       [0, 1, 1, 0, 0, 1, 0],
       [1,

## Submission

In [36]:
# answer_sample 형태로 변환
predictions_df = pd.DataFrame(predictions, columns=['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4'])
predictions_df = pd.concat([answer_sample[['subject_id', 'date']], predictions_df], axis=1)
predictions_df

Unnamed: 0,subject_id,date,Q1,Q2,Q3,S1,S2,S3,S4
0,5,2023-11-05,0,1,0,0,0,0,1
1,5,2023-11-06,0,1,0,0,0,0,0
2,5,2023-11-07,0,0,0,0,0,0,0
3,5,2023-11-08,1,0,0,0,0,0,0
4,5,2023-11-09,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
110,8,2023-11-05,1,1,0,0,0,0,0
111,8,2023-11-06,1,1,1,0,0,1,0
112,8,2023-11-07,1,0,0,0,0,1,0
113,8,2023-11-08,1,1,0,0,0,1,1


In [38]:
# 제출 파일 생성
predictions_df.to_csv('../predictions/moc_result_s_column.csv', index=False)

----