# file loading

In [1]:
import requests
import zipfile
import io

# 구글 드라이브 공유 링크에서 파일 ID를 가져옵니다.
file_id = '1ZUueEhjvhzmo8UpZLwqMgzMKA6FGtLJk'  # 여기에 파일 ID를 입력하세요.
download_url = f'https://drive.google.com/uc?export=download&id={file_id}'

# 파일 다운로드
response = requests.get(download_url)
response.raise_for_status()  # 요청이 실패하면 예외를 발생시킵니다.

# 다운로드한 ZIP 파일을 메모리에서 직접 압축 해제합니다.
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    zip_ref.extractall('/content/unzipped_folder')

# 결과 확인
import os
os.listdir('/content/unzipped_folder')


['test.csv', 'train.csv', 'sample_submission.csv']

# data preprocessing

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [4]:
train = pd.read_csv("/content/unzipped_folder/train.csv")

In [5]:
# 범주형 변수를 찾기
cat_features_indices = train.select_dtypes(include=['object']).columns

# 범주형 변수의 인덱스 찾기
cat_feature_indices = [train.columns.get_loc(col) for col in cat_features_indices]

print("범주형 변수 인덱스:", cat_feature_indices)

범주형 변수 인덱스: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 21

In [6]:
# 범주형 변수를 제외한 데이터프레임
non_cat_features = train.drop(columns=cat_features_indices)

# 범주형이 아닌 변수의 이름과 인덱스 출력
non_cat_features_indices = non_cat_features.columns
non_cat_feature_indices = [train.columns.get_loc(col) for col in non_cat_features_indices]

print("범주형이 아닌 변수 인덱스:", non_cat_feature_indices)
print("범주형이 아닌 변수 이름:", list(non_cat_features_indices))

범주형이 아닌 변수 인덱스: []
범주형이 아닌 변수 이름: []


In [7]:
X = train.drop(columns=['SUBCLASS', 'ID'])
y = train['SUBCLASS']

In [8]:
from sklearn.feature_selection import SelectKBest, f_classif

# 예시 데이터프레임
X_encoded = pd.get_dummies(X)

# 상위 500개의 특성만 선택
selector = SelectKBest(score_func=f_classif, k=500)
X_new = selector.fit_transform(X_encoded, y)

# 선택된 특성의 인덱스 저장
selected_features_mask = selector.get_support()

  19281  19282  19283  19284  19285  19286  19287  19288  19289  19290
  19291  19292  19293  19294  23654  28497  28740  28741  28742  29456
  29457  29458  34659  40476  44685  46490  46645  48763  51069  52830
  54664  56557  59609  59713  61945  62052  63798  64599  65109  69398
  73081  73516  75858  77914  78793  78814  79314  79575  80101  80102
  80254  80565  80566  80567  80568  80569  80570  80571  80572  80573
  80574  80575  81013  86744  86828  92132  96038  96412 100736 100737
 103284 107826 109283 110212 110213 110214 110215 113346 113347 113448
 114156 115480 116152 116236 116823 120157 124334 124335 124336 124337
 126334 126335 126363 127784 131322 132458 134602 135184 139540 141337
 144526 144695 146424 146425 148258 150497 151863 152608 154796 156121
 156234 157268 158407 159326 160205 161319 165467 168080 168418 168419
 168420 168564 168721 168722 171460 174440 174935 174936 179322 180757
 181100 182755 186478 186504 186505 187893 187894 187895 190956 193867
 19392

In [9]:
selected_feature_names = X_encoded.columns[selected_features_mask]
print(selected_feature_names)

Index(['A2M_WT', 'ABCA2_WT', 'ABCA3_WT', 'ABCA4_WT', 'ABCA8_WT', 'ABCB1_WT',
       'ABCB11_WT', 'ABCC3_WT', 'ABCC8_WT', 'ACACA_WT',
       ...
       'VHL_WT', 'VWA5A_WT', 'VWF_WT', 'XDH_WT', 'XYLT2_WT', 'XYLT2_Y526fs',
       'ZBTB16_WT', 'ZEB1_WT', 'ZFPM2_WT', 'ZNF292_WT'],
      dtype='object', length=500)


In [10]:
# _ 앞까지만 추출하고 중복 제거
unique_prefixes = sorted(set(name.split('_')[0] for name in selected_feature_names), reverse=False)

# 결과를 리스트로 변환
result = list(unique_prefixes)

# 결과 출력
print(result)

['A2M', 'ABCA2', 'ABCA3', 'ABCA4', 'ABCA8', 'ABCB1', 'ABCB11', 'ABCC3', 'ABCC8', 'ACACA', 'ACE', 'ACOX2', 'ACSL5', 'ACSM1', 'ACTN1', 'ACTN2', 'ADAM2', 'ADAMDEC1', 'ADAMTS5', 'ADCY1', 'ADCY2', 'ADD2', 'AHNAK', 'AKAP13', 'AKR1D1', 'ALB', 'ALMS1', 'ALS2', 'AMOT', 'ANGPT1', 'ANK1', 'ANPEP', 'APC', 'APOE', 'ARHGAP29', 'ARHGEF11', 'ARID5B', 'ARPP21', 'ATOH8', 'ATP1A3', 'ATRX', 'AVL9', 'B2M', 'BCL11B', 'BDKRB2', 'BRAF', 'BRCA2', 'BRS3', 'BTG1', 'BTG2', 'C3', 'C8A', 'C8B', 'CA2', 'CACNA1A', 'CACNA1B', 'CACNA1F', 'CACNA1H', 'CAD', 'CALCRL', 'CAMK4', 'CD1D', 'CD2', 'CD86', 'CD8B', 'CDCP1', 'CDH1', 'CDH11', 'CDH13', 'CDH15', 'CDH16', 'CDH4', 'CDH6', 'CDH8', 'CDKN1A', 'CDKN2A', 'CEBPA', 'CELSR1', 'CELSR2', 'CENPF', 'CEP250', 'CES1', 'CFB', 'CFH', 'CFHR2', 'CFI', 'CFTR', 'CIITA', 'CIT', 'CKB', 'CLEC5A', 'CLGN', 'CLVS1', 'CMPK2', 'CNTN1', 'COL11A1', 'COL12A1', 'COL1A1', 'COL1A2', 'COL2A1', 'COL3A1', 'COL4A1', 'COL4A2', 'COL5A1', 'COL5A2', 'COL5A3', 'COL6A3', 'COL7A1', 'COL9A1', 'COPB1', 'CR1', 'CR2'

In [11]:
# 원본 데이터프레임에서 선택된 특성만 추출
X_new_cat = X[result]

In [12]:
# 범주형 변수를 찾기
cat_features_indices = X_new_cat.select_dtypes(include=['object']).columns

# 범주형 변수의 인덱스 찾기
cat_feature_indices = [X_new_cat.columns.get_loc(col) for col in cat_features_indices]

print("범주형 변수 인덱스:", cat_feature_indices)

범주형 변수 인덱스: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 21

In [13]:
# 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X_new_cat, y, test_size=0.2, random_state=42)

In [14]:
model = CatBoostClassifier(
    loss_function='MultiClass',
    custom_metric=['MultiClass', 'AUC', 'F1'],
    eval_metric = 'MultiClass',

    iterations=1000,
    learning_rate=0.1,
    random_seed=42,

    depth=6,
    grow_policy='Depthwise',
    auto_class_weights='SqrtBalanced',
    cat_features=cat_feature_indices,  # 범주형 특징 인덱스


    verbose=True,
    early_stopping_rounds=50,
    task_type="GPU"
)

# 모델 학습
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))


# 최적 모델 저장
model.save_model('best_model.cbm')

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 3.0602136	test: 3.0903093	best: 3.0903093 (0)	total: 340ms	remaining: 5m 39s
1:	total: 412ms	remaining: 3m 25s
2:	total: 492ms	remaining: 2m 43s
3:	total: 547ms	remaining: 2m 16s
4:	total: 605ms	remaining: 2m
5:	learn: 2.7383366	test: 2.7874422	best: 2.7874422 (5)	total: 875ms	remaining: 2m 24s
6:	total: 934ms	remaining: 2m 12s
7:	total: 993ms	remaining: 2m 3s
8:	total: 1.1s	remaining: 2m 1s
9:	total: 1.15s	remaining: 1m 54s
10:	learn: 2.6061915	test: 2.6737307	best: 2.6737307 (10)	total: 1.45s	remaining: 2m 10s
11:	total: 1.52s	remaining: 2m 5s
12:	total: 1.62s	remaining: 2m 3s
13:	total: 1.72s	remaining: 2m 1s
14:	total: 1.81s	remaining: 1m 58s
15:	learn: 2.4964885	test: 2.5798673	best: 2.5798673 (15)	total: 2.24s	remaining: 2m 17s
16:	total: 2.34s	remaining: 2m 15s
17:	total: 2.42s	remaining: 2m 12s
18:	total: 2.52s	remaining: 2m 9s
19:	total: 2.62s	remaining: 2m 8s
20:	learn: 2.4289081	test: 2.5179872	best: 2.5179872 (20)	total: 2.87s	remaining: 2m 13s
21:	total: 2.96s	re

In [24]:
from catboost import CatBoostClassifier


test = pd.read_csv("/content/unzipped_folder/test.csv")
X_test = test.drop(columns=['ID'])

X_test_cat = X_test[result]

# NaN 값을 대체하는 예시 (가장 흔한 값으로 대체)
X_test_cat.fillna('missing_value', inplace=True)  # 문자열로 대체

# 저장된 모델 불러오기
loaded_model = CatBoostClassifier()
loaded_model.load_model('best_model.cbm')

# 예측 수행
predictions = loaded_model.predict(X_test_cat)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_cat.fillna('missing_value', inplace=True)  # 문자열로 대체


In [25]:
submisson = pd.read_csv("/content/unzipped_folder/sample_submission.csv")

In [26]:
print("predictions의 길이:", len(predictions))
print("submisson의 길이:", len(submisson))

predictions의 길이: 2546
submisson의 길이: 2546


In [29]:
predictions = predictions.ravel()  # 또는 predictions.flatten()
submisson["SUBCLASS"] = predictions

In [30]:
submisson.to_csv('submission.csv', encoding='UTF-8-sig', index=False)