In [1]:
import pandas as pd

In [2]:
# Load the raw training and test tables from the project's raw_data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw_data/train.csv", "../data/raw_data/test.csv")
)

In [3]:
# Drop the ID column from both frames so it is not treated as a feature.
# errors='ignore' keeps this cell safe to re-run: train/test are overwritten
# here, so on a second run 'ID' is already gone and a plain drop would raise
# KeyError.
train = train.drop(columns=['ID'], errors='ignore')
test = test.drop(columns=['ID'], errors='ignore')

In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the SUBCLASS labels, then replace the column with the
# integer codes. `le` is kept so predictions can be decoded back to labels.
le = LabelEncoder().fit(train['SUBCLASS'])
train['SUBCLASS'] = le.transform(train['SUBCLASS'])

In [5]:
# Binarize every feature column (the columns present in test): the value 'WT'
# becomes 0 and any other value (including missing) becomes 1.
# The vectorized comparison gives the same result as mapping a Python lambda
# over every cell, but is much faster and does not depend on DataFrame.map,
# which only exists in recent pandas versions.
# NOTE: this overwrites the raw values in place, so re-running the cell on
# already-binarized data would turn every entry into 1 — reload the CSVs first.
feature_cols = test.columns
train[feature_cols] = train[feature_cols].ne('WT').astype('int64')
test[feature_cols] = test[feature_cols].ne('WT').astype('int64')

# Univariate Feature Selection

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 # 카이제곱값 이용

In [78]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [79]:
# Candidate k values (how many features to select).
k_values = range(10, 500, 10)  # steps of 10 from 10 up to 490 (the stop value 500 is exclusive); adjust as needed

# List that collects the mean cross-validation score for each candidate k.
mean_scores = []

In [80]:
# Feature matrix: the training columns that are also present in test.
X = train[test.columns]
# Target vector: the SUBCLASS column of train.
y = train['SUBCLASS']

In [81]:
# Evaluate each candidate k with 5-fold cross-validation and keep the best one.
from sklearn.pipeline import make_pipeline

# Start from an empty list so that re-running this cell cannot accumulate
# stale scores, which would misalign np.argmax with k_values.
mean_scores = []

for k in k_values:
    # Feature selection is part of the pipeline, so chi2 is re-fit on the
    # training portion of every fold. Selecting on the full data before
    # cross-validating would let the validation folds influence which
    # features are kept and inflate the scores.
    # A fixed random_state makes the comparison between k values repeatable.
    clf = make_pipeline(
        SelectKBest(chi2, k=k),
        RandomForestClassifier(random_state=42),
    )

    # Mean accuracy over 5 folds for this k.
    scores = cross_val_score(clf, X, y, cv=5)
    mean_scores.append(np.mean(scores))

# Pick the k whose mean cross-validation score is highest.
optimal_k = k_values[np.argmax(mean_scores)]
print(f"최적의 k 값: {optimal_k}")

최적의 k 값: 310


In [82]:
# Selector that keeps the optimal_k features with the highest chi-squared scores.
selector = SelectKBest(chi2, k=optimal_k)

In [83]:
# Fit the selector on the full training data; X_new is the reduced feature matrix.
X_new = selector.fit_transform(X, y)

In [84]:
# Names of the selected feature columns, converted to a plain Python list.
ufs_col = selector.get_feature_names_out().tolist()

In [85]:
# Add the target column so it is carried along with the selected features.
# The membership check keeps a re-run of this cell from appending 'SUBCLASS'
# a second time, which would produce duplicate columns downstream.
if 'SUBCLASS' not in ufs_col:
    ufs_col.append('SUBCLASS')

In [86]:
# Training frame restricted to the columns listed in ufs_col (selected features plus SUBCLASS).
train_ufs = train[ufs_col]
# Test frame restricted to the selected feature columns only.
test_ufs = test[selector.get_feature_names_out()]

In [87]:
# Preview the first rows of the reduced training frame.
train_ufs.head()

Unnamed: 0,A2M,ABCA4,ABCA8,ABCB1,ABCB11,ABCC3,ABCC8,ACACA,ACE,ACSM1,...,UTP20,VCAN,VHL,VWA5A,VWF,XDH,ZBTB16,ZEB1,ZFPM2,SUBCLASS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,19
2,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,20
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,9
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6


In [88]:
# Preview the first rows of the reduced test frame.
test_ufs.head()

Unnamed: 0,A2M,ABCA4,ABCA8,ABCB1,ABCB11,ABCC3,ABCC8,ACACA,ACE,ACSM1,...,UQCRFS1,UTP20,VCAN,VHL,VWA5A,VWF,XDH,ZBTB16,ZEB1,ZFPM2
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
from sklearn.model_selection import train_test_split

In [90]:
# Hold out 30% of the labelled rows for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    train_ufs.drop(columns='SUBCLASS'),
    train_ufs['SUBCLASS'],
    test_size=0.3,
    random_state=42,
)

In [91]:
# Train a random forest on the training split.
# random_state is fixed (matching the seed used for the split) so that the
# reported accuracy and the submitted predictions can be reproduced on re-run.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

In [92]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [93]:
# Predict on the held-out split.
y_pred = model.predict(x_test)

In [94]:
# Evaluation
# 1. Accuracy on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.30


In [49]:
# Predict on the competition test set.
# Pass the DataFrame itself rather than .values: the model was fitted on a
# DataFrame, so scikit-learn can check the columns against the feature names
# it saw during fit instead of warning about missing feature names.
test_predict = model.predict(test_ufs)



In [96]:
# Map the integer predictions back to the original SUBCLASS labels with the fitted encoder.
# NOTE(review): this overwrites test_predict, so re-running this cell alone would pass
# already-decoded labels to inverse_transform — presumably that fails; re-run the prediction cell first.
test_predict = le.inverse_transform(test_predict)

In [97]:
# Load the sample submission file to use as the output template.
submission = pd.read_csv("../data/raw_data/sample_submission.csv")

In [98]:
# Fill the SUBCLASS column with the decoded test predictions.
# NOTE(review): assumes the sample submission rows are in the same order as test.csv — confirm.
submission['SUBCLASS'] = test_predict

In [99]:
# Write the submission file without the DataFrame index.
submission.to_csv("../data/submission/submission_07.csv", index=False)