In [7]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd

# 데이터 불러오기
train_data = pd.read_csv('processed_train_data.csv')
test_data = pd.read_csv('test.csv')

# 열 이름과 상위 5개 데이터 확인
print(train_data.columns)
print(train_data.head())


Index(['class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-color', 'has-ring', 'ring-type',
       'habitat', 'season', 'is_spring_or_fall'],
      dtype='object')
   class  cap-diameter cap-shape cap-surface cap-color does-bruise-or-bleed  \
0      0          8.80         f           s   Unknown                    f   
1      1          4.51         x           h         o                    f   
2      0          6.94         f           s   Unknown                    f   
3      0          3.88         f           y         g                    f   
4      0          5.85         x     Unknown         w                    f   

  gill-attachment gill-spacing gill-color  stem-height  stem-width stem-color  \
0               a            c          w         4.51       15.39          w   
1               a            c          n         4.79        6

In [8]:
for col in train_data.select_dtypes(include='object').columns:
    train_data[col] = train_data[col].astype('category')

In [9]:
train_data.dtypes

class                      int64
cap-diameter             float64
cap-shape               category
cap-surface             category
cap-color               category
does-bruise-or-bleed    category
gill-attachment         category
gill-spacing            category
gill-color              category
stem-height              float64
stem-width               float64
stem-color              category
has-ring                category
ring-type               category
habitat                 category
season                     int64
is_spring_or_fall          int64
dtype: object

In [10]:
X_train = train_data.drop(columns=['class'])
Y_train = train_data['class']

X_test = test_data

### 카테고리형 처리

In [11]:
import xgboost as xgb

# X_train과 X_test의 열을 일치시킴
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# 모든 object 타입의 열을 category 타입으로 변환
X_train = X_train.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)
X_test = X_test.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)

# DMatrix로 변환
dtrain = xgb.DMatrix(X_train, label=Y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, enable_categorical=True)

# 모델 훈련
params = {
    'objective': 'binary:logistic',  # 이진 분류
    'tree_method': 'hist',  # enable_categorical=True와 호환 가능한 트리 방식
}

model = xgb.train(params, dtrain)

# 예측
Y_pred = model.predict(dtest)

# 훈련 데이터의 score를 계산하려면
train_pred = model.predict(dtrain)
train_score = sum(train_pred == Y_train) / len(Y_train)
print("train score =", train_score)


train score = 0.0


In [11]:
id = pd.read_csv("sample_submission.csv")
id = id[['id']]
print(Y_pred)
print(id)

[1. 1. 1. ... 1. 0. 1.]
              id
0        3116945
1        3116946
2        3116947
3        3116948
4        3116949
...          ...
2077959  5194904
2077960  5194905
2077961  5194906
2077962  5194907
2077963  5194908

[2077964 rows x 1 columns]


In [8]:
# Y_pred를 1차원으로 변환
Y_pred = Y_pred.flatten() 
# 예측 결과를 'e'와 'p'로 변환
Y_pred = pd.Series(Y_pred).map({0: 'e', 1: 'p'})


# DataFrame으로 변환
results = pd.DataFrame({
    "id": test_data['id'],  # test_data에서 id 열을 직접 사용
    "class": Y_pred
})



In [9]:
results.head()

# CSV 파일로 저장
results.to_csv("submission.csv", index=False)

In [10]:
results.head()

Unnamed: 0,id,class
0,3116945,p
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,p


### 이진 분류

In [5]:
import xgboost as xgb
import pandas as pd

# X_train과 X_test의 열을 일치시킴
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# 모든 object 타입의 열을 category 타입으로 변환
X_train = X_train.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)
X_test = X_test.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)

# DMatrix로 변환
dtrain = xgb.DMatrix(X_train, label=Y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, enable_categorical=True)

# 모델 훈련
params = {
    'objective': 'binary:logistic',  # 이진 분류
    'tree_method': 'hist',  # enable_categorical=True와 호환 가능한 트리 방식
}
model = xgb.train(params, dtrain)

# 예측
Y_pred = model.predict(dtest)
Y_pred = (Y_pred > 0.5).astype(int)  # 이진 예측 결과로 변환

# 훈련 데이터의 score 계산
train_pred = model.predict(dtrain)
train_pred = (train_pred > 0.5).astype(int)  # 이진 예측 결과로 변환
train_score = sum(train_pred == Y_train) / len(Y_train)
print("train score =", train_score)

# 예측 결과를 'e'와 'p'로 변환
Y_pred = pd.Series(Y_pred).map({0: 'e', 1: 'p'})

# 결과를 DataFrame으로 저장
results = pd.DataFrame({
    "id": test_data['id'],  # test_data에서 id 열을 직접 사용
    "class": Y_pred
})

# CSV 파일로 저장
results.to_csv("submission.csv", index=False)


train score = 0.9732777543821429
