# 문제 정의
- 주어진 데이터에서 약물의 종류를 예측하시오.
  - 제공된 데이터 목록: drug_train.csv, drug_test.csv
  - 예측할 컬럼: Drug(DrugY, drugX, drugA, drugC, drugB)
- 학습용 데이터(drug_train.csv)를 이용해 약물의 종류를 예측하는 모델을 만든 후 이를 평가용 데이터(drug_test.csv)에 적용해 얻은 예측값을 다음과 같은 형식의 CSV 파일로 생성하시오.

제출 파일은 다음 1개의 컬럼을 포함해야 한다.
 - pred: 예측값
 - 제출 파일명: 'result.csv'
제출한 모델의 성능은 f1-macro 평가지표에 따라 채점한다.

In [1]:
# Upload files
# Opens the Colab upload dialog; the value returned by files.upload() is kept
# in `uploads`.
from google.colab import files
uploads = files.upload()

Saving drug_train.csv to drug_train.csv
Saving drug_test.csv to drug_test.csv


In [26]:
# Load the data
# Reads the training and test CSV files into DataFrames.
import pandas as pd
train = pd.read_csv('drug_train.csv')
test = pd.read_csv('drug_test.csv')

In [27]:
# EDA
# Prints a quick overview of the data: sizes, dtypes, summary statistics,
# missing-value counts for both frames, and the target class distribution.
print('===== DATA SIZE =====')
print('train:', train.shape, 'test:', test.shape)

print('\n===== DATA TYPE =====')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line.
train.info()

print('\n===== object =====')
print(train.describe(include='O'))

print('\n===== int/float =====')
print(train.describe())

print('\n===== Missing Value(train) =====')
print(train.isnull().sum())

print('\n===== Missing Value(test) =====')
print(test.isnull().sum())

print('\n===== target(Drug) =====')
print(train['Drug'].value_counts())

===== DATA SIZE =====
train: (100, 6) test: (100, 5)

===== DATA TYPE =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          100 non-null    int64  
 1   Sex          100 non-null    object 
 2   BP           100 non-null    object 
 3   Cholesterol  100 non-null    object 
 4   Na_to_K      100 non-null    float64
 5   Drug         100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB
None

===== object =====
        Sex    BP Cholesterol   Drug
count   100   100         100    100
unique    2     3           2      5
top       M  HIGH      NORMAL  DrugY
freq     51    41          54     41

===== int/float =====
              Age     Na_to_K
count  100.000000  100.000000
mean    43.150000   15.634260
std     17.127788    7.096329
min     15.000000    6.683000
25%     28.750000   10.167500
50%     41

# 데이터 전처리
- 범주형(object) 컬럼: Sex, BP, Cholesterol → 인코딩 대상
- 스케일링할 컬럼으로 Na_to_K 선택

In [28]:
# Extract the target
# pop() removes the 'Drug' column from `train` in place and returns it, so
# `train` holds only feature columns afterwards (re-running this cell without
# reloading the data raises KeyError because the column is already gone).
target = train.pop('Drug')

In [42]:
# Encoding (one-hot)
# Train and test are stacked before encoding so that both end up with the same
# dummy columns; the encoded frame is then cut back apart by row position.
n_train = len(train)
data = pd.concat([train, test], axis=0)
data_oh = pd.get_dummies(data)
train, test = data_oh.iloc[:n_train], data_oh.iloc[n_train:]
train.shape, test.shape

((100, 9), (100, 9))

# 머신러닝 학습 및 평가

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
# NOTE(review): the split is not stratified, so rare Drug classes may be
# missing from the validation part -- consider stratify=target after checking
# that every class has at least two rows.
X_train, X_val, y_train, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((80, 9), (20, 9), (80,), (20,))

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Baseline: a random forest with default hyperparameters, fitted on the
# training part of the split and scored with macro-F1 on the validation part.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
pred = rf.predict(X_val)

f1 = f1_score(y_true=y_val, y_pred=pred, average='macro')
f1
# A macro-F1 of 1.0 here suggests possible overfitting --> re-check the model
# with cross-validation.

1.0

# 크로스 벨리데이션
검증 데이터(20건)에서 f1 값이 1.0이 나와 과적합이거나 우연히 쉬운 분할이었을 가능성이 있다.
데이터셋을 k개로 나누어 학습용과 검증용으로 번갈아 사용하며 k번의 학습과 평가를 진행한다.

In [32]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated macro-F1 over the whole training set: print the
# per-fold scores first, then their mean on the next line.
f1_scores = cross_val_score(
    estimator=rf, X=train, y=target, scoring='f1_macro', cv=3
)
print(f1_scores, f1_scores.mean(), sep='\n')

[1.         0.93777778 0.78461538]
0.9074643874643874


# 성능 개선
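스케일링은 성능 개선에 실패했다(Standard/MinMax/Robust 모두 CV 평균이 0.9075로 스케일링 전과 동일). 랜덤 포레스트 같은 트리 기반 모델은 값의 크기가 아니라 순서를 기준으로 분할하므로 스케일링의 영향을 거의 받지 않는다.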

In [21]:
# # 스케일링
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_copy = train.copy()
# test_copy = test.copy()

# cols = ['Age','Na_to_K']

# train_copy[cols] = scaler.fit_transform(train_copy[cols])
# test_copy[cols] = scaler.transform(test_copy[cols])

# # 성능확인 (CV)
# from sklearn.model_selection import cross_val_score
# f1_scores = cross_val_score(rf, train_copy, target, cv=3, scoring='f1_macro')
# print(f1_scores.mean())

0.9074643874643874


In [22]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# train_copy = train.copy()
# test_copy = test.copy()

# cols = ['Age','Na_to_K']

# train_copy[cols] = scaler.fit_transform(train_copy[cols])
# test_copy[cols] = scaler.transform(test_copy[cols])

# # 성능확인 (CV)
# from sklearn.model_selection import cross_val_score
# f1_scores = cross_val_score(rf, train_copy, target, cv=3, scoring='f1_macro')
# print(f1_scores.mean())

0.9074643874643874


In [23]:
# from sklearn.preprocessing import RobustScaler
# scaler = RobustScaler()
# train_copy = train.copy()
# test_copy = test.copy()

# cols = ['Age','Na_to_K']

# train_copy[cols] = scaler.fit_transform(train_copy[cols])
# test_copy[cols] = scaler.transform(test_copy[cols])

# # 성능확인 (CV)
# from sklearn.model_selection import cross_val_score
# f1_scores = cross_val_score(rf, train_copy, target, cv=3, scoring='f1_macro')
# print(f1_scores.mean())

0.9074643874643874


In [25]:
# # 레이블 인코딩 - 성능 저하
# import pandas as pd
# train = pd.read_csv('drug_train.csv')
# test = pd.read_csv('drug_test.csv')
# target = train.pop('Drug')
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()

# cols = ['Sex', 'BP', 'Cholesterol']
# train_copy = train.copy()
# test_copy = test.copy()
# for col in cols:
#   train_copy[col] = le.fit_transform(train_copy[col])
#   test_copy[col] = le.transform(test_copy[col])

# from sklearn.model_selection import cross_val_score
# f1_scores = cross_val_score(rf, train_copy, target, cv=3, scoring='f1_macro')
# print(f1_scores.mean())

0.8815384615384616


In [39]:
# # 하이퍼 파라미터조정 (depth, n_estimator 의미 X)
# rf = RandomForestClassifier(random_state=0, n_estimators=500)
# f1_scores = cross_val_score(rf, train, target, cv=3, scoring='f1_macro')
# print(f1_scores)
# print(f1_scores.mean())

[1.         0.93777778 0.78461538]
0.9074643874643874


In [40]:
# # LightGBM (성능저하)
# import lightgbm as lgb
# lgbmc = lgb.LGBMClassifier(random_state=0, verbose=-1)
# f1_scores = cross_val_score(lgbmc, train, target, cv=3, scoring='f1_macro')
# print(f1_scores)
# print(f1_scores.mean())

[1.         0.86908213 0.78461538]
0.8845658367397498


In [46]:
# Final submission
# Refit on the full training set before predicting: `rf` was last fitted on
# the 80-row X_train split only (cross_val_score fits clones and leaves `rf`
# itself untouched), so without this step the submitted model would ignore
# 20% of the labelled rows.
rf.fit(train, target)
pred = rf.predict(test)

# The submission file must contain a single 'pred' column and no index.
submit = pd.DataFrame({'pred': pred})
submit.to_csv('result.csv', index=False)

# Read the file back to check what was actually written.
pd.read_csv('result.csv')

Unnamed: 0,pred
0,DrugY
1,DrugY
2,DrugY
3,DrugY
4,drugB
...,...
95,drugX
96,drugC
97,DrugY
98,DrugY
