### 데이터 불러오기

In [3]:
import pandas as pd

# Read the three competition CSV files, each indexed by its `id` column.
train, test, submission = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

#### 데이터 합치기

In [4]:
# Stack the train rows on top of the test rows, then discard the label
# column so the combined frame no longer carries `target`.
all_data = pd.concat([train, test]).drop(columns='target')
all_data  # last expression: display the combined frame

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0,0,0,F,N,Green,Square,Lion,Canada,Theremin,...,9e4b23160,acc31291f,1,Novice,Lava Hot,j,A,Gb,1,3
499996,1,0,0,F,Y,Green,Trapezoid,Lion,China,Piano,...,cfbd87ed0,eae3446d0,1,Contributor,Lava Hot,f,S,Ed,2,2
499997,0,1,1,T,Y,Green,Trapezoid,Lion,Canada,Oboe,...,1108bcd6c,33dd3cf4b,1,Novice,Boiling Hot,g,V,TR,3,1
499998,1,0,0,T,Y,Blue,Star,Hamster,Costa Rica,Bassoon,...,606ac930b,d4cf587dd,2,Grandmaster,Boiling Hot,g,X,Ye,2,1


#### 원핫인코딩

In [5]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every column of the combined frame: learn the categories
# first, then transform the same data with the fitted encoder.
encoder = OneHotEncoder()
encoder.fit(all_data)
all_data_encoded = encoder.transform(all_data)

#### 데이터 나누기

In [6]:
# Labels come straight from the original train frame.
y = train['target']

# `train` was concatenated first, so the first len(train) rows of the encoded
# matrix are the train rows and the remainder are the test rows.
num_train = len(train)
X_train, X_test = all_data_encoded[:num_train], all_data_encoded[num_train:]

#### 트레인데이터를 훈련데이터와 검증데이터로 나누기

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows as a validation set.
#
# The features are sliced from `all_data_encoded` instead of being read from
# `X_train`: this cell rebinds `X_train` to the 90% training part, so reading
# `X_train` here would make a second run of the cell pass the already-reduced
# matrix together with the full-length `y` and fail on the mismatched sample
# counts. Slicing from the encoded matrix keeps the cell re-runnable and gives
# the same split every time.
X_train, X_valid, y_train, y_valid = train_test_split(
    all_data_encoded[:num_train],
    y,
    test_size=0.1,
    stratify=y,  # keep the class ratio of y the same in the train and validation parts
    random_state=10,
)

### 모델훈련

In [9]:
#로지스틱 회귀 모델
from sklearn.linear_model import LogisticRegression

# Create the classifier; max_iter caps the number of iterations used to
# update the regression coefficients.
logistic_model = LogisticRegression(
    random_state=42,
    max_iter=1000,
)
logistic_model.fit(X_train, y_train)  # train on the training split

### 모델 성능 검증

- predict() : 타깃값 자체를 예측
- predict_proba() : 타깃값의 확률을 예측 (0일 확률과 1일 확률)

In [10]:
# Predicted class labels for the validation rows (displayed as the cell output).
logistic_model.predict(X_valid)

array([1, 0, 0, ..., 1, 1, 0], dtype=int64)

In [11]:
# Predicted probabilities for the validation rows: one row per sample,
# first column for target 0 and second column for target 1.
logistic_model.predict_proba(X_valid)

array([[0.23286161, 0.76713839],
       [0.91415399, 0.08584601],
       [0.83038333, 0.16961667],
       ...,
       [0.24863897, 0.75136103],
       [0.4946746 , 0.5053254 ],
       [0.95658381, 0.04341619]])

In [13]:
# Keep only the second column, i.e. the probability that the target is 1.
logistic_model.predict_proba(X_valid)[:,1] 

array([0.76713839, 0.08584601, 0.16961667, ..., 0.75136103, 0.5053254 ,
       0.04341619])

In [14]:
# Predict on the validation data: store the probability that the target is 1
# for each validation row, to be scored against y_valid.
y_valid_preds = logistic_model.predict_proba(X_valid)[:,1]

- ROC AUC 점수 : 모델의 분류 성능 평가 점수. 0.5는 무작위 추측 수준이며, 1에 가까울수록 모델의 성능이 좋음.
- 0.5 ~ 0.7 : 무작위 추측보다는 낫지만, 여전히 개선의 여지 있음.
- 0.7 ~ 0.8 : 양호
- 0.8 ~ 0.9 : 매우 좋은 성능
- 0.9 이상 : 우수한 성능. 의료 영상 분석과 같이 높은 정확도가 필요한 분야에 매우 바람직.
- 그러나 ROC AUC 점수만으로 모델을 판단하는 것은 부적절하며, 다른 평가 지표들과 함께 고려해야 함.

In [15]:
from sklearn.metrics import roc_auc_score

# Score the validation predictions: y_valid holds the true labels and
# y_valid_preds the predicted probabilities of target 1.
roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_preds)

# Report the score rounded to four decimal places.
print(f'검증 데이터 ROC AUC : {roc_auc:.4f}')

검증 데이터 ROC AUC : 0.7965


### 예측 및 결과 제출

In [18]:
# Probability that the target is 1 for every test row (second column of predict_proba).
y_preds = logistic_model.predict_proba(X_test)[:,1]

In [19]:
# Put the test-set probabilities into the submission frame's `target` column
# (assigned positionally, so this assumes the test rows are in the same order
# as the sample submission — TODO confirm), then write it to submission.csv.
# The frame was loaded with `id` as its index, so the index is written too.
submission['target'] = y_preds
submission.to_csv('submission.csv')