# 여행자 보험 가입여부 분류
종속 변수: TravelInsurance. TravelInsurance가 1일 확률을 예측하여 제출하라.

평가지표 : auc

제출 파일에는 ID, proba 두 개의 컬럼만 존재해야 한다.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

In [2]:
# Locations of the train / test CSV files for the travel-insurance task.
train_url = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e3_p2_train_.csv'
test_url = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e3_p2_test_.csv'

# Read both files into DataFrames.
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

# Preview the first rows of each frame; the trailing expression is echoed
# by the notebook, the first one needs an explicit display() call.
display(train.head())
test.head()

Unnamed: 0,ID,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,1008,26,Private Sector/Self Employed,Yes,1400000,5,0,No,Yes,1
1,199,30,Private Sector/Self Employed,No,1450000,5,0,Yes,Yes,1
2,86,32,Government Sector,Yes,900000,4,0,No,No,0
3,560,26,Private Sector/Self Employed,Yes,1400000,7,0,No,Yes,1
4,161,34,Private Sector/Self Employed,No,1400000,3,1,No,Yes,1


Unnamed: 0,ID,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad
0,6,31,Government Sector,Yes,1300000,4,0,No,No
1,9,33,Government Sector,Yes,800000,3,0,Yes,No
2,20,28,Private Sector/Self Employed,Yes,1150000,6,0,Yes,No
3,21,29,Private Sector/Self Employed,Yes,350000,3,0,No,No
4,23,28,Government Sector,Yes,600000,9,0,No,No


In [3]:
# Print a summary of the training frame: column names, non-null counts and dtypes.
train.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   1490 non-null   int64 
 1   Age                  1490 non-null   int64 
 2   Employment Type      1490 non-null   object
 3   GraduateOrNot        1490 non-null   object
 4   AnnualIncome         1490 non-null   int64 
 5   FamilyMembers        1490 non-null   int64 
 6   ChronicDiseases      1490 non-null   int64 
 7   FrequentFlyer        1490 non-null   object
 8   EverTravelledAbroad  1490 non-null   object
 9   TravelInsurance      1490 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 116.5+ KB


In [4]:
# The target label is imbalanced: count how many rows fall into each class.

train.TravelInsurance.value_counts()

0    959
1    531
Name: TravelInsurance, dtype: int64

In [5]:
# Count missing values per column; the training data contains none.

train.isnull().sum() 

ID                     0
Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           0
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
TravelInsurance        0
dtype: int64

In [6]:
# Frequency of each 'Employment Type' value in the training set.
train['Employment Type'].value_counts()

Private Sector/Self Employed    1061
Government Sector                429
Name: Employment Type, dtype: int64

In [7]:
# Frequency of each 'Employment Type' value in the test set.
test['Employment Type'].value_counts()

Private Sector/Self Employed    356
Government Sector               141
Name: Employment Type, dtype: int64

In [8]:
# Frequency of each 'EverTravelledAbroad' value in the training set.
train['EverTravelledAbroad'].value_counts()

No     1207
Yes     283
Name: EverTravelledAbroad, dtype: int64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

Unnamed: 0,ID,Age,AnnualIncome,FamilyMembers,ChronicDiseases,TravelInsurance
count,1490.0,1490.0,1490.0,1490.0,1490.0,1490.0
mean,994.842282,29.716779,932047.0,4.761745,0.285235,0.356376
std,573.43807,2.931325,376065.5,1.615161,0.451678,0.479089
min,0.0,25.0,300000.0,2.0,0.0,0.0
25%,509.5,28.0,600000.0,4.0,0.0,0.0
50%,997.5,29.0,900000.0,5.0,0.0,0.0
75%,1491.75,32.0,1250000.0,6.0,1.0,1.0
max,1986.0,35.0,1800000.0,9.0,1.0,1.0


In [10]:
# Collect the object-dtype (categorical) columns that will be label-encoded.

object_columns = train.select_dtypes(include='object').columns
categorical = list(object_columns)
categorical

['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad']

In [11]:
for category in categorical:
    # Learn the label -> integer mapping from the training column only and
    # reuse that same mapping for the test column. Fitting a second time on
    # the test column could assign different integers to the same label
    # whenever the two frames do not contain an identical set of categories.
    le = LabelEncoder()
    train[category] = le.fit_transform(train[category])
    # Raises ValueError if the test column holds a label unseen in training.
    test[category] = le.transform(test[category])

In [12]:
# Re-check the column dtypes now that the object columns have been label-encoded.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   ID                   1490 non-null   int64
 1   Age                  1490 non-null   int64
 2   Employment Type      1490 non-null   int32
 3   GraduateOrNot        1490 non-null   int32
 4   AnnualIncome         1490 non-null   int64
 5   FamilyMembers        1490 non-null   int64
 6   ChronicDiseases      1490 non-null   int64
 7   FrequentFlyer        1490 non-null   int32
 8   EverTravelledAbroad  1490 non-null   int32
 9   TravelInsurance      1490 non-null   int64
dtypes: int32(4), int64(6)
memory usage: 93.2 KB


## 모델링

In [13]:
# Target: the insurance flag.
y = train['TravelInsurance']
# Features: every column except the row identifier and the target itself.
x = train.drop(['ID', 'TravelInsurance'], axis=1)

In [14]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [37]:
# Random forest with tree depth capped at 8; random_state fixes the seed for reproducibility.
rfc = RandomForestClassifier(max_depth=8, random_state=42)

In [38]:
# Fit on the training part only; x_val / y_val stay held out for scoring.
rfc.fit(x_train, y_train)

In [39]:
# Class probabilities for the validation rows: one row per sample, one column per class.
predict = rfc.predict_proba(x_val)
predict

array([[0.78625492, 0.21374508],
       [0.74713044, 0.25286956],
       [0.82022626, 0.17977374],
       [0.82987934, 0.17012066],
       [0.4863486 , 0.5136514 ],
       [0.85258972, 0.14741028],
       [0.72650523, 0.27349477],
       [0.07679834, 0.92320166],
       [0.11274918, 0.88725082],
       [0.88160669, 0.11839331],
       [0.85283028, 0.14716972],
       [0.7818036 , 0.2181964 ],
       [0.84893072, 0.15106928],
       [0.83949398, 0.16050602],
       [0.69105045, 0.30894955],
       [0.95976238, 0.04023762],
       [0.73416559, 0.26583441],
       [0.8487586 , 0.1512414 ],
       [0.78473014, 0.21526986],
       [0.42209798, 0.57790202],
       [0.02462388, 0.97537612],
       [0.93530283, 0.06469717],
       [0.70730051, 0.29269949],
       [0.83328226, 0.16671774],
       [0.76467005, 0.23532995],
       [0.8090823 , 0.1909177 ],
       [0.0239352 , 0.9760648 ],
       [0.72215459, 0.27784541],
       [0.86782255, 0.13217745],
       [0.84098276, 0.15901724],
       [0.

In [40]:
# Second probability column; with the 0/1 target this is P(TravelInsurance == 1).
predict[:, 1]

array([0.21374508, 0.25286956, 0.17977374, 0.17012066, 0.5136514 ,
       0.14741028, 0.27349477, 0.92320166, 0.88725082, 0.11839331,
       0.14716972, 0.2181964 , 0.15106928, 0.16050602, 0.30894955,
       0.04023762, 0.26583441, 0.1512414 , 0.21526986, 0.57790202,
       0.97537612, 0.06469717, 0.29269949, 0.16671774, 0.23532995,
       0.1909177 , 0.9760648 , 0.27784541, 0.13217745, 0.15901724,
       0.07380906, 0.2487195 , 0.04320387, 0.16270795, 0.02745161,
       0.21811835, 0.26643723, 0.73175489, 0.16664172, 0.40528334,
       0.22485148, 0.15114556, 0.93881916, 0.37072016, 0.98032601,
       0.38043451, 0.15035946, 0.2234441 , 0.24670325, 0.26501611,
       0.36024154, 0.22680444, 0.63978587, 0.27404768, 0.179407  ,
       0.12976883, 0.69579668, 0.27850669, 0.32314106, 0.11602513,
       0.94120009, 0.25026043, 0.81330914, 0.17336477, 0.2234441 ,
       0.22071952, 0.07380906, 0.18123822, 0.15462938, 0.10761313,
       0.16063994, 0.79762243, 0.16377951, 0.14272654, 0.14044

In [41]:
# ROC-AUC on the hold-out split, scored on the predicted probability of class 1.
positive_proba = predict[:, 1]
auc_score = roc_auc_score(y_val, positive_proba)
auc_score

0.7643229166666667

## 예측

In [42]:
# Fresh model for the final predictions, with the same hyperparameters as the validated one.
rfc_final = RandomForestClassifier(max_depth=8, random_state=42)

In [43]:
# Refit on the full training data (training and validation rows together).
rfc_final.fit(x, y)

In [44]:
# Remove the identifier so the test frame holds only the feature columns,
# then compute the class probabilities for every test row.
test_features = test.drop('ID', axis=1)
predict_final = rfc_final.predict_proba(test_features)

In [45]:
# Echo the test-set probability matrix for a visual check.
predict_final

array([[0.83236596, 0.16763404],
       [0.67929871, 0.32070129],
       [0.79684303, 0.20315697],
       [0.85786612, 0.14213388],
       [0.76103631, 0.23896369],
       [0.01880169, 0.98119831],
       [0.7919031 , 0.2080969 ],
       [0.98928496, 0.01071504],
       [0.76902171, 0.23097829],
       [0.04008116, 0.95991884],
       [0.91760632, 0.08239368],
       [0.75525842, 0.24474158],
       [0.81786049, 0.18213951],
       [0.03395833, 0.96604167],
       [0.80760773, 0.19239227],
       [0.78690165, 0.21309835],
       [0.791162  , 0.208838  ],
       [0.98986189, 0.01013811],
       [0.70658744, 0.29341256],
       [0.837577  , 0.162423  ],
       [0.80518593, 0.19481407],
       [0.34835248, 0.65164752],
       [0.01396649, 0.98603351],
       [0.76633455, 0.23366545],
       [0.73973134, 0.26026866],
       [0.81777506, 0.18222494],
       [0.84552417, 0.15447583],
       [0.83720995, 0.16279005],
       [0.81547643, 0.18452357],
       [0.75573621, 0.24426379],
       [0.

In [46]:
# Start with an empty frame that will hold the submission columns.
submission = pd.DataFrame()

In [47]:
# Submission frame with exactly two columns: the test row ID and the
# predicted probability of the positive class (second predict_proba column).
submission = pd.DataFrame(
    {
        'ID': test['ID'],
        'proba': predict_final[:, 1],
    }
)
submission

Unnamed: 0,ID,proba
0,6,0.167634
1,9,0.320701
2,20,0.203157
3,21,0.142134
4,23,0.238964
...,...,...
492,1964,0.142386
493,1970,0.217322
494,1973,0.970272
495,1976,0.285290


In [48]:
# Write the submission without the index so the file contains only the ID and proba columns.
submission.to_csv("000000.csv", index=False)