이진분류

In [214]:
# 환자의 당뇨병 여부 예측
import pandas as pd
train = pd.read_csv('diabetes_train.csv')
test = pd.read_csv('diabetes_test.csv')

In [216]:
# 탐색적 자료 분석
# 데이터 개수 파악
print(train.shape)
print(test.shape)

(614, 9)
(154, 8)


In [218]:
# 데이터 미리보기
print(train.head())
print(test.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            1      118             58             36       94  33.3   
1            3      173             78             39      185  33.8   
2            4      109             64             44       99  34.8   
3            9       57             80             37        0  32.8   
4            2      129              0              0        0  38.5   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.261   23        0  
1                     0.970   31        1  
2                     0.905   26        1  
3                     0.096   41        0  
4                     0.304   41        0  
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            3      102             74              0        0  29.5   
1            5      104             74              0        0  28.8   
2            4       95             70             32        0  32.1   
3            1 

In [220]:
# 데이터 정보 파악
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               614 non-null    int64  
 1   Glucose                   614 non-null    int64  
 2   BloodPressure             614 non-null    int64  
 3   SkinThickness             614 non-null    int64  
 4   Insulin                   614 non-null    int64  
 5   BMI                       614 non-null    float64
 6   DiabetesPedigreeFunction  614 non-null    float64
 7   Age                       614 non-null    int64  
 8   Outcome                   614 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 43.3 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies     

In [222]:
# 결측치 개수 파악
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

0
0


In [224]:
# 목표 변수 빈도 파악
print(train['Outcome'].value_counts())

Outcome
0    403
1    211
Name: count, dtype: int64


In [226]:
# 데이터 전처리
# 결측치가 없으므로 별도의 결측치 처리는 하지 않음
# 범주형 변수가 없으므로 인코딩을 하지 않음
# 목표 변수 분리
target = train.pop('Outcome')

In [230]:
# 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

In [232]:
# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(train, target, test_size=0.2, random_state=0)

In [234]:
# 로지스틱 회귀
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(X_train, Y_train)
pred = lr.predict_proba(X_val)

In [236]:
# 모델 평가
from sklearn.metrics import roc_auc_score
score = roc_auc_score(Y_val, pred[:,1])
print(score)

0.8057534246575342


In [238]:
# 의사결정나무
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=3, random_state=0)
dt.fit(X_train, Y_train)
pred = dt.predict_proba(X_val)

In [240]:
score = roc_auc_score(Y_val, pred[:,1])
print(score)
# 0.6256164383561644
# 0.7212328767123288 (max_depth=5)
# 0.7941095890410959 (max_depth=3)

0.7941095890410959


In [242]:
# xgboost
import xgboost as xgb
xg = xgb.XGBClassifier(random_state=0)
xg.fit(X_train, Y_train)
pred = xg.predict_proba(X_val)

In [244]:
score = roc_auc_score(Y_val, pred[:,1])
print(score)

0.7906849315068494


In [246]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=5, n_estimators=500, random_state=0)
rf.fit(X_train, Y_train)
pred = rf.predict_proba(X_val)

In [248]:
score = roc_auc_score(Y_val, pred[:,1])
print(score)
# 0.8001369863013699
# 0.8123287671232877 (max_depth=3)
# 0.8145205479452055 (max_depth=5)
# 0.8219178082191781 (max_depth=5, n_estimators=200)
# 0.8221917808219178 (max_depth=5, n_estimators=400)
# 0.8246575342465754 (max_depth=5, n_estimators=500)

0.8246575342465753


In [250]:
# 결과 예측 및 파일 생성
pred = rf.predict_proba(test)
submit = pd.DataFrame({'pred':pred[:,1]})
submit.to_csv('result.csv', index=False)

In [252]:
pd.read_csv('result.csv').head()

Unnamed: 0,pred
0,0.183228
1,0.253687
2,0.146948
3,0.060001
4,0.075731


결측치가 있는 경우, 범주형 변수가 있는 경우

In [254]:
# 이직 여부 예측
import pandas as pd
train = pd.read_csv('hr_train.csv')
test = pd.read_csv('hr_test.csv')

In [256]:
# 탐색적 자료 분석
# 데이터 개수 파악
print(train.shape)
print(test.shape)

(15326, 14)
(3832, 13)


In [258]:
# 데이터 미리보기
print(train.head())
print(test.head())

   enrollee_id      city  city_development_index gender  \
0        30266   city_84                   0.698   Male   
1        13254   city_16                   0.910   Male   
2        31675   city_21                   0.624   Male   
3        30804  city_104                   0.924   Male   
4        18269  city_143                   0.740   Male   

       relevent_experience enrolled_university education_level  \
0   No relevent experience    Full time course     High School   
1  Has relevent experience       no_enrollment        Graduate   
2   No relevent experience    Full time course     High School   
3   No relevent experience       no_enrollment        Graduate   
4  Has relevent experience       no_enrollment        Graduate   

  major_discipline experience company_size company_type last_new_job  \
0              NaN          1      100-500      Pvt Ltd        never   
1             STEM         15        50-99      Pvt Ltd            1   
2              NaN          3   

In [260]:
# 데이터 정보 파악
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11750 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15012 non-null  object 
 6   education_level         14961 non-null  object 
 7   major_discipline        13045 non-null  object 
 8   experience              15272 non-null  object 
 9   company_size            10539 non-null  object 
 10  company_type            10383 non-null  object 
 11  last_new_job            14984 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target                  15326 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [262]:
# 결측치 파악
print(train.isnull().sum())
print(test.isnull().sum())

enrollee_id                  0
city                         0
city_development_index       0
gender                    3576
relevent_experience          0
enrolled_university        314
education_level            365
major_discipline          2281
experience                  54
company_size              4787
company_type              4943
last_new_job               342
training_hours               0
target                       0
dtype: int64
enrollee_id                  0
city                         0
city_development_index       0
gender                     932
relevent_experience          0
enrolled_university         72
education_level             95
major_discipline           532
experience                  11
company_size              1151
company_type              1197
last_new_job                81
training_hours               0
dtype: int64


In [264]:
# 목표 변수 빈도 파악
print(train['target'].value_counts())

target
0.0    11517
1.0     3809
Name: count, dtype: int64


In [266]:
# 데이터 전처리
# 결측치 처리
# 칼럼 삭제
train.drop(['enrollee_id','gender', 'company_size', 'company_type'], axis=1, inplace=True)
test.drop(['enrollee_id','gender', 'company_size', 'company_type'], axis=1, inplace=True)

In [268]:
# 결측치 대체
train.fillna('X', inplace=True)
test.fillna('X', inplace=True)

In [270]:
# 목표변수 분리
target = train.pop('target')

In [272]:
# 범주형 변수 원-핫 인코딩
combined = pd.concat([train, test])
combined_dummies = pd.get_dummies(combined)
n_train = len(train)
train = combined_dummies[:n_train]
test = combined_dummies[n_train:]

In [None]:
# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(train, target, test_size=0.2, random_state=0)

In [321]:
# 머신러닝 모델 학습
# 의사결정나무
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=5, class_weight='balanced', random_state=0)
dt.fit(X_train, Y_train)
pred = dt.predict(X_val)

In [282]:
# 모델 평가
from sklearn.metrics import roc_auc_score
score = roc_auc_score(Y_val, pred[:,1])
print(score)
# 0.602371219464228
# 0.7383019371514461 (max_depth=5)
# 0.738418414410871 (max_depth=5, class_weight='balacned')

0.738418414410871


In [284]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=7, n_estimators=300, class_weight='balanced', random_state=0)
rf.fit(X_train, Y_train)
pred = rf.predict_proba(X_val)

In [286]:
score = roc_auc_score(Y_val, pred[:,1])
print(score)
# 0.7089098665018426
# 0.7336334817686752 (max_depth=5)
# 0.7342029911828472 (max_depth=7)
# 0.7365951648475143 (max_depth=7, n_estimators=300)
# 0.7389258806617313 (max_depth=7, n_estimators=300, class_weight='balanced')

0.7389153450302255


In [288]:
# 로지스틱 회귀
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(X_train, Y_train)
pred = lr.predict_proba(X_val)

In [290]:
score = roc_auc_score(Y_val, pred[:,1])
print(score)

0.7440166392740247


In [292]:
# 결과 예측 및 파일 생성
pred = lr.predict_proba(test)
submit = pd.DataFrame({'pred':pred[:,1]})
submit.to_csv('result.csv', index=False)

In [294]:
pd.read_csv('result.csv').head()

Unnamed: 0,pred
0,0.207092
1,0.454818
2,0.541545
3,0.170645
4,0.095378


목표변수 불균형이 심한 경우

In [323]:
# 신용카드 신청자의 미래 신용 예측
import pandas as pd
train = pd.read_csv('creditcard_train.csv')
test = pd.read_csv('creditcard_test.csv')

In [325]:
# 탐색적 자료 분석
print(train.shape)
print(test.shape)

(25519, 19)
(7591, 18)


In [327]:
# 데이터 미리보기
print(train.head())
print(test.head())

        ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
0  5010092           F            N               N             0   
1  5116701           M            N               Y             1   
2  5069290           F            N               Y             0   
3  5113315           F            Y               Y             0   
4  5148853           M            Y               Y             0   

   AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0          225000.0  Commercial associate  Secondary / secondary special   
1          112500.0  Commercial associate  Secondary / secondary special   
2          270000.0  Commercial associate              Incomplete higher   
3          153000.0               Working  Secondary / secondary special   
4          261000.0               Working  Secondary / secondary special   

  NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
0            Married  House / apartment      -10304       

In [329]:
# 데이터 정보 파악
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25519 entries, 0 to 25518
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   25519 non-null  int64  
 1   CODE_GENDER          25519 non-null  object 
 2   FLAG_OWN_CAR         25519 non-null  object 
 3   FLAG_OWN_REALTY      25519 non-null  object 
 4   CNT_CHILDREN         25519 non-null  int64  
 5   AMT_INCOME_TOTAL     25519 non-null  float64
 6   NAME_INCOME_TYPE     25519 non-null  object 
 7   NAME_EDUCATION_TYPE  25519 non-null  object 
 8   NAME_FAMILY_STATUS   25519 non-null  object 
 9   NAME_HOUSING_TYPE    25519 non-null  object 
 10  DAYS_BIRTH           25519 non-null  int64  
 11  DAYS_EMPLOYED        25519 non-null  int64  
 12  FLAG_MOBIL           25519 non-null  int64  
 13  FLAG_WORK_PHONE      25519 non-null  int64  
 14  FLAG_PHONE           25519 non-null  int64  
 15  FLAG_EMAIL           25519 non-null 

In [331]:
# 결측치 개수 파악
print(train.isnull().sum())
print(test.isnull().sum())

ID                        0
CODE_GENDER               0
FLAG_OWN_CAR              0
FLAG_OWN_REALTY           0
CNT_CHILDREN              0
AMT_INCOME_TOTAL          0
NAME_INCOME_TYPE          0
NAME_EDUCATION_TYPE       0
NAME_FAMILY_STATUS        0
NAME_HOUSING_TYPE         0
DAYS_BIRTH                0
DAYS_EMPLOYED             0
FLAG_MOBIL                0
FLAG_WORK_PHONE           0
FLAG_PHONE                0
FLAG_EMAIL                0
OCCUPATION_TYPE        7976
CNT_FAM_MEMBERS           0
STATUS                    0
dtype: int64
ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
DAYS_BIRTH             0
DAYS_EMPLOYED          0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
OCCUPATION_TYPE        0
CNT_FAM_MEMBERS        0
dtype

In [333]:
# 목표 변수 빈도 확인
print(train['STATUS'].value_counts())

STATUS
0    25085
1      434
Name: count, dtype: int64


In [335]:
# 데이터 전처리
# 결측치 처리
# 칼럼 제거
train.drop('OCCUPATION_TYPE', axis=1, inplace=True)
test.drop('OCCUPATION_TYPE', axis=1, inplace=True)

In [337]:
# 목표 변수 분리
target = train.pop('STATUS')

In [339]:
# 범주형 변수 원핫인코딩
combined = pd.concat([train, test])
combined_dummies = pd.get_dummies(combined)
n_train = len(train)
train = combined_dummies[:n_train]
test = combined_dummies[n_train:]

In [408]:
# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(train, target, test_size=0.2, random_state=0)

In [414]:
# 머신러닝 모델 학습
# 로지스틱 회귀
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(X_train, Y_train)
pred = lr.predict(X_val)

In [416]:
# 모델 평가
from sklearn.metrics import f1_score
score = f1_score(Y_val, pred)
print(score)

0.0


In [428]:
# 의사결정나무
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(class_weight='balanced',random_state=0)
dt.fit(X_train, Y_train)
pred = dt.predict(X_val)

In [430]:
score = f1_score(Y_val, pred)
print(score)
# 0.27692307692307694
# 0.28421052631578947 (class_weight='balanced')

0.28421052631578947


In [432]:
# xgboost
import xgboost as xgb
xg = xgb.XGBClassifier(random_state=0)
xg.fit(X_train, Y_train)
pred = xg.predict(X_val)

In [434]:
score = f1_score(Y_val, pred)
print(score)

0.26865671641791045


In [470]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, Y_train)
pred = rf.predict(X_val)

In [471]:
score = f1_score(Y_val, pred)
print(score)
# 0.2658959537572254

0.2658959537572254


In [462]:
# 결과 예측 및 파일 생성
pred = rf.predict(test)
submit = pd.DataFrame({'pred':pred})
submit.to_csv('result.csv', index=False)

In [464]:
pd.read_csv('result.csv').head()

Unnamed: 0,pred
0,0
1,0
2,0
3,0
4,0


다중분류

In [476]:
# 신용 등급 예측
import pandas as pd
train = pd.read_csv('score_train.csv')
test = pd.read_csv('score_test.csv')

In [478]:
# 탐색적 자료 분석
# 데이터 개수 파악
print(train.shape)
print(test.shape)

(4198, 21)
(1499, 20)


In [480]:
# 데이터 미리보기
print(train.head())
print(test.head())

   Delay_from_due_date  Num_of_Delayed_Payment  Num_Credit_Inquiries  \
0                 26.0                    10.0                   5.0   
1                  6.0                    17.0                   4.0   
2                 29.0                    16.0                   6.0   
3                 14.0                     6.0                   2.0   
4                  9.0                    10.0                   4.0   

   Credit_Utilization_Ratio  Credit_History_Age Payment_of_Min_Amount  \
0                 30.358905               200.0                    NM   
1                 24.589796                70.0                   Yes   
2                 39.456866                14.0                   Yes   
3                 28.368731               245.0                    NM   
4                 25.099529               220.0                    No   

   Amount_invested_monthly  Monthly_Balance Credit_Mix  \
0               192.341800       321.431503       Good   
1           

In [482]:
# 데이터 정보 파악
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4198 entries, 0 to 4197
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Delay_from_due_date       4198 non-null   float64
 1   Num_of_Delayed_Payment    4198 non-null   float64
 2   Num_Credit_Inquiries      4198 non-null   float64
 3   Credit_Utilization_Ratio  4198 non-null   float64
 4   Credit_History_Age        4198 non-null   float64
 5   Payment_of_Min_Amount     4198 non-null   object 
 6   Amount_invested_monthly   4198 non-null   float64
 7   Monthly_Balance           4198 non-null   float64
 8   Credit_Mix                4198 non-null   object 
 9   Payment_Behaviour         4198 non-null   object 
 10  Age                       4198 non-null   float64
 11  Annual_Income             4198 non-null   float64
 12  Num_Bank_Accounts         4198 non-null   float64
 13  Num_Credit_Card           4198 non-null   float64
 14  Interest

In [484]:
# 결측치 개수 파악
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

0
0


In [486]:
# 목표변수 빈도 파악
print(train['Credit_Score'].value_counts())

Credit_Score
Standard    2225
Poor        1232
Good         741
Name: count, dtype: int64


In [488]:
# 데이터 전처리
# 목표변수 분리
target = train.pop('Credit_Score')

In [490]:
# 범주형 변수 원-핫 인코딩
combined = pd.concat([train, test])
combined_dummies = pd.get_dummies(combined)
n_train = len(train)
train = combined_dummies[:n_train]
test = combined_dummies[n_train:]

In [492]:
# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(train, target, test_size=0.2, random_state=0)

In [494]:
# 머신러닝 모델 학습
# 로지스틱 회귀
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(X_train, Y_train)
pred = lr.predict(X_val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [498]:
# 모델 평가
from sklearn.metrics import f1_score
score = f1_score(Y_val, pred, average='macro')
print(score)

0.358825362042318


In [516]:
# 의사결정나무
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=5, class_weight='balanced', random_state=0)
dt.fit(X_train, Y_train)
pred = dt.predict(X_val)

In [518]:
score = f1_score(Y_val, pred, average='macro')
print(score)
# 0.6022676139978193
# 0.6732225936086008 (max_depth=5)
# 0.6737692606290663 (max_depth=5, class_weight='balanced')

0.6737692606290663


In [544]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, Y_train)
pred = rf.predict(X_val)

In [545]:
score = f1_score(Y_val, pred, average='macro')
print(score)
# 0.7019165695111894

0.7019165695111894


In [550]:
# 결과 예측 및 파일 생성
pred = rf.predict(test)
submit = pd.DataFrame({'pred':pred})
submit.to_csv('result.csv', index=False)

In [552]:
pd.read_csv('result.csv').head()

Unnamed: 0,pred
0,Poor
1,Good
2,Standard
3,Good
4,Standard


In [554]:
# 약물 종류 예측
import pandas as pd
train = pd.read_csv('drug_train.csv')
test = pd.read_csv('drug_test.csv')

In [556]:
# 탐색적 자료 분석
# 데이터 개수 파악
print(train.shape)
print(test.shape)

(100, 6)
(100, 5)


In [558]:
# 데이터 미리보기
print(train.head())
print(test.head())

   Age Sex      BP Cholesterol  Na_to_K   Drug
0   70   M    HIGH        HIGH    9.849  drugB
1   36   M     LOW      NORMAL   11.424  drugX
2   23   F    HIGH        HIGH   25.355  DrugY
3   40   F  NORMAL        HIGH   10.103  drugX
4   45   M     LOW      NORMAL   10.017  drugX
   Age Sex      BP Cholesterol  Na_to_K
0   74   F     LOW        HIGH   20.942
1   65   M    HIGH      NORMAL   34.997
2   58   F     LOW        HIGH   38.247
3   34   M  NORMAL        HIGH   22.456
4   59   M    HIGH        HIGH   13.935


In [560]:
# 데이터 정보 파악
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          100 non-null    int64  
 1   Sex          100 non-null    object 
 2   BP           100 non-null    object 
 3   Cholesterol  100 non-null    object 
 4   Na_to_K      100 non-null    float64
 5   Drug         100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          100 non-null    int64  
 1   Sex          100 non-null    object 
 2   BP           100 non-null    object 
 3   Cholesterol  100 non-null    object 
 4   Na_to_K      100 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.0+ KB
None


In [562]:
# 결측치 개수 파악
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

0
0


In [564]:
# 목표변수 빈도 확인
print(train['Drug'].value_counts())

Drug
DrugY    41
drugX    34
drugA    13
drugB     8
drugC     4
Name: count, dtype: int64


In [566]:
# 데이터 전처리
# 목표변수 분리
target = train.pop('Drug')

In [568]:
# 범주형 변수 원핫인코딩
combined = pd.concat([train, test])
combined_dummies = pd.get_dummies(combined)
n_train = len(train)
train = combined_dummies[:n_train]
test = combined_dummies[n_train:]

In [594]:
# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(train, target, test_size=0.25, random_state=0)

In [596]:
# 머신러닝 모델 학습
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(X_train, Y_train)
pred = lr.predict(X_val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [598]:
# 모델 평가
from sklearn.metrics import f1_score
score = f1_score(Y_val, pred, average='macro')
print(score)

0.78


In [620]:
# 의사결정나무
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, Y_train)
pred = dt.predict(X_val)

In [622]:
score = f1_score(Y_val, pred, average='macro')
print(score)
# 0.78

0.78


In [644]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, Y_train)
pred = rf.predict(X_val)

In [646]:
score = f1_score(Y_val, pred, average='macro')
print(score)
# 0.9228070175438596

0.9228070175438596


In [648]:
# 결과 예측 및 파일 생성
pred = rf.predict(test)
submit = pd.DataFrame({'pred':pred})
submit.to_csv('result.csv', index=False)

In [650]:
pd.read_csv('result.csv').head()

Unnamed: 0,pred
0,DrugY
1,DrugY
2,DrugY
3,DrugY
4,drugB


In [652]:
# 유리 종류 예측
import pandas as pd
train = pd.read_csv('glass_train.csv')
test = pd.read_csv('glass_test.csv')

In [654]:
# 탐색적 자료 분석
# 데이터 개수 파악
print(train.shape)
print(test.shape)

(149, 10)
(65, 9)


In [656]:
# 데이터 미리보기
print(train.head())
print(test.head())

        RI     Na    Mg    Al     Si     K    Ca   Ba    Fe  Type
0  1.51829  14.46  2.24  1.62  72.38  0.00  9.26  0.0  0.00     6
1  1.51610  13.33  3.53  1.34  72.67  0.56  8.33  0.0  0.00     3
2  1.52172  13.48  3.74  0.90  72.01  0.18  9.61  0.0  0.07     1
3  1.51905  13.60  3.62  1.11  72.64  0.14  8.76  0.0  0.00     1
4  1.51631  13.34  3.57  1.57  72.87  0.61  7.89  0.0  0.00     2
        RI     Na    Mg    Al     Si     K     Ca    Ba    Fe
0  1.51748  12.86  3.56  1.27  73.21  0.54   8.38  0.00  0.17
1  1.52058  12.85  1.61  2.17  72.18  0.76   9.70  0.24  0.51
2  1.52475  11.45  0.00  1.88  72.19  0.81  13.24  0.00  0.34
3  1.51690  13.33  3.54  1.61  72.54  0.68   8.11  0.00  0.00
4  1.52177  13.75  1.01  1.36  72.19  0.33  11.14  0.00  0.00


In [658]:
# 데이터 정보 파악
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      149 non-null    float64
 1   Na      149 non-null    float64
 2   Mg      149 non-null    float64
 3   Al      149 non-null    float64
 4   Si      149 non-null    float64
 5   K       149 non-null    float64
 6   Ca      149 non-null    float64
 7   Ba      149 non-null    float64
 8   Fe      149 non-null    float64
 9   Type    149 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 11.8 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      65 non-null     float64
 1   Na      65 non-null     float64
 2   Mg      65 non-null     float64
 3   Al      65 non-null     float64
 4   Si      65 non-null     float64
 5   K       65 non-null     float

In [660]:
# 결측치 개수 파악
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

0
0


In [662]:
# 목표 변수 빈도 확인
print(train['Type'].value_counts())

Type
2    53
1    49
7    23
3     9
5     8
6     7
Name: count, dtype: int64


In [664]:
# 데이터 전처리
# 목표 변수 분리
target = train.pop('Type')

In [666]:
# 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

In [668]:
# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(train, target, test_size=0.2, random_state=0)

In [670]:
# 머신러닝 모델 학습
# 로지스틱 회귀
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(X_train, Y_train)
pred = lr.predict(X_val)

In [672]:
# 모델 평가
from sklearn.metrics import f1_score
score = f1_score(Y_val, pred, average='weighted')
print(score)

0.6427494074552897


In [714]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=5, n_estimators=300, random_state=0)
rf.fit(X_train, Y_train)
pred = rf.predict(X_val)

In [716]:
score = f1_score(Y_val, pred, average='weighted')
print(score)
# 0.6119801766860591
# 0.6410714285714286 (max_depth=5)
# 0.6507936507936507 (max_depth=5, n_estimators=300)

0.6507936507936507


In [718]:
# 의사결정 나무
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, Y_train)
pred = dt.predict(X_val)

In [720]:
score = f1_score(Y_val, pred, average='weighted')
print(score)
# 0.6619047619047619

0.6619047619047619


In [722]:
# 결과 예측 및 모델 평가
pred = dt.predict(test)
submit = pd.DataFrame({'pred':pred})
submit.to_csv('result.csv', index=False)

In [724]:
pd.read_csv('result.csv').head()

Unnamed: 0,pred
0,2
1,5
2,5
3,2
4,7
