#### 01. 환자의 당뇨병 여부 예측

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [2]:
#1. 데이터 불러오기
train = pd.read_csv('diabetes_train.csv')
test = pd.read_csv('diabetes_test.csv')

train.shape, test.shape

((614, 9), (154, 8))

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               614 non-null    int64  
 1   Glucose                   614 non-null    int64  
 2   BloodPressure             614 non-null    int64  
 3   SkinThickness             614 non-null    int64  
 4   Insulin                   614 non-null    int64  
 5   BMI                       614 non-null    float64
 6   DiabetesPedigreeFunction  614 non-null    float64
 7   Age                       614 non-null    int64  
 8   Outcome                   614 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 43.3 KB


In [4]:
x_train = train.iloc[:, train.columns != 'Outcome']
y_train = train.iloc[:, -1]
x_test = test

In [5]:
x_train.shape, x_test.shape

((614, 8), (154, 8))

In [6]:
#2. 결측치 처리
x_train.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [7]:
#3. 스케일링
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
#4. 검정 데이터 분리
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=0)

In [9]:
#5. 모델 검정
rfc = RandomForestClassifier(random_state=0, max_depth=4)
rfc_model = rfc.fit(x_train, y_train)
y_pred = rfc_model.predict_proba(x_val)

In [10]:
x_test.shape

(154, 8)

In [11]:
#6. 검증하기
rocauc = roc_auc_score(y_val, y_pred[:,1])
rocauc

np.float64(0.8144051130776795)

In [12]:
pred = rfc_model.predict_proba(x_test)
submit = pd.DataFrame({'pred': pred[:,1]})
submit.to_csv('result4.csv',index=False)

In [13]:
nums = list(range(1,11))

### 02. 이직 여부 예측

In [50]:
#1. 데이터 불러오기
import pandas as pd

In [51]:
train = pd.read_csv('hr_train.csv')
test = pd.read_csv('hr_test.csv')

train.shape, test.shape

((15326, 14), (3832, 13))

In [52]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11750 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15012 non-null  object 
 6   education_level         14961 non-null  object 
 7   major_discipline        13045 non-null  object 
 8   experience              15272 non-null  object 
 9   company_size            10539 non-null  object 
 10  company_type            10383 non-null  object 
 11  last_new_job            14984 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target                  15326 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [53]:
x_train = train.iloc[:, train.columns != 'target']
y_train = train.iloc[:, -1]
x_test = test

y_train

0        0.0
1        0.0
2        0.0
3        1.0
4        0.0
        ... 
15321    0.0
15322    0.0
15323    1.0
15324    1.0
15325    0.0
Name: target, Length: 15326, dtype: float64

In [54]:
#2. 결측치 처리
x_train.isnull().sum()

x_train = x_train.fillna("X")
x_test = x_test.fillna("X")

In [55]:
len(x_train)

15326

In [56]:
# 합치기
x_full = pd.concat([x_train, x_test])
x_full.drop(columns='enrollee_id', inplace=True)
x_full.isnull().sum()

city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64

In [57]:
x_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19158 entries, 0 to 3831
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  object 
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  object 
 3   relevent_experience     19158 non-null  object 
 4   enrolled_university     19158 non-null  object 
 5   education_level         19158 non-null  object 
 6   major_discipline        19158 non-null  object 
 7   experience              19158 non-null  object 
 8   company_size            19158 non-null  object 
 9   company_type            19158 non-null  object 
 10  last_new_job            19158 non-null  object 
 11  training_hours          19158 non-null  int64  
dtypes: float64(1), int64(1), object(10)
memory usage: 1.9+ MB


In [58]:
#2.5 인코딩

x_full = pd.get_dummies(x_full)
x_train = x_full.iloc[:15326,:]
x_test = x_full.iloc[15326:,:]



In [59]:
#3. 스케일링
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [60]:
#4. 검정 데이터 분리
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state =0)

In [61]:
#5. 모델
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=0)
rfc_model = rfc.fit(x_train, y_train)

y_pred = rfc_model.predict_proba(x_val)
y_pred

array([[0.7 , 0.3 ],
       [0.34, 0.66],
       [0.76, 0.24],
       ...,
       [0.66, 0.34],
       [1.  , 0.  ],
       [0.3 , 0.7 ]])

In [62]:
#6. 평가
from sklearn.metrics import roc_auc_score
rocauc = roc_auc_score(y_val, y_pred[:,1])
rocauc

np.float64(0.7711485243092139)

### 03. 신용카드 신청자의 미래 신용 예측

In [124]:
import pandas as pd

In [125]:
#1. 데이터 불러오기
train = pd.read_csv('creditcard_train.csv')
test = pd.read_csv('creditcard_test.csv')

In [126]:
train.shape, test.shape

((25519, 19), (7591, 18))

In [127]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25519 entries, 0 to 25518
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   25519 non-null  int64  
 1   CODE_GENDER          25519 non-null  object 
 2   FLAG_OWN_CAR         25519 non-null  object 
 3   FLAG_OWN_REALTY      25519 non-null  object 
 4   CNT_CHILDREN         25519 non-null  int64  
 5   AMT_INCOME_TOTAL     25519 non-null  float64
 6   NAME_INCOME_TYPE     25519 non-null  object 
 7   NAME_EDUCATION_TYPE  25519 non-null  object 
 8   NAME_FAMILY_STATUS   25519 non-null  object 
 9   NAME_HOUSING_TYPE    25519 non-null  object 
 10  DAYS_BIRTH           25519 non-null  int64  
 11  DAYS_EMPLOYED        25519 non-null  int64  
 12  FLAG_MOBIL           25519 non-null  int64  
 13  FLAG_WORK_PHONE      25519 non-null  int64  
 14  FLAG_PHONE           25519 non-null  int64  
 15  FLAG_EMAIL           25519 non-null 

In [128]:
train.isnull().sum()

ID                        0
CODE_GENDER               0
FLAG_OWN_CAR              0
FLAG_OWN_REALTY           0
CNT_CHILDREN              0
AMT_INCOME_TOTAL          0
NAME_INCOME_TYPE          0
NAME_EDUCATION_TYPE       0
NAME_FAMILY_STATUS        0
NAME_HOUSING_TYPE         0
DAYS_BIRTH                0
DAYS_EMPLOYED             0
FLAG_MOBIL                0
FLAG_WORK_PHONE           0
FLAG_PHONE                0
FLAG_EMAIL                0
OCCUPATION_TYPE        7976
CNT_FAM_MEMBERS           0
STATUS                    0
dtype: int64

In [129]:
#2. 결측치 처리
train.describe(include="O")
train['OCCUPATION_TYPE'] = train['OCCUPATION_TYPE'].fillna(train['OCCUPATION_TYPE'].mode()[0])

In [130]:
x_train = train.iloc[:,train.columns != 'STATUS']
y_train = train.iloc[:,-1]
x_test = test

In [131]:
## 2.5 인코딩
x_full = pd.concat([x_train, x_test])
x_full.shape

x_full = pd.get_dummies(x_full,dtype=int)
x_full.dropna(subset=['ID'],inplace=True)

x_train = x_full.iloc[:25519, :]
x_test = x_full.iloc[25519:, :]

In [132]:
#3. 스케일링
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [133]:
#4. 검정 데이터 나누기
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=0)

In [135]:
#5. 모델
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc_model = rfc.fit(x_train, y_train)
y_pred = rfc_model.predict(x_val)


In [140]:
#6. 평가하기
from sklearn.metrics import f1_score, accuracy_score
f1score = f1_score(y_val, y_pred)
f1score

0.24489795918367346

In [141]:
#7. 저장하기
pred = rfc_model.predict(x_test)
submit = pd.DataFrame({'pred': pred})
submit.to_csv('result5.csv', index=False)

In [142]:
pd.read_csv('result5.csv')

Unnamed: 0,pred
0,0
1,0
2,0
3,0
4,0
...,...
7586,0
7587,0
7588,0
7589,0
