### 2-1. 타이타닉 생존여부 예측모델 만들기

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index":"id"})
        id_name = 'id'
    
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])
    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test

In [3]:
df = pd.read_csv('./titanic/train.csv')
X_train, X_test, y_train, y_test = data_load(df, target='Survived', id_name='PassengerId')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 11), (179, 11), (712, 2), (179, 2))

In [5]:
# 모듈 로딩
import pandas as pd

In [6]:
X_train.shape, y_train.shape, X_test.shape

((712, 11), (712, 2), (179, 11))

In [8]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
90,91,3,"Christmann, Mr. Emil",male,29.0,0,0,343276,8.05,,S
103,104,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
577,578,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39.0,1,0,13507,55.9,E44,S
215,216,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.275,D36,C
191,192,2,"Carbines, Mr. William",male,19.0,0,0,28424,13.0,,S


In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 90 to 116
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Pclass       712 non-null    int64  
 2   Name         712 non-null    object 
 3   Sex          712 non-null    object 
 4   Age          575 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Ticket       712 non-null    object 
 8   Fare         712 non-null    float64
 9   Cabin        170 non-null    object 
 10  Embarked     711 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


In [10]:
y_train.head()

Unnamed: 0,PassengerId,Survived
90,91,0
103,104,0
577,578,1
215,216,1
191,192,0


In [11]:
y_train['Survived'].value_counts()

0    441
1    271
Name: Survived, dtype: int64

In [13]:
y = y_train['Survived']

# get_dummies 함수는 범주형 변수만 처리함
features = ['Pclass', 'Sex', 'SibSp', 'Parch']
X = pd.get_dummies(X_train[features])
test = pd.get_dummies(X_test[features])

In [14]:
X.shape, test.shape

((712, 5), (179, 5))

#### 모델링 및 평가

In [15]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200, max_depth=7, random_state=2021)
model.fit(X, y)
predictions = model.predict(test)

In [16]:
model.score(X, y)

0.8356741573033708

In [17]:
output = pd.DataFrame({'PassengerId' : X_test.PassengerId, 'Survived' : predictions})

In [18]:
output.head()

Unnamed: 0,PassengerId,Survived
210,211,0
876,877,0
666,667,0
819,820,0
736,737,0


#### 결과 채점

In [19]:
model.score(test, y_test['Survived'])

0.7318435754189944

### 2-2. 당뇨병 여부 판단

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index":"id"})
        id_name = 'id'
    
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])
    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test

In [26]:
df = pd.read_csv('./archive/diabetes.csv')
X_train, X_test, y_train, y_test = data_load(df, target='Outcome')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 9), (154, 9), (614, 2), (154, 2))

In [27]:
# 모듈 로딩
import pandas as pd

In [28]:
# 데이터 로딩
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 9), (154, 9), (614, 2), (154, 2))

In [29]:
X_train.head()

Unnamed: 0,id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
147,147,2,106,64,35,119,30.5,1.4,34
344,344,8,95,72,0,0,36.8,0.485,57
390,390,1,100,66,29,196,32.0,0.444,42
150,150,1,136,74,50,204,37.4,0.399,24
132,132,3,170,64,37,225,34.5,0.356,30


In [30]:
y_train.head()

Unnamed: 0,id,Outcome
147,147,0
344,344,0
390,390,0
150,150,0
132,132,1


In [31]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 147 to 116
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        614 non-null    int64  
 1   Pregnancies               614 non-null    int64  
 2   Glucose                   614 non-null    int64  
 3   BloodPressure             614 non-null    int64  
 4   SkinThickness             614 non-null    int64  
 5   Insulin                   614 non-null    int64  
 6   BMI                       614 non-null    float64
 7   DiabetesPedigreeFunction  614 non-null    float64
 8   Age                       614 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 48.0 KB


In [32]:
X_train.isnull().sum()

id                          0
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [33]:
X_test.isnull().sum()

id                          0
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [34]:
X_train.describe()

Unnamed: 0,id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,386.978827,3.845277,120.705212,68.530945,20.982085,82.229642,31.948534,0.477471,33.319218
std,219.881344,3.388952,32.047977,19.761915,15.965031,116.525772,7.734385,0.339811,11.940953
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,196.5,1.0,99.0,62.0,0.0,0.0,27.4,0.24425,24.0
50%,388.0,3.0,117.0,72.0,23.0,44.0,32.25,0.377,29.0
75%,575.75,6.0,140.0,80.0,33.0,130.0,36.6,0.6285,41.0
max,766.0,17.0,199.0,114.0,99.0,846.0,59.4,2.42,81.0


#### 데이터 전처리

In [35]:
# 이상치 확인
# Train Data
print('Glucose :', len(X_train[X_train['Glucose']==0]))
print('BloodPressure :', len(X_train[X_train['BloodPressure']==0]))
print('SkinThickness :', len(X_train[X_train['SkinThickness']==0]))
print('Insulin :', len(X_train[X_train['Insulin']==0]))
print('BMI :', len(X_train[X_train['BMI']==0]))

Glucose : 5
BloodPressure : 31
SkinThickness : 175
Insulin : 287
BMI : 9


In [36]:
# Test Data
print('Glucose :', len(X_test[X_test['Glucose']==0]))
print('BloodPressure :', len(X_test[X_test['BloodPressure']==0]))
print('SkinThickness :', len(X_test[X_test['SkinThickness']==0]))
print('Insulin :', len(X_test[X_test['Insulin']==0]))
print('BMI :', len(X_test[X_test['BMI']==0]))

Glucose : 0
BloodPressure : 4
SkinThickness : 52
Insulin : 87
BMI : 2


##### 포도당은 train 데이터에만 0 값이 있으므로 삭제함
##### 나머지 이상치는 평균값으로 대체함

In [37]:
# 포도당 이상치
del_idx = X_train[(X_train['Glucose']==0)].index
del_idx

Int64Index([182, 75, 342, 502, 349], dtype='int64')

In [38]:
print(X_train.shape, y_train.shape)
X_train = X_train.drop(index=del_idx, axis=0)
y_train = y_train.drop(index=del_idx, axis=0)
print(X_train.shape, y_train.shape)

(614, 9) (614, 2)
(609, 9) (609, 2)


In [41]:
# 이상치를 평균값으로 대체
cols = ['BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
cols_mean = X_train[cols].mean()
X_train[cols].replace(0, cols_mean)

Unnamed: 0,BloodPressure,SkinThickness,Insulin,BMI
147,64.0,35.00000,119.000000,30.5
344,72.0,20.91133,82.866995,36.8
390,66.0,29.00000,196.000000,32.0
150,74.0,50.00000,204.000000,37.4
132,64.0,37.00000,225.000000,34.5
...,...,...,...,...
621,76.0,20.00000,82.866995,24.2
128,88.0,24.00000,145.000000,34.5
57,88.0,60.00000,110.000000,46.8
341,74.0,21.00000,73.000000,25.9


In [42]:
X_train.columns

Index(['id', 'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [47]:
X = X_train.iloc[:,1:]
test = X_test.iloc[:,1:]

In [48]:
# 스케일링
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
test = scaler.fit_transform(test)

In [49]:
from sklearn.svm import SVC
model = SVC(random_state=42)
model.fit(X, y_train['Outcome'])
predictions = model.predict(test)

In [50]:
round(model.score(X, y_train['Outcome'])*100, 2)

83.42

In [52]:
output = pd.DataFrame({'idx':X_test.index, 'Outcome':predictions})
output.head()

Unnamed: 0,idx,Outcome
0,258,1
1,220,1
2,438,0
3,130,1
4,730,0


In [51]:
round(model.score(test, y_test['Outcome'])*100, 2)

76.62