In [152]:
import pandas as pd

titanic_df = pd.read_csv('titanic3.csv')
titanic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [153]:
titanic_df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [133]:
#survived : 탑승객 생존 유무 (0: 사망, 1: 생존)
#pclass : 객실의 등급
#name : 이름
#sex : 성별
#age : 나이
#sibsp : 함께 탑승한 형제자매, 아내, 남편의 수
#parch : 함께 탑승한 부모, 자식의 수
#ticket : 티켓 번호
#fare : 티켓의 요금
#cabin : 객실번호
#embarked : 배에 탑승한 항구 이름
#boat : 탑승한 구명보트 번호
#body : 사망자 확인 번호
#home.dest : 고향/목적지

In [154]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [155]:
# null값 확인
titanic_df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [156]:
# 전처리 : 결측값 처리, 불필요 컬럼 제거, 파생변수 생성, encoding

In [157]:
# Derive a `title` column holding the honorific found in each passenger name.
# The pattern captures the run of ASCII letters immediately before a period.
# A raw string is used so that `\.` reaches the regex engine as an escaped dot
# instead of being parsed as an (invalid) Python string escape.
titanic_df['title'] = titanic_df['name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [158]:
titanic_df['title'].unique()

array(['Miss', 'Master', 'Mr', 'Mrs', 'Col', 'Mme', 'Dr', 'Major', 'Capt',
       'Lady', 'Sir', 'Mlle', 'Dona', 'Jonkheer', 'Countess', 'Don',
       'Rev', 'Ms'], dtype=object)

In [159]:
# Collapse the extracted honorifics into four groups: Others, Mrs, Miss, Mr.
_title_groups = {
    'Others': ['Capt', 'Col', 'Major', 'Dr', 'Rev', 'Jonkheer', 'Master',
               'Countess', 'Don', 'Sir', 'the Countess', 'Lady', 'Dona'],
    'Mrs': ['Mme', 'Ms', 'Mrs'],
    'Miss': ['Mlle', 'Miss'],
    'Mr': ['Mr'],
}
# Flatten to a raw-title -> group lookup so a single replace() call does the work.
_title_lookup = {raw: group for group, raws in _title_groups.items() for raw in raws}
titanic_df['title'] = titanic_df['title'].replace(_title_lookup)

In [160]:
titanic_df['title'].unique()

array(['Miss', 'Others', 'Mr', 'Mrs'], dtype=object)

In [161]:
titanic_df['title'].value_counts()

Mr        757
Miss      262
Mrs       200
Others     90
Name: title, dtype: int64

In [162]:
# The raw name is no longer needed once `title` has been derived from it.
# `columns=` already identifies the axis, so the extra `axis=1` is dropped,
# and the result is reassigned rather than mutating the frame in place.
titanic_df = titanic_df.drop(columns=['name'])

In [163]:
titanic_df['age'].describe()

count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: age, dtype: float64

In [164]:
# Fill missing ages with the median age.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection may operate on a temporary
# object and leave the frame itself unchanged.
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].median())

In [165]:
titanic_df['age'].isnull().sum()

0

In [None]:
# age category
# 0~18세: 0
# 19~25세: 1
# 26~35세: 2
# 36~60세: 3
# 61~100세: 4

In [166]:
# Bucket an age into an ordinal category.
def age_category(x):
    """Return the ordinal age bucket for ``x``.

    Buckets are defined by exclusive upper bounds:
    ``x < 19`` -> 0, ``x < 26`` -> 1, ``x < 36`` -> 2, ``x < 61`` -> 3,
    anything else -> 4. A value that fails every ``<`` comparison (for
    example NaN) therefore also lands in bucket 4.
    """
    upper_bounds = (19, 26, 36, 61)
    for bucket, bound in enumerate(upper_bounds):
        if x < bound:
            return bucket
    # Not below any bound: last bucket.
    return len(upper_bounds)

titanic_df['age_cat']=titanic_df['age'].apply(age_category)

In [167]:
titanic_df['age_cat'].value_counts()

2    544
3    290
1    247
0    196
4     32
Name: age_cat, dtype: int64

In [168]:
# The continuous age is replaced by its bucketed form `age_cat`.
# `columns=` already identifies the axis, so the extra `axis=1` is dropped,
# and the result is reassigned rather than mutating the frame in place.
titanic_df = titanic_df.drop(columns=['age'])

In [169]:
# Encode sex as an integer flag: male -> 0, female -> 1.
# The mapped Series is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection, which may act on a
# temporary object and leave the frame unchanged.
# Note: any label other than 'male'/'female' becomes NaN under this mapping.
titanic_df['sex'] = titanic_df['sex'].map({'male': 0, 'female': 1})

In [None]:
# One-hot encode sex (expects the 0 = male / 1 = female coding applied above).
# Each indicator is the boolean comparison cast to int. The previous version
# assigned a row-filtered DataFrame to a single column, which is not a valid
# column value and does not produce a 0/1 indicator.
titanic_df['sex_male'] = (titanic_df['sex'] == 0).astype(int)
titanic_df['sex_female'] = (titanic_df['sex'] == 1).astype(int)

In [None]:
# Remove the original `sex` column now that indicator columns exist.
# `columns=` already identifies the axis, so the extra `axis=1` is dropped,
# and the result is reassigned rather than mutating the frame in place.
titanic_df = titanic_df.drop(columns=['sex'])

In [170]:
# embarked 값 체크
titanic_df.embarked.unique()

array(['S', 'C', nan, 'Q'], dtype=object)

In [171]:
titanic_df['embarked'].value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [172]:
# Fill missing embarkation ports with the most frequent port.
# The mode is computed from the column rather than hard-coded; for this
# dataset it is 'S' according to the value counts shown above.
titanic_df['embarked'] = titanic_df['embarked'].fillna(titanic_df['embarked'].mode()[0])

In [None]:
# One-hot encode the embarkation port into embarked_C / embarked_Q / embarked_S.
# Each indicator is the boolean comparison cast to int. The previous version
# assigned a row-filtered DataFrame to a single column, which is not a valid
# column value and does not produce a 0/1 indicator.
for port in ('C', 'Q', 'S'):
    titanic_df[f'embarked_{port}'] = (titanic_df['embarked'] == port).astype(int)

In [None]:
# Remove the original `embarked` column now that indicator columns exist.
# `columns=` already identifies the axis, so the extra `axis=1` is dropped,
# and the result is reassigned rather than mutating the frame in place.
titanic_df = titanic_df.drop(columns=['embarked'])

In [173]:
# Handle the missing fare value.
# A missing fare is treated as an unpaid boarding and set to 0.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection may operate on a temporary
# object and leave the frame itself unchanged.
titanic_df['fare'] = titanic_df['fare'].fillna(0)

In [122]:
# fare category

In [123]:
# 나머지 컬럼 값 체크
titanic_df['cabin'].unique()
#titanic_df['ticket'].unique()
#titanic_df['boat'].unique()
#titanic_df['body'].unique()
#titanic_df['home.dest'].unique()

array(['B5', 'C22 C26', 'E12', 'D7', 'A36', 'C101', nan, 'C62 C64', 'B35',
       'A23', 'B58 B60', 'D15', 'C6', 'D35', 'C148', 'C97', 'B49', 'C99',
       'C52', 'T', 'A31', 'C7', 'C103', 'D22', 'E33', 'A21', 'B10', 'B4',
       'E40', 'B38', 'E24', 'B51 B53 B55', 'B96 B98', 'C46', 'E31', 'E8',
       'B61', 'B77', 'A9', 'C89', 'A14', 'E58', 'E49', 'E52', 'E45',
       'B22', 'B26', 'C85', 'E17', 'B71', 'B20', 'A34', 'C86', 'A16',
       'A20', 'A18', 'C54', 'C45', 'D20', 'A29', 'C95', 'E25', 'C111',
       'C23 C25 C27', 'E36', 'D34', 'D40', 'B39', 'B41', 'B102', 'C123',
       'E63', 'C130', 'B86', 'C92', 'A5', 'C51', 'B42', 'C91', 'C125',
       'D10 D12', 'B82 B84', 'E50', 'D33', 'C83', 'B94', 'D49', 'D45',
       'B69', 'B11', 'E46', 'C39', 'B18', 'D11', 'C93', 'B28', 'C49',
       'B52 B54 B56', 'E60', 'C132', 'B37', 'D21', 'D19', 'C124', 'D17',
       'B101', 'D28', 'D6', 'D9', 'B80', 'C106', 'B79', 'C47', 'D30',
       'C90', 'E38', 'C78', 'C30', 'C118', 'D36', 'D48', 'D47', '

In [174]:
# Drop columns that are not used as features.
# One call replaces five separate in-place drops; `columns=` already
# identifies the axis, so the redundant `axis=1` is omitted.
titanic_df = titanic_df.drop(columns=['cabin', 'ticket', 'boat', 'body', 'home.dest'])

In [175]:
# Combine the two relative counts (sibsp and parch) into one `family` column.
titanic_df['family'] = titanic_df['sibsp'].add(titanic_df['parch'])

In [176]:
# `sibsp` and `parch` are summarised by `family`, so both are removed in one
# call; `columns=` already identifies the axis, so `axis=1` is omitted.
titanic_df = titanic_df.drop(columns=['sibsp', 'parch'])

In [177]:
titanic_df.head()

Unnamed: 0,pclass,survived,sex,fare,embarked,title,age_cat,family
0,1,1,1,211.3375,S,Miss,2,0
1,1,1,0,151.55,S,Others,0,3
2,1,0,1,151.55,S,Miss,0,3
3,1,0,0,151.55,S,Mr,2,3
4,1,0,1,151.55,S,Mrs,1,3


In [35]:
# Rebind `titanic_df` to a feature table loaded from a pickle file on disk.
# NOTE(review): no visible cell in this notebook writes 'tdf.pkl', so a fresh
# kernel cannot regenerate it from the cells above; confirm where it comes from.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
titanic_df = pd.read_pickle('tdf.pkl')
titanic_df.head()

Unnamed: 0,survived,fare_cat,age_cat,family,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,0,0,1,0,0,0,1
1,1,1,4,3,0,1,0,0,1
2,0,1,4,3,1,0,0,0,1
3,0,1,0,3,0,1,0,0,1
4,0,1,0,3,1,0,0,0,1


In [37]:
titanic_df.fare_cat.unique()

array([1, 3, 2, 4])

In [38]:
titanic_df.age_cat.unique()

array([0, 4, 2, 1, 3], dtype=int32)

In [7]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the `survived` target.
x_tdf = titanic_df.drop(columns='survived')
y_tdf = titanic_df['survived']

# Hold out 20% of the rows for evaluation (80/20 split, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x_tdf,
    y_tdf,
    test_size=0.2,
    random_state=11,
)
print(x_train.shape, x_test.shape)


(1047, 8) (262, 8)


In [10]:
# Train two baseline classifiers and compare their hold-out accuracy.
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported score is reproducible across kernel
# restarts; 11 matches the seed used for the train/test split.
rf_model = RandomForestClassifier(random_state=11)
rf_model.fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)
# Built-in round() works whether accuracy_score returns a Python float or a
# NumPy scalar; the `.round()` method exists only on the latter.
accuracy_rf = round(accuracy_score(y_test, rf_pred), 2)

lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)
lr_pred = lr_model.predict(x_test)
accuracy_lr = round(accuracy_score(y_test, lr_pred), 2)

print(f'rf 정확도: {accuracy_rf}, lr 정확도:{accuracy_lr}')

rf 정확도: 0.77, lr 정확도:0.81


In [11]:
import pickle
import joblib

filename = 'tcl_model.pkl'
joblib.dump(lr_model, filename)

['tcl_model.pkl']

In [12]:
# Load the model back from 'tcl_model.pkl' and display it.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files
# from a trusted source.
mdl = joblib.load('tcl_model.pkl')
mdl

LogisticRegression()

In [13]:
titanic_df.columns

Index(['survived', 'fare_cat', 'age_cat', 'family', 'sex_female', 'sex_male',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [14]:
# Score one hand-built passenger with the reloaded model.
# Values are listed in the same order as the feature columns below.
data = [1,0,0,1,0,0,0,1]
# Building the frame from a list of rows keeps the columns numeric; creating
# an empty frame and assigning the row through .loc leaves them as object dtype.
df = pd.DataFrame([data], columns=['fare_cat', 'age_cat', 'family', 'sex_female', 'sex_male',
       'embarked_C', 'embarked_Q', 'embarked_S'])
y_pred = mdl.predict(df)
y_pred

array([1], dtype=int64)