In [3]:
# Load the raw Titanic passenger table from the local dataset folder,
# then show summary statistics for its numeric columns.
titanic_df = pd.read_csv("dataset/titanic.csv")
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


#### 1. 불필요한 feature를 제거

In [4]:
# List the column labels of the raw frame before choosing which ones to keep.
titanic_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Keep only the integer and float columns: int64 columns first, then float64.
column1 = titanic_df.dtypes[titanic_df.dtypes == "int64"].index
column2 = titanic_df.dtypes[titanic_df.dtypes == "float64"].index
# Select both groups in one step; the label order is column1 followed by column2.
result = titanic_df[column1.append(column2)]
result.tail(2)

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch,Age,Fare
889,890,1,1,0,0,26.0,30.0
890,891,0,3,0,0,32.0,7.75


In [6]:
# Keep the target ("Survived") together with the columns retained for analysis.
filtered_df_1 = titanic_df.loc[
    :,
    ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"],
]
filtered_df_1.tail(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
889,1,1,male,26.0,0,0,30.0,C
890,0,3,male,32.0,0,0,7.75,Q


#### 2. NaN 데이터 제거

In [7]:
# Rows that have an Age value; the count shows how many rows survive this filter.
filtered_df_2 = filtered_df_1.dropna(subset=["Age"])
len(filtered_df_2)

714

In [8]:
# Number of rows that have an Embarked value.
len(filtered_df_1.dropna(subset=["Embarked"]))

889

In [9]:
# Element-wise mask: True where a value is present, False where it is missing.
filtered_df_1.notnull()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...
886,True,True,True,True,True,True,True,True
887,True,True,True,True,True,True,True,True
888,True,True,True,False,True,True,True,True
889,True,True,True,True,True,True,True,True


In [10]:
# Keep only the rows in which every column has a value, then report the row count.
filtered_df = filtered_df_1.dropna()
filtered_df.shape[0]

712

In [11]:
# Renumber the rows 0..n-1 after the rows with missing values were removed.
# The name is rebound instead of using inplace=True so that a frame produced
# by filtering is never mutated in place.
filtered_df = filtered_df.reset_index(drop=True)

#### 3. One Hot Encoding

In [12]:
# Deep copy, so that filtered_df itself is not modified by the steps that follow.
one_hot_df_1 = filtered_df.copy()

In [13]:
# Manual one-hot encoding of Sex into Male / Female indicator columns (0 or 1).
# The frame is rebuilt from filtered_df on every run, so this cell can be
# re-executed without raising KeyError on an already-dropped "Sex" column.
one_hot_df_1 = filtered_df.drop(["Sex"], axis=1)
# Vectorized comparisons replace the per-row lambda; missing values compare as
# False and therefore encode as 0 in both columns.
one_hot_df_1["Male"] = (filtered_df["Sex"] == "male").astype("int64")
one_hot_df_1["Female"] = (filtered_df["Sex"] == "female").astype("int64")

In [14]:
# Display the frame with the manually built Male / Female indicator columns.
one_hot_df_1

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
0,0,3,22.0,1,0,7.2500,S,1,0
1,1,1,38.0,1,0,71.2833,C,0,1
2,1,3,26.0,0,0,7.9250,S,0,1
3,1,1,35.0,1,0,53.1000,S,0,1
4,0,3,35.0,0,0,8.0500,S,1,0
...,...,...,...,...,...,...,...,...,...
707,0,3,39.0,0,5,29.1250,Q,0,1
708,0,2,27.0,0,0,13.0000,S,1,0
709,1,1,19.0,0,0,30.0000,S,0,1
710,1,1,26.0,0,0,30.0000,C,1,0


In [15]:
# Fresh copy of the cleaned frame, used as the base for the get_dummies encoding.
one_hot_df_2 = filtered_df.copy()

In [16]:
# Indicator columns for Sex: one column per distinct value (female, male).
one_hot_df_3 = pd.get_dummies(one_hot_df_2["Sex"])
one_hot_df_3.tail(3)

Unnamed: 0,female,male
709,1,0
710,0,1
711,0,1


In [17]:
# Indicator columns for Embarked: one column per distinct value (C, Q, S).
one_hot_df_4 = pd.get_dummies(one_hot_df_2["Embarked"])
one_hot_df_4.tail(3)

Unnamed: 0,C,Q,S
709,0,0,1
710,1,0,0
711,0,1,0


In [18]:
# Replace the categorical Sex / Embarked columns with their indicator columns:
# remove the originals first, then append both indicator frames column-wise.
one_hot_df = pd.concat(
    [
        one_hot_df_2.drop(["Sex", "Embarked"], axis=1),
        one_hot_df_3,
        one_hot_df_4,
    ],
    axis=1,
)
one_hot_df.tail(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
710,1,1,26.0,0,0,30.0,0,1,1,0,0
711,0,3,32.0,0,0,7.75,0,1,0,1,0


#### 4. 연령대와 Adult 컬럼을 만들기

In [19]:
# Copy the encoded frame so the derived columns added below do not touch one_hot_df.
result_df = one_hot_df.copy()

In [20]:
# Show the last two rows of the copy before the derived columns are added.
result_df.tail(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
710,1,1,26.0,0,0,30.0,0,1,1,0,0
711,0,3,32.0,0,0,7.75,0,1,0,1,0


In [21]:
# Age bucket by decade: floor the age to a multiple of 10 (19 -> 10, 26 -> 20).
result_df["Ages"] = result_df["Age"].floordiv(10).mul(10).astype("int")
result_df.tail(3)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S,Ages
709,1,1,19.0,0,0,30.0,1,0,0,0,1,10
710,1,1,26.0,0,0,30.0,0,1,1,0,0,20
711,0,3,32.0,0,0,7.75,0,1,0,1,0,30


In [22]:
# Initialise the Adult flag to 0 for every row.
result_df["Adult"] = 0

In [23]:
# Adult flag: 1 when the age bucket is 20 or above, otherwise 0.
# The whole column is assigned at once, so the result does not depend on the
# column having been zero-initialised beforehand (a masked .loc write on a
# missing column would leave the remaining rows as NaN).
result_df["Adult"] = (result_df["Ages"] >= 20).astype("int64")
result_df.head(3)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S,Ages,Adult
0,0,3,22.0,1,0,7.25,0,1,0,0,1,20,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0,30,1
2,1,3,26.0,0,0,7.925,1,0,0,0,1,20,1


#### 5. 생존 모델 만들기
- 원래 데이터
- 탐색 후 전처리가 끝난 데이터
- 선형회귀분석, 디시전트리

In [24]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [25]:
# Candidate input 1: the cleaned frame, still holding the categorical columns.
filtered_df.tail(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
710,1,1,male,26.0,0,0,30.0,C
711,0,3,male,32.0,0,0,7.75,Q


In [26]:
# Candidate input 2: the cleaned frame with Sex / Embarked one-hot encoded.
one_hot_df.tail(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
710,1,1,26.0,0,0,30.0,0,1,1,0,0
711,0,3,32.0,0,0,7.75,0,1,0,1,0


In [27]:
# Alternative input (disabled): model on filtered_df using only its numeric
# feature columns, i.e. without the Sex / Embarked information.
# df_x = filtered_df[["Pclass", "Age", "SibSp", "Parch", "Fare"]]
# df_y = filtered_df[["Survived"]]

In [28]:
# Every column label after the first one; the first column is "Survived".
one_hot_df.columns[1:]

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male', 'C', 'Q',
       'S'],
      dtype='object')

In [29]:
# Model input built from one_hot_df: every column except the target is a feature.
# "Survived" is dropped by name rather than by position (columns[1:]), so the
# target can never end up among the features if the column order changes.
df_x = one_hot_df.drop(["Survived"], axis=1)
# Double brackets keep the target as a one-column DataFrame.
df_y = one_hot_df[["Survived"]]

In [30]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
train_x, test_x, train_y, test_y = train_test_split(
    df_x,
    df_y,
    test_size=0.1,
    random_state=1,
)

In [31]:
# Fit an ordinary linear regression on the training split. Its output is a
# continuous value, not a class label.
model = linear_model.LinearRegression()
# Alternative model (disabled): a decision tree classifier limited to depth 2.
# model = DecisionTreeClassifier(max_depth=2)
model.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [32]:
# Predict on the held-out rows and convert the regression output to 0/1 labels.
pred_y = model.predict(test_x)
# A linear regression is unbounded, so rounding alone can produce labels outside
# {0, 1} (for example -1 or 2). Round to the nearest integer, then clamp into the
# valid label range before casting.
pred_y = np.clip(np.around(pred_y.ravel()), 0, 1).astype("int")
pred_y

array([1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 1])

In [121]:
# Accuracy on the held-out split, as a percentage rounded to two decimals.
# NOTE(review): this cell's execution count (121) is out of sequence with the
# preceding cell (32) - re-run the notebook top to bottom to confirm the figure.
round(accuracy_score(test_y, pred_y)*100,2)

69.439999999999998