# 사용 패키지

In [12]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# 데이터 불러오기

In [2]:
# Read the Titanic train and test splits from the local ./data directory.
t_train, t_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

- Survived : 0 - No, 1 - Yes
- Pclass : Ticket class(1 - 1st, 2 - 2nd, 3 - 3rd)
- SibSp : # of siblings / spouses aboard the Titanic   (함께 탑승한 형제자매·배우자 수)
- Parch : # of parents / children aboard the Titanic   (함께 탑승한 부모·자녀 수)
- Cabin : Cabin number
- Embarked : Port of Embarkation ( C - Cherbourg, Q - Queenstown, S - Southampton)

In [3]:
# Preview the first rows of the training set; the full frame is 891 rows x 12 columns.
t_train.head() # 891 x 12

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Print each training column's dtype and non-null count to spot missing data.
t_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Missing values per training column -> Age: 177, Cabin: 687, Embarked: 2
t_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Preview the first rows of the test set.
t_test.head() # 418 x 11
# The test set has no Survived column.

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
# Print each test column's dtype and non-null count to spot missing data.
t_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [8]:
# Missing values per test column -> Age: 86, Fare: 1, Cabin: 327
t_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Survived, Pclass만 추출

In [9]:
# Keep only the target (Survived) and the ticket class (Pclass).
# Columns are selected by name rather than by position so the result does not
# silently change if the CSV column order ever differs.
t = t_train[["Survived", "Pclass"]]
t

Unnamed: 0,Survived,Pclass
0,0,3
1,1,1
2,1,3
3,1,1
4,0,3
...,...,...
886,0,2
887,1,1
888,0,3
889,1,1


In [10]:
# Number of missing values in each of the two selected columns.
t.isna().sum()

Survived    0
Pclass      0
dtype: int64

## 생존자 Pie Chart

In [19]:
# Count passengers per Survived value (0 = did not survive, 1 = survived).
# value_counts() already returns a Series, so no pd.Series() wrap is needed.
# It orders by frequency; sort_index() pins the order to 0, 1 so the relabelled
# index is always ["Non-survived", "survived"], whichever outcome is more common.
survive = t["Survived"].value_counts().sort_index()
survive = survive.rename(index = {0 : "Non-survived", 1 : "survived"})
print(survive)
print(type(survive))

Non-survived    549
survived        342
Name: Survived, dtype: int64
<class 'pandas.core.series.Series'>


In [21]:
# Pie chart of overall survival. Labels and values both come from `survive`,
# so every slice is paired with its own label.
labels = survive.index
values = survive.values
# A title is set so the figure stands on its own when skimmed.
layout = go.Layout(title = "전체 생존 비율")
fig = go.Figure(data = [go.Pie(values=values, labels = labels)], layout = layout)
fig.show()

- 생존자 : 342명 (38.4%)
- 희생자 : 549명 (61.6%)

## Class별 생존자 Pie Chart

In [22]:
# Passenger count per ticket class, with the numeric class codes replaced by
# readable labels.
pclass = (
    t["Pclass"]
    .value_counts()
    .rename(index = {1 : "1st_class", 2 : "2nd_class", 3 : "3rd_class"})
)
pclass

3rd_class    491
1st_class    216
2nd_class    184
Name: Pclass, dtype: int64

In [23]:
# Pie chart of passenger counts per ticket class. Labels and values both come
# from `pclass`, so every slice is paired with its own label.
labels = pclass.index
values = pclass.values
# A title is set so the figure stands on its own when skimmed.
layout = go.Layout(title = "Class별 탑승자 수")
fig = go.Figure(data = [go.Pie(values=values, labels = labels)], layout = layout)
fig.show()

### 3rd_class

In [24]:
# Rows of `t` belonging to 3rd-class passengers.
p3 = t.loc[t["Pclass"].eq(3)]
p3

Unnamed: 0,Survived,Pclass
0,0,3
2,1,3
4,0,3
5,0,3
7,0,3
...,...,...
882,0,3
884,0,3
885,0,3
888,0,3


In [25]:
# Survival counts within 3rd class.
# value_counts() orders by frequency; sort_index() fixes the order to
# 0 (did not survive), 1 (survived) so positional use of the values is stable.
p3_s = p3["Survived"].value_counts().sort_index()
p3_s

0    372
1    119
Name: Survived, dtype: int64

In [28]:
# Pie chart of survival within 3rd class.
# Labels are derived from p3_s's own index, so each count is paired with the
# right label whatever order value_counts() returned. Borrowing the index of
# another Series would only be correct when both happen to share the same order.
labels = p3_s.index.map({0 : "Non-survived", 1 : "survived"})
values = p3_s.values
layout = go.Layout(title = "3rd_class")
fig = go.Figure(data = [go.Pie(values=values, labels = labels)], layout = layout)
fig.show()

### 2nd_class

In [30]:
# Rows of `t` belonging to 2nd-class passengers.
p2 = t.loc[t["Pclass"].eq(2)]
p2

Unnamed: 0,Survived,Pclass
9,1,2
15,1,2
17,1,2
20,0,2
21,1,2
...,...,...
866,1,2
874,1,2
880,1,2
883,0,2


In [31]:
# Survival counts within 2nd class.
# value_counts() orders by frequency; sort_index() fixes the order to
# 0 (did not survive), 1 (survived) so positional use of the values is stable.
p2_s = p2["Survived"].value_counts().sort_index()
p2_s

0    97
1    87
Name: Survived, dtype: int64

In [32]:
# Pie chart of survival within 2nd class.
# Labels are derived from p2_s's own index, so each count is paired with the
# right label whatever order value_counts() returned.
labels = p2_s.index.map({0 : "Non-survived", 1 : "survived"})
values = p2_s.values
layout = go.Layout(title = "2nd_class")
fig = go.Figure(data = [go.Pie(values=values, labels = labels)], layout = layout)
fig.show()

### 1st_class

In [33]:
# Rows of `t` belonging to 1st-class passengers, then their survival counts.
p1 = t[t["Pclass"]==1]
# Order by index (0, 1) explicitly. Sorting by ascending count only produced
# that order because non-survivors happen to be the smaller group in 1st class.
p1_s = p1["Survived"].value_counts().sort_index()
p1_s

0     80
1    136
Name: Survived, dtype: int64

In [35]:
# Pie chart of survival within 1st class.
# Labels are derived from p1_s's own index, so each count is paired with the
# right label whatever order the counts are stored in.
labels = p1_s.index.map({0 : "Non-survived", 1 : "survived"})
values = p1_s.values
layout = go.Layout(title = "1st_class")
fig = go.Figure(data = [go.Pie(values=values, labels = labels)], layout = layout)
fig.show()

- 1st_class : 216명 (24.2%)
    - 생존자 : 136명 (63%)
    - 희생자 :  80명 (37%)
- 2nd_class : 184명 (20.7%)
    - 생존자 : 87명 (47.3%)
    - 희생자 : 97명 (52.7%)
- 3rd_class : 491명 (55.1%)
    - 생존자 : 119명 (24.2%)
    - 희생자 : 372명 (75.8%)

## ==> 1st class 생존율이 가장 높음 (1st 63% > 2nd 47.3% > 3rd 24.2%)
(단, 성별·나이 등 다른 변수를 고려하지 않은 단순 비교이므로 확신은 할 수 없음)

## 성별 탑승자수 추출

In [36]:
# Sex of every passenger in the training set.
# Selected by column name instead of position 4 so a change in the CSV column
# order cannot silently pick a different column.
sex = t_train["Sex"]
sex

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [37]:
# Number of passengers per sex.
sex2 = sex.value_counts()
sex2

male      577
female    314
Name: Sex, dtype: int64

In [38]:
# Pie chart of passenger counts by sex; labels and values are both taken from
# `sex2`, so they are aligned by construction.
labels, values = sex2.index, sex2.values
layout = go.Layout(title = "성별 탑승자 수")
sex_pie = go.Pie(labels = labels, values = values)
fig = go.Figure(data = [sex_pie], layout = layout)
fig.show()

- 남성이 여성보다 비율이 높다
    - 남성 : 577명 (64.8%)
    - 여성 : 314명 (35.2%)

## 성별 생존자수

In [39]:
# Survival flag of every passenger in the training set.
# Selected by column name instead of position 1 so a change in the CSV column
# order cannot silently pick a different column.
sur = t_train["Survived"]
sur

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [40]:
# Put the survival flag and the sex side by side as a two-column frame.
tot = pd.concat(objs = [sur, sex], axis = "columns")
tot

Unnamed: 0,Survived,Sex
0,0,male
1,1,female
2,1,female
3,1,female
4,0,male
...,...,...
886,0,male
887,1,female
888,0,female
889,1,male


### 남성

In [41]:
# Rows of `tot` for male passengers only.
tot_m = tot.loc[tot["Sex"].eq("male")]
tot_m

Unnamed: 0,Survived,Sex
0,0,male
4,0,male
5,0,male
6,0,male
7,0,male
...,...,...
883,0,male
884,0,male
886,0,male
889,1,male


In [42]:
# Survival counts among male passengers.
# value_counts() orders by frequency; sort_index() fixes the order to
# 0 (did not survive), 1 (survived) so positional use of the values is stable.
tot_ma = tot_m["Survived"].value_counts().sort_index()
tot_ma

0    468
1    109
Name: Survived, dtype: int64

In [43]:
# Pie chart of survival among male passengers (graph_objects version).
# Labels are derived from tot_ma's own index, so each count is paired with the
# right label whatever order value_counts() returned.
labels = tot_ma.index.map({0 : "Non-survived", 1 : "survived"})
values = tot_ma.values
layout = go.Layout(title = "남성 생존 비율")
fig = go.Figure(data = [go.Pie(values=values, labels = labels)], layout = layout)
fig.show()

In [30]:
# Same male-survival pie chart, drawn with plotly express.
# Slice names are mapped from tot_ma's own index so they cannot be mismatched
# with the counts.
px.pie(
    tot_ma,
    values = tot_ma.values,
    names = tot_ma.index.map({0 : "Non-survived", 1 : "survived"}),
    title = "남성 생존 비율",
)

### 여성

In [31]:
# Rows of `tot` for female passengers only.
tot_f = tot.loc[tot["Sex"].eq("female")]
tot_f

Unnamed: 0,Survived,Sex
1,1,female
2,1,female
3,1,female
8,1,female
9,1,female
...,...,...
880,1,female
882,0,female
885,0,female
887,1,female


In [32]:
# Survival counts among female passengers.
# Order by index (0, 1) explicitly. Sorting by ascending count only produced
# that order because non-survivors happen to be the smaller group among women.
tot_fe = tot_f["Survived"].value_counts().sort_index()
tot_fe

0     81
1    233
Name: Survived, dtype: int64

In [33]:
# Pie chart of survival among female passengers (plotly express).
# Slice names are mapped from tot_fe's own index so they cannot be mismatched
# with the counts.
px.pie(
    tot_fe,
    values = tot_fe.values,
    names = tot_fe.index.map({0 : "Non-survived", 1 : "survived"}),
    title = "여성 생존 비율",
)

- 남성 : 577명 (64.8%)
    - 생존자 : 109명 (18.9%)
    - 희생자 : 468명 (81.1%)
- 여성 : 314명 (35.2%)
    - 생존자 : 233명 (74.2%)
    - 희생자 :  81명 (25.8%)

## => 여성이 남성보다 생존율이 높다 (여성 74.2% vs 남성 18.9%)