# 探索的データ分析(EDA)

In [22]:
# ライブラリのインポート
import pandas as pd
import numpy as np

## データ取得

In [2]:
# Load the training data: read the CSV stored in S3 into the DataFrame `train_df`.
# NOTE(review): reading an s3:// URL through pandas presumably relies on s3fs and
# valid AWS credentials being available in this environment — confirm.
train_df = pd.read_csv("s3://sagemaker-project-p-mbjujshffkps/inputdata/inputdata.csv")

## データの概要把握

In [3]:
# Check the size of the training data.
# DataFrame.shape is a (number of rows, number of columns) tuple.
print(train_df.shape)

(8693, 14)


In [4]:
# Preview the contents of the training data:
# display the first five rows (five is head()'s default, spelled out here).
train_df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
# Data type pandas inferred for each column of the training data.
# NOTE(review): CryoSleep and VIP come out as object while Transported is bool —
# presumably because the first two contain missing values; confirm before casting.
train_df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [6]:
# Summary statistics of the training data (count, mean, std, min, quartiles, max).
# Called with default arguments, so only the numeric columns are summarised.
train_df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [7]:
# Number of missing values in each column of the training data:
# isna() flags every missing cell, and summing the flags column-wise counts them.
train_df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [8]:
# Number of rows per distinct PassengerId value.
# NOTE(review): in the recorded output every ID has a count of 1, so this
# effectively checks that PassengerId is unique; it does not give per-group
# sizes — TODO confirm whether a group-level count was intended here.
train_df["PassengerId"].value_counts()

PassengerId
9280_02    1
0001_01    1
0002_01    1
0003_01    1
0003_02    1
          ..
0020_04    1
0020_05    1
0020_06    1
0022_01    1
0024_01    1
Name: count, Length: 8693, dtype: int64

In [9]:
# Number of passengers per HomePlanet value.
# value_counts() drops NaN by default, so rows with a missing HomePlanet are not counted.
train_df["HomePlanet"].value_counts()

HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: count, dtype: int64

In [10]:
# Number of passengers per CryoSleep value (True / False).
# value_counts() drops NaN by default, so rows with a missing CryoSleep are not counted.
train_df["CryoSleep"].value_counts()

CryoSleep
False    5439
True     3037
Name: count, dtype: int64

In [11]:
# Number of passengers per distinct Cabin value.
# value_counts() drops NaN by default, so rows with a missing Cabin are not counted.
train_df["Cabin"].value_counts()

Cabin
G/734/S     8
B/11/S      7
F/1411/P    7
B/82/S      7
G/981/S     7
           ..
G/543/S     1
B/106/P     1
G/542/S     1
F/700/P     1
G/559/P     1
Name: count, Length: 6560, dtype: int64

In [12]:
# Number of passengers per Destination value.
# value_counts() drops NaN by default, so rows with a missing Destination are not counted.
train_df["Destination"].value_counts()

Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64

In [13]:
# Number of passengers per VIP value (True / False).
# value_counts() drops NaN by default, so rows with a missing VIP are not counted.
train_df["VIP"].value_counts()

VIP
False    8291
True      199
Name: count, dtype: int64

In [14]:
# Number of passengers per Transported value (True / False),
# i.e. how evenly the two outcomes are represented in the training data.
train_df["Transported"].value_counts()

Transported
True     4378
False    4315
Name: count, dtype: int64

In [15]:
# Relationship between HomePlanet and Transported:
# for each HomePlanet, count the passengers with Transported True / False.
result = (
    train_df
    .groupby("HomePlanet")["Transported"]
    .value_counts()
)
print(result)

HomePlanet  Transported
Earth       False          2651
            True           1951
Europa      True           1404
            False           727
Mars        True            920
            False           839
Name: count, dtype: int64


In [16]:
# Relationship between Destination and Transported:
# for each Destination, count the passengers with Transported True / False.
result = (
    train_df
    .groupby("Destination")["Transported"]
    .value_counts()
)
print(result)

Destination    Transported
55 Cancri e    True           1098
               False           702
PSO J318.5-22  True            401
               False           395
TRAPPIST-1e    False          3128
               True           2787
Name: count, dtype: int64


# 前処理の方針(探索的データ分析)

- PassengerId(object)      : GroupNumber(PassengerIdの「_」より前の部分)毎に、グループの人数に置き換え
- HomePlanet(object)       : OneHot
- CryoSleep(object)        : binary(0/1)で置き換える
- Cabin(object)            : 3つのカラムに分割(deck:OneHot/num:そのまま使用/side:binary化)
- Destination(object)      : OneHot
- Age(float64)             : 3つにグループ化(子供[0-20)/大人[20-60)/老人[60-]) ※境界の20歳・60歳は上側のグループに含める想定(要確認)
- VIP(object)              : binary(0/1)で置き換える
- RoomService(float64)     : 他の4サービス(FoodCourt,ShoppingMall,Spa,VRDeck)と合算し、一つのカラム(ServiceTotal)にまとめる
- FoodCourt(float64)       : 他の4サービス(RoomService,ShoppingMall,Spa,VRDeck)と合算し、一つのカラム(ServiceTotal)にまとめる
- ShoppingMall(float64)    : 他の4サービス(RoomService,FoodCourt,Spa,VRDeck)と合算し、一つのカラム(ServiceTotal)にまとめる
- Spa(float64)             : 他の4サービス(RoomService,FoodCourt,ShoppingMall,VRDeck)と合算し、一つのカラム(ServiceTotal)にまとめる
- VRDeck(float64)          : 他の4サービス(RoomService,FoodCourt,ShoppingMall,Spa)と合算し、一つのカラム(ServiceTotal)にまとめる
- Name(object)             : 除外
- Transported(bool)        : binary(0/1)で置き換える
- ServiceTotal(float64)    : [RoomService,FoodCourt,ShoppingMall,Spa,VRDeck]の合計

