In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt


In [None]:
# Load the train and test splits; both paths are relative to the working directory.
train_csv, test_csv = "Data/train.csv", "Data/test.csv"
training_data = pd.read_csv(train_csv)
testing_data = pd.read_csv(test_csv)

In [None]:
# Summary statistics of the training data as loaded, before any derived columns are added.
training_data.describe()

In [None]:
# Break PassengerId apart on "_" into a group number and a position within that group.
id_parts = training_data["PassengerId"].str.split("_", expand=True)
training_data[["GroupNum", "GroupPosition"]] = id_parts

# Break Cabin apart on "/" into deck, room number, and side of the ship.
cabin_parts = training_data["Cabin"].str.split("/", expand=True)
training_data[["Deck", "RoomNum", "Side"]] = cabin_parts

# The split pieces are strings; cast the numeric ones to Int64 (pandas' nullable
# integer dtype) so that any missing values stay missing instead of failing the cast.
for numeric_col in ("GroupNum", "GroupPosition", "RoomNum"):
    training_data[numeric_col] = training_data[numeric_col].astype("Int64")

# NOTE: PassengerId, Cabin, and Name are not dropped here (the drop was disabled),
# so they remain available to later cells.

In [None]:
# Re-check the summary statistics now that GroupNum, GroupPosition, and RoomNum
# have been added as integer columns.
training_data.describe()

In [None]:
# Numerical columns to inspect; one histogram is drawn per column.
column_name = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for name in column_name:
    px.histogram(
        training_data,
        x=name,
        title=f'Distribution of {name}',
    ).show()

## EDA

### Home Planet

In [None]:
# Count passengers for every (HomePlanet, Transported) combination.
# This notebook refers to Transported == True as "survived".
#
# A helper column of ones is added to training_data so that summing it yields a
# row count per group. It stays on the frame afterwards.
training_data["count"] = 1
survivability_per_home_planet = (
    training_data
    # Select "count" BEFORE aggregating: summing the whole frame would also try to
    # aggregate every other column (including text columns), only to discard them.
    .groupby(["HomePlanet", "Transported"])["count"]
    .sum()
    .reset_index()
)

In [None]:
# Total number of passengers per home planet, as columns ["HomePlanet", "Total"].
#
# The column labels are set explicitly instead of renaming "count": the labels
# that value_counts() produces changed in pandas 2.0, so a rename keyed on
# "count" silently does nothing on older versions and breaks the later merge
# on "HomePlanet".
total_passengers_per_planet = (
    training_data
    .HomePlanet
    .value_counts()
    .rename_axis("HomePlanet")   # label for the planet names (the index)
    .reset_index(name="Total")   # label for the counts
)

# Per-planet counts restricted to the Transported == True rows, with the count
# column relabelled "Survived" (this notebook's term for transported passengers).
transported_counts = survivability_per_home_planet.query("Transported == True")
home_planet_survived = transported_counts.rename(columns={"count": "Survived"})

In [None]:
# Join the per-planet totals with the transported counts on HomePlanet.
# merge defaults to an inner join, so only planets present in both frames appear.
passenger_per_planet = pd.merge(
    total_passengers_per_planet,
    home_planet_survived,
    on="HomePlanet",
)

# Share of each planet's passengers that were transported.
passenger_per_planet["Survival Ratio"] = (
    passenger_per_planet["Survived"] / passenger_per_planet["Total"]
)

passenger_per_planet