In [1]:
from pathlib import Path
from pandas import DataFrame
from spaceship_titanic.jutils.visual import Plot
import pandas as pd
import plotly.express as px

In [2]:
class Data:
    path_data = Path(r'../data').resolve().absolute()
    path_raw = path_data.joinpath(r'01_raw')
    path_raw_train = path_raw.joinpath(r'train.parquet')
    path_intermediate = path_data.joinpath(r'02_intermediate')

    def get_csv_raw_train_data():
        return pd.read_csv(Data.path_raw.joinpath(r'train.csv'))

    def load_data(path: Path) -> DataFrame:
        return pd.read_parquet(path)
    
    def save_data(data: DataFrame, path: Path) -> Path:
        data.to_parquet(path)
        return path

In [3]:
Data.save_data(Data.get_csv_raw_train_data(), Data.path_raw_train)

WindowsPath('C:/Users/jevo1/Documents/Python Scripts/kaggle/spaceship-titanic/data/01_raw/train.parquet')

In [4]:
data = Data.load_data(Data.path_raw_train)

In [5]:
columns_classification = {
    'id': {'PassengerId', 'Name'},
    'categorical': {'HomePlanet', 'Cabin', 'Destination'},
    'binary': {'CryoSleep', 'VIP', 'Transported'},
    'continuous': {'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'},
    'discrete': {'Age'},
}

def validate_classification(_columns_classification, _data: DataFrame):
    item_list = []
    for _, items in _columns_classification.items():
        item_list += items
    if len(item_list) != len(set(item_list)):
        raise Exception('Hay columnas duplicadas en columns classification.')
    item_set = set(item_list)
    columns_set = set(_data.columns)
    columns_not_clasiffied = columns_set.difference(item_set)
    innexistent_classified = item_set.difference(columns_set)
    if len(columns_not_clasiffied) > 0:
        raise Exception(f'Las siguientes columnas no están clasificadas {columns_not_clasiffied}')
    if len(innexistent_classified) > 0:
        raise Exception(f'Las siguientes columnas clasificadas no existen en el dataset {innexistent_classified}')

validate_classification(columns_classification, data)

In [6]:
# Análisis exploratorio de los datos
print(f'Shape inicial: {data.shape}')
# Validando duplicados
print(f'Cantidad de indices duplicados: {data[["PassengerId"]].duplicated().sum()}')
print(f'Cantidad de nombres duplicados: {data[["Name"]].duplicated().sum()}')
print(f'Datos duplicados: {data.drop(columns=["PassengerId", "Name"]).duplicated().sum()}')
data = data.drop_duplicates()
print(f'Datos duplicados después de la eliminación de duplicados {data.shape}')

Shape inicial: (8693, 14)
Cantidad de indices duplicados: 0
Cantidad de nombres duplicados: 219
Datos duplicados: 15
Datos duplicados después de la eliminación de duplicados (8693, 14)


In [7]:
# Graficando las distribuciones de los datos
plot = Plot()
# Columnas categoricas y binarias

def graficas_descriptivas(_data: DataFrame, _columns_classification: dict):
    columnas = list(columns_classification['categorical'].union(columns_classification['binary']))
    plots = [plot.bar(data, x=column) for column in columnas]
    plot.grid_subplot(*plots, cols=3, title='Columnas categóricas y binarias', titles=columnas, height=600).show()

    # Columnas continuas
    columnas = list(columns_classification['continuous'])
    plots = [plot.histogram(data, x=column) for column in columnas]
    plot.grid_subplot(*plots, cols=3, title='Columnas continuas', titles=columnas, height=600).show()

    # discrete
    columnas = list(columns_classification['discrete'])
    plots = [plot.histogram(data, x=column) for column in columnas]
    plot.grid_subplot(*plots, cols=3, title='Columnas discretas', titles=columnas, height=600).show()

    # Box plot
    columnas = list(columns_classification['continuous'].union(columns_classification['discrete']))
    plots = [plot.box(data, y=column) for column in columnas]
    plot.grid_subplot(*plots, cols=3, title='Columnas discretas y continuas', titles=columnas, height=600).show()

graficas_descriptivas(data, columns_classification)

# Observaciones
- Se observan muchas personas que gastaron 0 pesos en varias categorías, se validará la distribución para las personas que compraron VIP y para las que no.
- Cabin tiene demasiada cardinalidad, se separará en deck/num/side.
- Hay personas con edades igual a 0, como primera suposición se dirá que son bebés, por lo tanto deben ir en grupos con al menos una persona adulta. Esto mismo aplicará para los niños, deben ir acompañados de un adulto, se considerarán adultos a personas mayores a 18 años. Los que no cumplan estas condiciones se considerarán outliers.

In [8]:
# Columnas continuas Si VIP
columnas = list(columns_classification['continuous'])
plots = [plot.histogram(data[data['VIP'] == True], x=column) for column in columnas]
plot.grid_subplot(*plots, cols=3, title='Columnas continuas si VIP', titles=columnas, height=600).show()

# Columnas continuas No VIP
columnas = list(columns_classification['continuous'])
plots = [plot.histogram(data[data['VIP']==False], x=column) for column in columnas]
plot.grid_subplot(*plots, cols=3, title='Columnas continuas no VIP', titles=columnas + columnas, height=600).show()


# Boxplot
columnas = list(columns_classification['continuous'])
plots = [plot.box(data[(data['VIP'] == True) | (data['VIP'] == False)], y=column, x='VIP') for column in columnas]
plot.grid_subplot(*plots, cols=3, title='Columnas continuas boxplot', titles=columnas, height=600).show()

## Observaciones
- Se considerarán outliers aquellos que hayan gastado mas de 20000

In [9]:
# Eliminando outliers de las columnas categoricas
from functools import reduce

print(f'Shape antes de eliminar outliers {data.shape}')
data = data[~(reduce(lambda a, b: a | b, [data[column] >= 20000 for column in columns_classification['continuous']]))]
print(f'Shape después de eliminar outliers {data.shape}')

Shape antes de eliminar outliers (8693, 14)
Shape después de eliminar outliers (8684, 14)


In [10]:
# Columnas continuas Si VIP
columnas = list(columns_classification['continuous'])
plots = [plot.histogram(data[data['VIP'] == True], x=column) for column in columnas]
plot.grid_subplot(*plots, cols=3, title='Columnas continuas si VIP', titles=columnas, height=600).show()

# Columnas continuas No VIP
columnas = list(columns_classification['continuous'])
plots = [plot.histogram(data[data['VIP']==False], x=column) for column in columnas]
plot.grid_subplot(*plots, cols=3, title='Columnas continuas no VIP', titles=columnas + columnas, height=600).show()


# Boxplot
columnas = list(columns_classification['continuous'])
plots = [plot.box(data[(data['VIP'] == True) | (data['VIP'] == False)], y=column, x='VIP') for column in columnas]
plot.grid_subplot(*plots, cols=3, title='Columnas continuas boxplot', titles=columnas, height=600).show()

In [11]:
# Validación de datos nulos
nulos = data.isna()
nulos_por_columna = nulos.sum()
filas_con_un_nulo = reduce(lambda a, b: a | b, [nulos[columna] for columna in nulos.columns if columna != 'Name'])
cant_filas_con_un_nulo = filas_con_un_nulo.sum()

print(f'Nulos por columna')
print(nulos_por_columna)
print()
print(f'Cant filas con al menos un nulo')
print(cant_filas_con_un_nulo)

Nulos por columna
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     180
FoodCourt       183
ShoppingMall    208
Spa             182
VRDeck          188
Name            198
Transported       0
dtype: int64

Cant filas con al menos un nulo
1927


In [12]:
# Eliminando nulos
print(f'Shape antes de eliminar nulos {data.shape}')
data = data.drop(columns='Name').dropna()
print(f'Shape después de eliminar nulos {data.shape}')

Shape antes de eliminar nulos (8684, 14)
Shape después de eliminar nulos (6757, 13)


In [13]:

data['PassengerId'].head()

0    0001_01
1    0002_01
2    0003_01
3    0003_02
4    0004_01
Name: PassengerId, dtype: object

In [14]:
# Separando columnas
# deck/num/side
splitted = data['Cabin'].str.split('/')
data['deck'], data['num'], data['side'] = (splitted.apply(lambda x: x[0]), splitted.apply(lambda x: x[1]), splitted.apply(lambda x: x[2]))

# Group_person_number
splitted = data['PassengerId'].str.split('_')
data['Group'], data['person_number'] = (splitted.apply(lambda x: x[0]), splitted.apply(lambda x: x[1]))

In [15]:
columns_classification['id'].add('num')
columns_classification['id'].add('person_number')
columns_classification['id'].add('Group')
columns_classification['categorical'].add('deck')
columns_classification['categorical'].add('side')
columns_classification['id'].discard('Name')
columns_classification['categorical'].discard('Cabin')
columns_classification['id'].add('Cabin')
validate_classification(_columns_classification=columns_classification, _data=data)

In [16]:
graficas_descriptivas(data, columns_classification)

In [17]:
# Análisis bivariable
y_column = 'Transported'

In [18]:
from phik import phik_matrix
corr_matrix = data.drop(columns=columns_classification['id']).phik_matrix()

interval columns not set, guessing: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


In [19]:
px.imshow(corr_matrix, color_continuous_scale='blues')

In [20]:
columns_classification

{'id': {'Cabin', 'Group', 'PassengerId', 'num', 'person_number'},
 'categorical': {'Destination', 'HomePlanet', 'deck', 'side'},
 'binary': {'CryoSleep', 'Transported', 'VIP'},
 'continuous': {'FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck'},
 'discrete': {'Age'}}

In [21]:
columnas = list(columns_classification['continuous'].union(columns_classification['discrete']))
plots = [plot.box(data[data['CryoSleep'] == False], y=columna, x='Transported', nbins=0) for columna in columnas]
plot.grid_subplot(*plots, cols=3, title='Variables continuas vs Objetivo para personas que no hibernaron', titles=columnas).show()

columnas = list(columns_classification['binary'].difference({'Transported'}).union(columns_classification['categorical']))
plots = [plot.heatmap(data, x=y_column, y=columna, color_continuous_scale='ylgnbu') for columna in columnas]
plot.grid_subplot(*plots, cols=3, title='Variables binarias vs Objetivo', titles=columnas, height=800).show()

In [58]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import make_scorer

oe = OrdinalEncoder()

X = data.drop(columns=columns_classification['id'].union({'Transported'}))
y = data['Transported']
X_transformed = oe.fit_transform(X)

tc = DecisionTreeClassifier(max_depth=8)
rf = RandomForestClassifier()

modelos_a_probar = {
    'Decision tree': {'modelo': tc},
    'Random forest': {'modelo': rf}
}

In [61]:
from tqdm import tqdm
from datetime import datetime
import numpy as np

for nombre_modelo, dic_modelo in tqdm(modelos_a_probar.items(), desc='Realizando cross validation...'):
    inicial = datetime.now()
    modelo = dic_modelo['modelo']
    dic_modelo['scores'] = cross_val_score(modelo, X_transformed, y, cv=5, scoring=make_scorer(f1_score))
    tiempo_entrenamiento = (datetime.now() - inicial).total_seconds()
    dic_modelo['tiempo_entrenamiento'] = tiempo_entrenamiento
    dic_modelo['media'] = np.mean(dic_modelo['scores'])
    dic_modelo['std'] = np.std(dic_modelo['scores'])

Realizando cross validation...: 100%|██████████| 2/2 [00:04<00:00,  2.03s/it]


In [62]:
tabla_comparativa = pd.DataFrame(modelos_a_probar).transpose()
tabla_comparativa

Unnamed: 0,modelo,scores,tiempo_entrenamiento,media,std
Decision tree,DecisionTreeClassifier(max_depth=8),"[0.7512839325018341, 0.7785923753665689, 0.799...",0.104008,0.784569,0.018851
Random forest,RandomForestClassifier(),"[0.7728965003723007, 0.781319495922906, 0.7959...",3.954507,0.785726,0.008086


In [64]:
trained_tc = DecisionTreeClassifier(max_depth=5)
trained_tc.fit(X_transformed, y)

In [65]:
feature_importance = pd.DataFrame(zip(X.columns, trained_tc.feature_importances_), columns=['Columna', 'Importancia'])

In [66]:
feature_importance.sort_values(by='Importancia', ascending=False)

Unnamed: 0,Columna,Importancia
1,CryoSleep,0.497344
6,FoodCourt,0.098652
0,HomePlanet,0.091473
3,Age,0.091051
7,ShoppingMall,0.069635
8,Spa,0.047554
9,VRDeck,0.039344
10,deck,0.03065
5,RoomService,0.019562
11,side,0.007964


In [None]:
# Modelo inicial:
