In [32]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader

In [5]:
def clean_csv(dataframe):
    """Impute missing numeric values and integer-encode 'Sex' and 'Embarked'.

    Missing entries in 'Pclass', 'Age', 'SibSp', 'Parch' and 'Fare' are
    replaced by that column's mean. The labels in 'Sex' and 'Embarked' are
    replaced by integer codes; values that are not keys of the mappings
    (including missing values) are left untouched.

    The frame that is passed in is modified and also returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing at least the seven columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    numeric_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    for column in numeric_columns:
        if dataframe[column].isnull().any():
            # Assign the filled column back instead of calling
            # fillna(..., inplace=True) on the selection: the chained in-place
            # form acts on a temporary object and does not update the frame
            # when pandas Copy-on-Write is active.
            dataframe[column] = dataframe[column].fillna(dataframe[column].mean())

    # NOTE(review): 'S' is encoded as 3 (2 is skipped). Kept unchanged because
    # altering the code would change the feature values seen downstream --
    # confirm whether 2 was intended.
    encodings = {
        'Sex': {'male': 0, 'female': 1},
        'Embarked': {'C': 0, 'Q': 1, 'S': 3},
    }
    for column, mapping in encodings.items():
        # infer_objects() gives the column a numeric dtype when every remaining
        # value is numeric, independent of whether replace() itself downcasts.
        dataframe[column] = dataframe[column].replace(mapping).infer_objects()

    return dataframe

In [35]:
class TitanicDataset(Dataset):
    """Map-style dataset that pairs each entry of ``X`` with the entry of ``Y`` at the same index."""

    def __init__(self, X, Y):
        # Both containers are stored as given; nothing is copied or converted.
        self.X, self.Y = X, Y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(X[idx], Y[idx])`` pair for the given index."""
        return self.X[idx], self.Y[idx]

In [49]:
def prepare_data(csv_name, opt, batch_size, scaler, device=None):
    """Load a CSV, clean it with ``clean_csv`` and return scaled model inputs.

    Parameters
    ----------
    csv_name : str or path-like
        CSV file passed to ``pandas.read_csv``.
    opt : int
        ``0`` selects the training branch; any other value selects the test
        branch.
    batch_size : int
        Batch size of the returned ``DataLoader`` (training branch only).
    scaler : object
        Scaler exposing ``fit_transform`` and ``transform``. It is fitted in
        the training branch and reused in the test branch, so the same
        instance should be passed to both calls, training first.
    device : str or torch.device, optional
        Device for the test tensor. When omitted, ``'cuda'`` is used if
        available, otherwise ``'cpu'``.

    Returns
    -------
    torch.utils.data.DataLoader
        When ``opt == 0``: a shuffled loader over ``TitanicDataset`` built
        from the scaled features and the 'Survived' column.
    tuple
        Otherwise: ``(X_test, passenger_id)`` where ``X_test`` is a tensor of
        scaled features on ``device`` and ``passenger_id`` is the
        'PassengerId' column restricted to the rows kept in ``X_test``.
    """
    # Kept from the original implementation: this changes NumPy's global
    # print formatting for the whole process.
    np.set_printoptions(precision=4, suppress=True)

    feature_columns = ['Sex', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    cleaned = clean_csv(pd.read_csv(csv_name))

    if opt == 0:
        train_frame = cleaned[feature_columns + ['Survived']].dropna()
        # Fit exactly once. A second fit_transform on the already-scaled
        # matrix would overwrite the scaler's statistics with those of the
        # scaled data, making it useless for transforming raw test features.
        X_train = scaler.fit_transform(train_frame[feature_columns].values)
        # Double brackets keep the labels two-dimensional, one column wide.
        Y_train = train_frame[['Survived']].values
        train_dataset = TitanicDataset(X_train, Y_train)
        return DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_frame = cleaned[feature_columns].dropna()
    # Pick the ids by the surviving index so they stay aligned with the rows
    # of X_test even when dropna() removed some rows.
    passenger_id = cleaned.loc[test_frame.index, 'PassengerId']
    X_test = test_frame.values

    try:
        # Reuse the statistics learned on the training data; re-fitting on the
        # test set leaks test information and scales it inconsistently.
        X_test = scaler.transform(X_test)
    except Exception as exc:
        # Matched by class name so no scikit-learn import is needed here.
        if type(exc).__name__ != 'NotFittedError':
            raise
        import warnings
        warnings.warn(
            "prepare_data: scaler is not fitted; fitting it on the test data. "
            "Call prepare_data with opt=0 and the same scaler first.",
            RuntimeWarning,
            stacklevel=2,
        )
        X_test = scaler.fit_transform(X_test)

    if device is None:
        # Previously hard-coded to 'cuda', which fails on CPU-only machines.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.tensor(X_test).to(device), passenger_id