### Pre-Processing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import pickle

In [None]:
# Load the raw training and test splits from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Function to pre-process a dataset (fits the transformers in 'Train' mode, reuses them otherwise)
def pre_process_dataset(df,
                        mode='Train',
                        mean_imputer = None,
                        mode_imputer = None,
                        oh_encoder = None):
  """Engineer features, impute missing values and one-hot encode a dataset.

  The input frame is never modified; all work happens on a copy.

  Steps:
    * split ``Cabin`` on "/" into ``Cabin_deck``, ``Cabin_num`` (float) and
      ``Cabin_starboard``;
    * take the part of ``PassengerId`` before "_" as ``Passenger_group``;
    * mean-impute ``Age`` and most-frequent-impute the categorical columns;
    * append one-hot columns for the categorical columns (the original
      categorical columns are kept as well);
    * fill missing spending columns with 0 and missing ``Cabin_num`` with -1;
    * drop ``Name`` and ``Cabin`` (and ``PassengerId`` in 'Train' mode only).

  Args:
    df: DataFrame containing at least ``Cabin``, ``PassengerId``, ``Name``,
      ``Age``, the categorical columns and the spending columns; in 'Train'
      mode it must also contain ``Transported``.
    mode: ``'Train'`` fits fresh transformers on ``df`` (any transformers
      passed in are ignored). Any other value takes the transform-only path
      and requires the three fitted transformers below.
    mean_imputer: fitted imputer for ``Age`` (transform-only path).
    mode_imputer: fitted imputer for the categorical columns
      (transform-only path).
    oh_encoder: fitted one-hot encoder for the categorical columns
      (transform-only path).

  Returns:
    In 'Train' mode the tuple ``(df, mean_imputer, mode_imputer, oh_encoder,
    label_encoder)``, where ``Transported`` has been label-encoded.
    Otherwise just the processed DataFrame, which still has ``PassengerId``.

  Raises:
    ValueError: if ``mode`` is not ``'Train'`` and any of the three
      transformers is missing.
  """
  cat_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_deck','Cabin_starboard']
  money_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
  is_train = mode == 'Train'

  # Fail fast with a readable message instead of an AttributeError on None
  # halfway through the pipeline.
  if not is_train:
    supplied = (('mean_imputer', mean_imputer),
                ('mode_imputer', mode_imputer),
                ('oh_encoder', oh_encoder))
    missing = [name for name, obj in supplied if obj is None]
    if missing:
      raise ValueError(
          "mode=%r requires fitted transformers; missing: %s"
          % (mode, ', '.join(missing)))

  # Work on a private copy so the caller's frame is left untouched.
  df = df.copy()

  # Cabin looks like "<deck>/<num>/<side>"; split once and reuse the parts.
  cabin_parts = df['Cabin'].str.split("/")
  df['Cabin_deck'] = cabin_parts.str[0]
  df['Cabin_num'] = cabin_parts.str[1].astype('float64')
  df['Cabin_starboard'] = cabin_parts.str[2]

  df['Passenger_group'] = df['PassengerId'].str.split("_").str[0]

  # Imputers are fitted on the raw training columns only.
  if is_train:
    mean_imputer = SimpleImputer(strategy='mean').fit(df[['Age']])
    mode_imputer = SimpleImputer(strategy='most_frequent').fit(df[cat_columns])

  df[['Age']] = mean_imputer.transform(df[['Age']])
  df[cat_columns] = mode_imputer.transform(df[cat_columns])

  # The encoder must see the already-imputed categorical columns.
  if is_train:
    oh_encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
    oh_encoder.fit(df[cat_columns])

  one_hot = pd.DataFrame(oh_encoder.transform(df[cat_columns]),
                         columns=oh_encoder.get_feature_names_out(),
                         index=df.index)
  df = pd.concat([df,one_hot],axis=1)

  if is_train:
    label_encoder = LabelEncoder()
    df['Transported'] = label_encoder.fit_transform(df['Transported'])

  df[money_columns] = df[money_columns].fillna(0)
  df['Cabin_num'] = df['Cabin_num'].fillna(-1)

  if is_train:
    df = df.drop(columns=['Name','Cabin','PassengerId'])
    return df,mean_imputer, mode_imputer, oh_encoder,label_encoder

  # PassengerId is kept on the transform-only path.
  df = df.drop(columns=['Name','Cabin'])
  return df

In [None]:
# Fit all transformers on a copy of the training data and keep them for reuse.
(pre_process_df,
 mean_imputer,
 mode_imputer,
 oh_encoder,
 label_encoder) = pre_process_dataset(train_df.copy(), mode='Train')

In [None]:
# Apply the transformers fitted on the training data to a copy of the test data.
pre_process_df_test = pre_process_dataset(
    test_df.copy(),
    mode='Test',
    mean_imputer=mean_imputer,
    mode_imputer=mode_imputer,
    oh_encoder=oh_encoder,
)

In [None]:
# Write both processed frames to CSV, omitting the row index.
pre_process_df.to_csv(path_or_buf='pre_processed_train.csv', index=False)
pre_process_df_test.to_csv(path_or_buf='pre_process_df_test.csv', index=False)

In [None]:
import pickle
# Bundle the fitted transformers under descriptive keys so they can be
# restored together later.
objects_to_save = dict(
    mean_imputer=mean_imputer,
    mode_imputer=mode_imputer,
    oh_encoder=oh_encoder,
    label_encoder=label_encoder,
)

# Serialize the whole bundle into a single pickle file.
with open('saved_objects.pkl', 'wb') as f:
    pickle.dump(objects_to_save, f)
