In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv(r'D:\Kaggle\titanic spaceship\spaceship-titanic\train.csv')
test_df = pd.read_csv(r'D:\Kaggle\titanic spaceship\spaceship-titanic\test.csv')
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

In [3]:

class SplitColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed[['GroupId', 'Id']] = X_transformed['PassengerId'].str.split('_', expand=True)
        X_transformed[['FirstName', 'SecondName']] = X_transformed['Name'].str.split(' ', expand=True)
        X_transformed[['Deck', 'Num', 'Side']] = X_transformed['Cabin'].str.split('/', expand=True)
        
        return X_transformed


In [4]:
cat_columns = ['HomePlanet', 'CryoSleep', 'VIP', 'Destination', 'Deck', 'Side', 'Num', 'Age', 'SecondName']
train_df_index = train_df['PassengerId']
test_df_index = test_df['PassengerId']

In [5]:
class FillMissingValuesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.groupby_col = 'GroupId'

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed = X_transformed.groupby(self.groupby_col).apply(self._fill_missing_values, X_transformed).reset_index(drop=True)
        
        return X_transformed

    def _fill_missing_values(self, group, df):
        for col in self.columns:
            if col == 'SecondName':
                mode_value_df = 'None'
            else:
                mode_value_df = df[col].mode()[0] 
            mode_value = group[col].mode()[0] if not group[col].mode().empty else mode_value_df
            group[col] = group[col].fillna(mode_value)
        return group


# def fill_missing_values(group, df,columns):
#     for col in columns:
#              if col == 'SecondName':
#                  mode_value_df = 'None'
#              else:
#                  mode_value_df = df[col].mode()[0] 
#              mode_value = group[col].mode()[0] if not group[col].mode().empty else mode_value_df
#              group[col] = group[col].fillna(mode_value)
#     return group

# cat_columns = ['HomePlanet','CryoSleep','VIP','Destination','Deck','Side','Num','Age','SecondName']
# train_df = train_df.groupby('GroupId').apply(fill_missing_values,train_df,cat_columns]).reset_index(drop=True)
# family_count = train_df.groupby('SecondName').size()
# family_count['None']=1
# train_df['FamilyCount'] = train_df['SecondName'].map(family_count)

In [6]:
class FillFamilyInfo(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self,X):
        X_transformed = X.copy()
        family_count = X_transformed.groupby('SecondName').size()
        family_count['None'] = 1
        X_transformed['FamilyCount'] = X_transformed['SecondName'].map(family_count)
        return X_transformed



In [7]:
class ProcessColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.age_bins = [-np.inf,4, 10, 18, 30, 40, 50, np.inf]
        self.age_labels = [0, 1, 2, 3, 4, 5, 6]
        self.spending_bins = [-np.inf, 1, 500, 1000, 3000, 6000, 10000, np.inf]
        self.spending_labels = [0, 1, 2, 3, 4, 5, 6]
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed['TotalSpendings'] = (X_transformed[['RoomService', 'FoodCourt', 'Spa', 'VRDeck', 'ShoppingMall']].fillna(0).sum(axis=1)
        X_transformed['Age_category'] = pd.cut(X_transformed['Age'], bins=self.age_bins, labels=self.age_labels)
        X_transformed['TotalSpendings_cat'] = pd.cut(X_transformed['TotalSpendings'], bins=self.spending_bins, labels=self.spending_labels)
        X_transformed.drop(['TotalSpendings','Age','RoomService', 'FoodCourt', 'Spa', 'VRDeck', 'ShoppingMall', 'Cabin', 'FirstName', 'PassengerId', 'Id', 'Name'], axis=1, inplace=True)
        return X_transformed

In [8]:
one_hot_columns = ['HomePlanet','Destination','Deck']
label_columns = ['VIP','CryoSleep','SecondName','Side']

In [9]:
class CustomEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, one_hot_columns=None, label_encode_columns=None):
        self.one_hot_columns = one_hot_columns
        self.label_encode_columns = label_encode_columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        for col in self.label_encode_columns:
            X_transformed[col] = LabelEncoder().fit_transform(X_transformed[col].astype(str))
        X_transformed = pd.get_dummies(X_transformed, columns=self.one_hot_columns)
        
        return X_transformed

pipeline = Pipeline([
    ('split_columns', SplitColumnsTransformer()),
    ('fill_missing_values', FillMissingValuesTransformer(columns=cat_columns)),
    ('fill_family_info', FillFamilyInfo()),
    ('process_columns', ProcessColumnsTransformer()),
    ('encoder', CustomEncoder(one_hot_columns,label_columns))
])

pipeline.fit(train_df)

train_df = pipeline.transform(train_df)
test_df = pipeline.transform(test_df)

  group[col] = group[col].fillna(mode_value)
  X_transformed = X_transformed.groupby(self.groupby_col).apply(self._fill_missing_values, X_transformed).reset_index(drop=True)
  group[col] = group[col].fillna(mode_value)
  X_transformed = X_transformed.groupby(self.groupby_col).apply(self._fill_missing_values, X_transformed).reset_index(drop=True)
  group[col] = group[col].fillna(mode_value)
  X_transformed = X_transformed.groupby(self.groupby_col).apply(self._fill_missing_values, X_transformed).reset_index(drop=True)


In [10]:
y_train = train_df['Transported']
train_df.drop(['Transported'], axis=1,inplace=True)
X_train = train_df
X_test = test_df

In [11]:
train_df

Unnamed: 0,CryoSleep,VIP,GroupId,SecondName,Num,Side,FamilyCount,TotalSpendings,Age_category,HomePlanet_Earth,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
0,0,0,0001,1432,0,0,1,0.0,4,False,...,False,True,False,True,False,False,False,False,False,False
1,0,0,0002,2110,0,1,4,736.0,3,True,...,False,True,False,False,False,False,False,True,False,False
2,0,1,0003,1991,0,1,6,10383.0,6,False,...,False,True,True,False,False,False,False,False,False,False
3,0,0,0003,1991,0,1,6,5176.0,4,False,...,False,True,True,False,False,False,False,False,False,False
4,0,0,0004,1779,1,1,6,1091.0,2,True,...,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,1,9276,1417,98,0,3,8536.0,5,False,...,False,False,True,False,False,False,False,False,False,False
8689,1,0,9278,1341,1499,1,2,0.0,2,True,...,True,False,False,False,False,False,False,False,True,False
8690,0,0,9279,470,1500,1,6,1873.0,3,True,...,False,True,False,False,False,False,False,False,True,False
8691,0,0,9280,996,608,1,6,4637.0,4,False,...,False,False,False,False,False,False,True,False,False,False


In [12]:
print(X_train)

      CryoSleep  VIP GroupId  SecondName   Num  Side  FamilyCount  \
0             0    0    0001        1432     0     0            1   
1             0    0    0002        2110     0     1            4   
2             0    1    0003        1991     0     1            6   
3             0    0    0003        1991     0     1            6   
4             0    0    0004        1779     1     1            6   
...         ...  ...     ...         ...   ...   ...          ...   
8688          0    1    9276        1417    98     0            3   
8689          1    0    9278        1341  1499     1            2   
8690          0    0    9279         470  1500     1            6   
8691          0    0    9280         996   608     1            6   
8692          0    0    9280         996   608     1            6   

      TotalSpendings Age_category  HomePlanet_Earth  ...  \
0                0.0            4             False  ...   
1              736.0            3              True

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df), columns=train_df.columns)
X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)

print(X_train_scaled.isna().sum())



CryoSleep                    0
VIP                          0
GroupId                      0
SecondName                   0
Num                          0
Side                         0
FamilyCount                  0
TotalSpendings               0
Age_category                 0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
Deck_A                       0
Deck_B                       0
Deck_C                       0
Deck_D                       0
Deck_E                       0
Deck_F                       0
Deck_G                       0
Deck_T                       0
dtype: int64


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)


In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score as ras
models = [LogisticRegression(), XGBClassifier(),
          SVC(kernel='rbf', probability=True)]
 
for i in range(len(models)):
    models[i].fit(X_train, y_train)
 
    print(f'{models[i]} : ')
 
    train_preds = models[i].predict_proba(X_train)[:, 1]
    print('Training Accuracy : ', ras(y_train, train_preds))
 
    val_preds = models[i].predict_proba(X_val)[:, 1]
    print('Validation Accuracy : ', ras(y_val, val_preds))
    print()


LogisticRegression() : 
Training Accuracy :  0.7847102324427165
Validation Accuracy :  0.7933562446590947

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...) : 
Training Accuracy :  0.9871249069401936
Validation Accuracy :  0.8076784689096483

SVC(probability=True) : 
Training Accuracy :  0.8

In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import pandas as pd

# Define the model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.3]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best ROC AUC Score: {grid_search.best_score_}")
best_model = grid_search.best_estimator_



Fitting 5 folds for each of 324 candidates, totalling 1620 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 150, 'subsample': 0.8}
Best ROC AUC Score: 0.8197098015598362


In [17]:
test_probs = best_model.predict_proba(X_test_scaled)[:, 1]

submission = pd.DataFrame({
    'PassengerId': test_df_index,  
    'Transported': test_probs  >= 0.5
})

submission.to_csv(r'D:\Kaggle\titanic spaceship\submission.csv', index=False)

In [18]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Assume 'binary_output' is the binary column and 'categorical_column' is a categorical column

# for column in cat_columns:
#     sns.countplot(x=column, hue='Transported', data=train_df)
#     plt.title('Count Plot of Categorical Column by Binary Output')
#     plt.show()
#     sns.barplot(x=column, y='Transported', data=train_df, errorbar=None)
#     plt.title('Bar Plot of Binary Output by Categorical Column')
#     plt.show()



In [19]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# for col in ['Age','Num','RoomService','Spa','FoodCourt','VRDeck']:


#     sns.boxplot(x='Transported', y=col, data=train_df)
#     plt.title('Box Plot of Continuous Variable by Binary Class')
#     plt.show()
#     # Create a violin plot
#     sns.violinplot(x='Transported', y=col, data=train_df)
#     plt.title('Violin Plot of Continuous Variable by Binary Class')
#     plt.show()
#     sns.histplot(data=train_df, x=col, hue='Transported', kde=True, element='step')
#     plt.title('Histogram of Continuous Variable by Binary Class')
#     plt.show()
    
#     # Alternatively, use KDE plot directly
#     sns.kdeplot(data=train_df, x=col, hue='Transported', fill=True)
#     plt.title('KDE Plot of Continuous Variable by Binary Class')
#     plt.show()

In [20]:

# bins = [0, 10, 18, 30, 40, 50,np.inf]
# train_df['Age_category'] = pd.cut(train_df['Age'], bins=bins)

# # Visualize the relationship using a count plot
# sns.countplot(x='Age_category', hue='Transported', data=train_df)
# plt.title('Count Plot of Binned Continuous Variable vs. Categorical Column')
# plt.xticks(rotation=45)
# plt.show()

# # Alternatively, visualize using a heatmap of the cross-tabulation
# cross_tab = pd.crosstab(train_df['Age_category'], train_df['Transported'])
# sns.heatmap(cross_tab, annot=True, cmap="YlGnBu")
# plt.title('Heatmap of Binned Continuous Variable vs. Categorical Column')
# plt.show()

# # Another option: Stacked bar plot
# cross_tab.plot(kind='bar', stacked=True, colormap='Set2')
# plt.title('Stacked Bar Plot of Binned Continuous Variable vs. Categorical Column')
# plt.xticks(rotation=45)
# plt.ylabel('Count')
# plt.show()

In [21]:
# # Calculate the proportions
# for column in cat_columns:
#     cross_tab = pd.crosstab(train_df[column], train_df['Transported'], normalize='index')
    
#     # Plot stacked bar chart
#     cross_tab.plot(kind='bar', stacked=True, colormap='coolwarm')
#     plt.title('Stacked Bar Plot of Categorical Column by Binary Output')
#     plt.show()


In [22]:
# def mode_percentage_multiple_columns(group, columns):
#     results = {}
#     for col in columns:
#         # Check if all values are NaN
#         if group[col].isna().all():
#             results[col] = {'mode': np.nan, 'mode_percentage': np.nan}
#         else:
#             # Find the mode of the column
#             mode_value = group[col].mode()
#             if mode_value.empty:
#                 results[col] = {'mode': np.nan, 'mode_percentage': np.nan}
#             else:
#                 mode_value = mode_value[0]  # Take the first mode if there are multiple
#                 # Calculate the percentage of this mode in the group
#                 mode_count = (group[col] == mode_value).sum()
#                 total_count = group[col].count()
#                 mode_percentage = (mode_count / total_count) * 100
#                 results[col] = {'mode': mode_value, 'mode_percentage': mode_percentage}
#     return pd.Series(results)

# # List of columns to process


# # Group by 'groupid' and apply the function
# result = train_df.groupby('group_id').apply(lambda x: mode_percentage_multiple_columns(x, cat_columns))
# print(result)

In [23]:
# def fill_missing_values(group, df,columns):
#     for col in columns:
#              if col == 'SecondName':
#                  mode_value_df = 'None'
#              else:
#                  mode_value_df = df[col].mode()[0] 
#              mode_value = group[col].mode()[0] if not group[col].mode().empty else mode_value_df
#              group[col] = group[col].fillna(mode_value)
#     return group

# cat_columns = ['HomePlanet','CryoSleep','VIP','Destination','Deck','Side','Num','Age','SecondName']
# train_df = train_df.groupby('GroupId').apply(fill_missing_values,train_df,cat_columns]).reset_index(drop=True)
# family_count = train_df.groupby('SecondName').size()
# family_count['None']=1
# train_df['FamilyCount'] = train_df['SecondName'].map(family_count)

In [24]:
# train_df_index = train_df['PassengerId']
# test_df_index = test_df['PassengerId']

# def process_columns(df):
#     df['TotalSpendings'] = df[['RoomService', 'FoodCourt', 'Spa', 'VRDeck', 'ShoppingMall']].fillna(0).sum(axis=1)
#     bins = [-np.inf, 10, 18, 30, 40, 50,np.inf]
#     df['Age_category'] = pd.cut(train_df['Age'], bins=bins,labels=[0,1,2,3,4,5])
#     bins=[-np.inf,1,500,1000,3000,6000,10000,np.inf]
#     df['TotalSpendings_cat'] = pd.cut(train_df['TotalSpendings'],bins=bins,labels=[0,1,2,3,4,5,6])
#     df.drop(['RoomService', 'FoodCourt', 'Spa', 'VRDeck', 'ShoppingMall', 'Cabin', 'FirstName', 'PassengerId', 'Id','Name'], axis=1, inplace=True)


# process_columns(train_df)

In [25]:
# non_zero_spendings = train_df[train_df['TotalSpendings'] > 0]

# # Plotting the distribution of Spendings (ignoring zeros)
# plt.figure(figsize=(8, 6))
# bins=[-np.inf,1,500,1000,3000,6000,10000,np.inf]
# sns.histplot(non_zero_spendings['TotalSpendings'], bins=bins, kde=True)
# plt.title('Distribution of Spendings (Ignoring Zeros)')
# plt.xlabel('Spendings')
# plt.ylabel('Frequency')
# plt.show()

In [26]:

# bins = [0, 10, 18, 30, 40, 50,np.inf]
# train_df['Age_category'] = pd.cut(train_df['Age'], bins=bins)

# # Visualize the relationship using a count plot
# sns.countplot(x='Age_category', hue='Transported', data=train_df)
# plt.title('Count Plot of Binned Continuous Variable vs. Categorical Column')
# plt.xticks(rotation=45)
# plt.show()

# # Alternatively, visualize using a heatmap of the cross-tabulation
# cross_tab = pd.crosstab(train_df['Age_category'], train_df['Transported'])
# sns.heatmap(cross_tab, annot=True, cmap="YlGnBu")
# plt.title('Heatmap of Binned Continuous Variable vs. Categorical Column')
# plt.show()

# # Another option: Stacked bar plot
# cross_tab.plot(kind='bar', stacked=True, colormap='Set2')
# plt.title('Stacked Bar Plot of Binned Continuous Variable vs. Categorical Column')
# plt.xticks(rotation=45)
# plt.ylabel('Count')
# plt.show()

In [27]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Assume 'binary_output' is the binary column and 'categorical_column' is a categorical column
# sns.countplot(x='FamilyCount', hue='Transported', data=train_df)
# plt.title('Count Plot of Categorical Column by Binary Output')
# plt.show()
# sns.barplot(x='FamilyCount', y='Transported', data=train_df, errorbar=None)
# plt.title('Bar Plot of Binary Output by Categorical Column')
# plt.show()
