In [1]:
#Imports 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

#preprocessing
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

In [None]:
df = pd.read_csv("data/powerBITrain.csv")
train = pd.read_csv("data/powerBITrain.csv")
test = pd.read_csv("data/powerBITest.csv")

# Load original test data (which has the correct order of PassengerId)
originaltest = pd.read_csv("data/test.csv")

# Sort the test dataframe to match originaltest if needed (assuming index might differ)
test = test.sort_values(by='PassengerId', ascending=True)

print(test)

### Encode objects and fill blanks

In [None]:
# Grouping Ticket by frequency
#train set
encodedTicket = train.groupby('Ticket').size() / len(train)
train.loc[:, "EncodedTicket"] = train['Ticket'].map(encodedTicket)
#test set
encodedTicket = test.groupby('Ticket').size() / len(test)
test.loc[:, "EncodedTicket"] = test['Ticket'].map(encodedTicket)

#Grouping Sex by binary encoding
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the 'Sex' column
#train set
train['SexBinary'] = label_encoder.fit_transform(train['Sex'])
#test set
test['SexBinary'] = label_encoder.fit_transform(test['Sex'])

#Grouping Embarked by one hot encoding
#fill 2 null values with mode before encoding 
embarked_mode = train['Embarked'].mode()[0]  # Calculate mode
train['Embarked'].fillna(embarked_mode, inplace=True)  # Fill NaNs with mode

#train set
train = pd.get_dummies(train, columns=['Embarked'], prefix='Embarked', drop_first=False)
#test set
test = pd.get_dummies(test, columns=['Embarked'], prefix='Embarked', drop_first=False)

# change title from object to int
encodedTitle = train.groupby('Title').size() / len(train)
train.loc[:, "encodedTitle"] = train['Title'].map(encodedTitle)
#test set
encodedTitle = test.groupby('Title').size() / len(test)
test.loc[:, "encodedTitle"] = test['Title'].map(encodedTitle)

print(train)

In [None]:
#make train and test features the same
train.drop(["Survived", "PassengerId", "Sex", "Ticket","Parch", "SibSp", "Title"], axis=1, inplace=True)
test.drop(["PassengerId","Sex", "Ticket","Parch", "SibSp", "Title"], axis=1, inplace=True)
print(train.info())
print(test.info())

In [34]:
#terative imputer
imputer = IterativeImputer(
        missing_values=np.nan,
        random_state=0,
        n_nearest_features=3,
        max_iter=20,
        sample_posterior=True,
    )
train_imputed = imputer.fit_transform(train)
test_imputed = imputer.fit_transform(test)

# Convert imputed values back to DataFrame
train = pd.DataFrame(train_imputed, columns=train.columns)
test = pd.DataFrame(test_imputed, columns=test.columns)

#No null values
#only features to be used are in DF
print(train.info())
print(test.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            712 non-null    float64
 1   FamilyCount    712 non-null    float64
 2   Fare           712 non-null    float64
 3   Pclass         712 non-null    float64
 4   EncodedTicket  712 non-null    float64
 5   SexBinary      712 non-null    float64
 6   Embarked_C     712 non-null    float64
 7   Embarked_Q     712 non-null    float64
 8   Embarked_S     712 non-null    float64
 9   encodedTitle   712 non-null    float64
dtypes: float64(10)
memory usage: 55.8 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            418 non-null    float64
 1   FamilyCount    418 non-null    float64
 2   Fare           418 non-null    flo

Unnamed: 0,Age,FamilyCount,Fare,Pclass,EncodedTicket,SexBinary,Embarked_C,Embarked_Q,Embarked_S,encodedTitle
0,0.0,1.0,8.5167,3.0,0.213483,1.0,1.0,0.0,0.0,0.050562
1,1.0,2.0,15.7417,3.0,0.213483,0.0,1.0,0.0,0.0,0.203652
2,1.0,2.0,37.0042,2.0,0.078652,1.0,1.0,0.0,0.0,0.050562
3,1.0,3.0,19.2583,3.0,0.213483,0.0,1.0,0.0,0.0,0.203652
4,1.0,3.0,19.2583,3.0,0.213483,0.0,1.0,0.0,0.0,0.203652


### Model Training

In [35]:
#split into training and testing sets
X = train.values
y = df['Survived'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# All models to apply
models = {
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=10000),
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Random Forest": RandomForestClassifier(random_state=0),
    "Support Vector Machine": SVC(random_state=0),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": xgb.XGBClassifier(random_state=0, objective='binary:logistic')
}

# Models' hyperparameters for grid search
param_grids = {
    "Logistic Regression": [
        {
            'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'logisticregression__penalty': ['l1', 'l2'],
            'logisticregression__solver': ['liblinear']
        },
        {
            'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'logisticregression__penalty': ['l2'],
            'logisticregression__solver': ['lbfgs']
        }
    ],
    "Decision Tree": {
        'decisiontreeclassifier__criterion': ['gini', 'entropy'],
        'decisiontreeclassifier__max_depth': [None, 5, 10, 20, 30],
        'decisiontreeclassifier__min_samples_split': [2, 5, 10],
        'decisiontreeclassifier__min_samples_leaf': [1, 2, 4],
        'decisiontreeclassifier__max_features': ['sqrt', 'log2', None]
    },
    "Random Forest": {
        'randomforestclassifier__n_estimators': [100, 200],
        'randomforestclassifier__max_depth': [2, 4, 5, 8],
        'randomforestclassifier__min_samples_split': [2, 5, 10],
        'randomforestclassifier__min_samples_leaf': [1, 2, 4, 8, 16],
        'randomforestclassifier__max_features': ['sqrt', 'log2', None],
        'randomforestclassifier__bootstrap': [True, False]
    },
    "Support Vector Machine": {
            'svc__C': [0.1, 1, 10, 100],
            'svc__kernel': ['linear']
    },
    "K-Nearest Neighbors": {
        'kneighborsclassifier__n_neighbors': [3, 5, 7, 10],
        'kneighborsclassifier__weights': ['uniform', 'distance'],
        'kneighborsclassifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'kneighborsclassifier__p': [1, 2]  # p=1 for Manhattan distance, p=2 for Euclidean distance
    },
    "XGBoost": {
        'xgbclassifier__n_estimators': [50, 100, 200, 300],
        'xgbclassifier__max_depth': [3, 4, 5, 6],
        'xgbclassifier__learning_rate': [0.01, 0.1, 0.2, 0.3],
        'xgbclassifier__subsample': [0.5, 0.7, 1],
        'xgbclassifier__colsample_bytree': [0.5, 0.7, 1],
        'xgbclassifier__gamma': [0, 0.1, 0.2, 0.3]
    }
}

In [28]:
#Loop through models and perform GridSearchCV
#Remove model names in front of parameter names from grid search
def strip_prefix(param_dict):
    return {key.split("_", 2)[-1]: value for key, value in param_dict.items()}

best_models = {}
for name, model in models.items():
    print(f"Training {name}...")
    #pipeline
    pipeline = make_pipeline(model)
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grids[name], scoring='accuracy', cv=5, n_jobs=4)
    grid_search.fit(X_train, y_train)

    best_accuracy = grid_search.best_score_
    # Strip the prefixes for other models use
    best_parameters = strip_prefix(grid_search.best_params_)
    
    best_models[name] = {
        "best_accuracy": best_accuracy,
        "best_parameters": best_parameters
    }
    print(f"Highest Accuracy: {round(best_models[name]['best_accuracy'], 3)}")
    

Training Logistic Regression...
Highest Accuracy: 0.808
Training Decision Tree...
Highest Accuracy: 0.821
Training Random Forest...
Highest Accuracy: 0.84
Training Support Vector Machine...
Highest Accuracy: 0.798
Training K-Nearest Neighbors...
Highest Accuracy: 0.731
Training XGBoost...
Highest Accuracy: 0.842


### Prediction

In [36]:
#Apply best model to test set 
originaltest = pd.read_csv("data/test.csv")

#Get best grid searched model
model = models['XGBoost']
best_params = best_models['XGBoost']['best_parameters']
model.set_params(**best_params)

#Predict model from whole test set
model.fit(X, y)
predictions = model.predict(test.values)

try:
    output = pd.DataFrame({'PassengerId': originaltest.PassengerId, 'Survived': predictions})
    output.to_csv('data/submissionV2.csv', index=False)
except Exception:
    print("Failed to save csv")
else:
    print("Your submission was successfully saved!")

Your submission was successfully saved!


Current submission score = 0.76076