In [1]:
!pip install xgboost



In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import precision_score,accuracy_score,confusion_matrix
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.ensemble import VotingClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn import metrics,svm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

In [3]:
def process_train(file):
    df = pd.read_csv(file)
    categorical = ['Pclass', 'Embarked']

    cols = ["PassengerId","Survived","Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    df = df[cols]

    df['Sex'].replace({'male': 1, 'female': 0}, inplace=True)
    df['Embarked'].fillna('C', inplace=True)
    df['Age'].fillna(df['Age'].mean(), inplace = True)

    ### CREATE ONE HOT ENCODINGS

    for var in categorical:
        df = pd.concat([df, pd.get_dummies(df[var], prefix=var)], axis=1)
        del df[var]

    ### STANDARDIZE DATA

    # a = df["PassengerId"]
    # df = (df - df.min()) / (df.max() - df.min())
    # df["PassengerId"] = a

    return df



def process_test(file):
    df = pd.read_csv(file)
    categorical = ['Pclass', 'Embarked']

    cols = ["PassengerId","Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    df = df[cols]

    df['Sex'].replace({'male': 1, 'female': 0}, inplace=True)
    df['Embarked'].fillna('C', inplace=True)
    df['Age'].fillna(df['Age'].median(), inplace = True)
    df['Fare'].fillna(df['Fare'].median(), inplace = True)

    ### CREATE ONE HOT ENCODINGS

    for var in categorical:
        df = pd.concat([df, pd.get_dummies(df[var], prefix=var)], axis=1)
        del df[var]

    ### STANDARDIZE DATA

    # a = df["PassengerId"]
    # df = (df - df.min()) / (df.max() - df.min())
    # df["PassengerId"] = a

    return df

In [4]:
df = process_train("Downloads/train.csv")

In [5]:
X = df.drop("Survived", axis = 1)

In [6]:
df['Survived'] = df['Survived'].astype('category')
y = df["Survived"]

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.33,
                                                        random_state=0,
                                                        stratify=y)

In [8]:
pca = PCA(n_components='mle', svd_solver='auto')
x_pca = pca.fit_transform(x_train)

In [9]:
# pca.get_feature_names_out(input_features=cols)

In [10]:
# Prepare data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(x_pca,y_train)

In [11]:
# x_train, x_test, y_train, y_test = cross_validation.train_test_split(X,y,train_size=.8, stratify=y)

# y_test

In [12]:
models = [
    (XGBClassifier(), {
        "max_depth": [3, 6,10],
        "learning_rate": [0.01,0.1, 0.001],
        "n_estimators": [100, 300, 500],
        "colsample_bytree": [0.3, 0.5, 0.7]

    }),
        (svm.NuSVC(gamma="auto",probability = True), {
        "nu": [0.25, 0.5, 0.75],
        "kernel": ["linear", "rbf", "poly"]
        }),
    
    (MLPClassifier(max_iter=10000,random_state=1),{
        "alpha" :[1e-5,1e-3],
        "hidden_layer_sizes" : [5,50,100],
        "solver" : ['lbfgs',"adam","sgd"],
        "learning_rate" : ["invscaling","constant"],
        "learning_rate_init" : [0.001,0.01,0.1]
        }),

                  
     (GradientBoostingClassifier(), {
        "max_depth": [3,6,10],
        "learning_rate":[0.01, 0.1, 0.001],
        "n_estimators": [100, 300, 500]
        # "loss": ["squared_error", "absolute_error", "huber"]
    })
]

vr_estimators = []
for model, parameters in models:
    grid_search = GridSearchCV(estimator=model, cv=5, param_grid=parameters, scoring = "roc_auc")
    grid_search.fit(scaled_data, y_train)
    vr_estimators.append((model.__class__.__name__, grid_search.best_estimator_))


# Create the VotingRegressor using the best estimators
vr = VotingClassifier(vr_estimators, voting = "soft")
vote_reg = vr.fit(scaled_data, y_train)

In [13]:
preds = vote_reg.predict(scaled_data)

In [14]:
cm = confusion_matrix(y_train, preds)

# # Visualize confusion matrix
# plt.figure(figsize=(8, 6))
# sns.set(font_scale=1.2)  # Adjust font size
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
#             xticklabels=['Predicted Negative', 'Predicted Positive'],
#             yticklabels=['True Negative', 'True Positive'])
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.title('Confusion Matrix')
# plt.show()

In [15]:
cm

array([[348,  19],
       [ 68, 161]])

In [16]:
accuracy_train = accuracy_score(y_train,preds)
accuracy_train

0.8540268456375839

In [17]:
x_test = pca.transform(x_test)

In [18]:
x_test = scaler.transform(x_test)

In [19]:
pred_val = vote_reg.predict(x_test)

In [20]:
accuracy = accuracy_score(y_test,pred_val)

In [21]:
accuracy

0.8305084745762712

In [22]:
df_val = process_test("Downloads/test.csv")

In [23]:
x_val= pca.transform(df_val)

In [24]:
x_val = scaler.transform(x_val)

In [25]:
pred_valid = vote_reg.predict(x_val)

In [26]:
# Assuming pred_valid is a numpy array and x_val_index is a pandas Index object
# combined_test = pd.DataFrame({'PassengerId': df_val["PassengerId"], 'Survived': pred_valid}, index=df_val.index)


In [27]:
# Create DataFrame without specifying index
combined_test = pd.DataFrame({'PassengerId': df_val["PassengerId"].values, 'Survived': pred_valid})


In [28]:
combined_test

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [29]:
combined_test.to_csv("submission.csv", index = False)