# Machine Learning

### Loading the packages

In [1]:
import warnings
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score

# models
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

warnings.filterwarnings("ignore")

### Preparing the data

In [2]:
data = pd.read_csv("../data/train.csv")

In [3]:
X = data.drop(columns=["PassengerId", "Survived"])
y = data["Survived"]

Standardizing the data using `StandardScaler`.

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Choosing the model

Setting the list of all the models into a dictionary.

In [5]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Ridge Classifier": RidgeClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "K-Neighbours Classifier": KNeighborsClassifier(),
    "SVC": SVC()
}

Evaluating all the models using K-Fold Cross Validation and visualising their performances.

In [6]:
performance = {}

for name, model in models.items():
    cv_score = cross_val_score(model, X, y, cv=5).mean()
    performance[name] = cv_score

performance_df = pd.DataFrame({"Model": performance.keys(), "CV Score": performance.values()})
performance_df.sort_values(by="CV Score", ascending=False)

Unnamed: 0,Model,CV Score
4,Gradient Boosting Classifier,0.824939
6,SVC,0.824901
3,Random Forest Classifier,0.814839
5,K-Neighbours Classifier,0.804733
1,Ridge Classifier,0.793491
0,Logistic Regression,0.786768
2,Decision Tree Classifier,0.774433


### Finetuning the model

The best model is `GradientBoostingClassifier`.

In [7]:
model = GradientBoostingClassifier()

Defining a parameter grid to perform the Grid Search tuning.

In [8]:
param_grid = {
    "criterion": ["fridman_mse", "squared_error"],
    "loss": ["log_loss", "deviance", "exponential"],
    "learning_rate": [0.001, 0.01, 0.1],
    "max_depth": [3, 4],
    "n_estimators": [50, 100, 200],
    "tol": [0.00001, 0.0001, 0.001]
}

Performing the Grid Search and visualising the final state of the model.

In [9]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X, y)
model = grid_search.best_estimator_

print("Final Accuracy:", cross_val_score(model, X, y, cv=5).mean())
model.fit(X, y)

Final Accuracy: 0.8384031134266525


### Making a submission

In [10]:
test = pd.read_csv("../data/test.csv")

In [11]:
X = test.drop(columns=["PassengerId"])
X = scaler.transform(X)
prediction = model.predict(X)

In [12]:
prediction = pd.DataFrame({"Survived": prediction})
submission = pd.concat([test["PassengerId"], prediction], axis=1)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


Saving the submission.

In [13]:
submission.to_csv("../data/submission.csv", index=False)