## Binary Classification: Classifying survivors of the Titanic

In [7]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

from sklearn.svm import SVC

### Import training and testing data as dataframes

In [2]:
# Read the training and testing CSV files from the current directory.
rawTrainingData, rawTestingData = (
    pd.read_csv(csvPath) for csvPath in ("./train.csv", "./test.csv")
)

# Show the (rows, columns) shape of each frame, one per line.
print(rawTrainingData.shape, rawTestingData.shape, sep="\n")

(891, 12)
(418, 11)


### Prepares data for models

In [3]:
# Define target variable (the label column the models are fitted against)
target = rawTrainingData["Survived"]

# Define predictor variables
features = ["Pclass", "Sex", "SibSp", "Parch"]

# get_dummies one-hot encodes the non-numeric columns among the selected
# features; numeric columns are passed through unchanged.
predictor = pd.get_dummies(rawTrainingData[features])
testingPredictor = pd.get_dummies(rawTestingData[features])

# The two frames are encoded independently, so a category that occurs in only
# one of them would produce different column sets. Align the test matrix to
# the training columns (missing dummy columns filled with 0, columns unseen in
# training dropped) so every model receives the layout it was fitted on.
testingPredictor = testingPredictor.reindex(columns=predictor.columns, fill_value=0)

### Random Forest Model From SciKit-Learn 77.5% accuracy

In [None]:
# Random forest with 100 trees, each capped at depth 5; the fixed random_state
# seeds the estimator. fit() returns the fitted estimator, so construction and
# training are chained into one expression.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
).fit(predictor, target)

# Predicted target values for the rows of the test data
predictionsRF = model.predict(testingPredictor)

# Pair each test PassengerId with its prediction and write the table to disk
# without the index column.
outputRF = pd.DataFrame(
    {
        "PassengerId": rawTestingData["PassengerId"],
        "Survived": predictionsRF,
    }
)
outputRF.to_csv("submission.csv", index=False)

### Logistic Regression Model From SciKit-Learn 76.6% accuracy

In [None]:
# Baseline logistic regression trained on every predictor column; it is only
# used to drive the feature selection below.
logreg = LogisticRegression().fit(predictor, target)

# SelectFromModel wraps the already-fitted model (prefit=True) and the same
# selector is applied to both the training and the testing matrices.
sfm = SelectFromModel(logreg, prefit=True)
selectedFeatures, selectedTestingFeatures = (
    sfm.transform(matrix) for matrix in (predictor, testingPredictor)
)

# Second logistic regression, trained only on the selected features
logregSelected = LogisticRegression().fit(selectedFeatures, target)

# Predicted target values for the reduced testing matrix
predictionsLR = logregSelected.predict(selectedTestingFeatures)

# Pair each test PassengerId with its prediction and write the table to disk
# without the index column.
outputLR = pd.DataFrame(
    {
        "PassengerId": rawTestingData["PassengerId"],
        "Survived": predictionsLR,
    }
)
outputLR.to_csv("submissionLR.csv", index=False)

### Support Vector Machines From SciKit-Learn

In [None]:
# Create and fit an SVM model using the linear kernel (C=1 is the
# regularization parameter passed to SVC)
model = SVC(kernel='linear', C=1)
model.fit(predictor, target)

# predict() requires the feature matrix to classify; calling it with no
# argument raises a TypeError. Use the encoded test features, the same input
# the other models in this notebook predict on.
predictionsSVM = model.predict(testingPredictor)