In [1]:
import numpy as np
import pandas as pd
import os

# Load the Titanic CSV files found under the data directory.
#
# The directory is assembled with os.path.join instead of the literal
# "titanic\data": the backslash form only resolves on Windows, and "\d" is an
# invalid escape sequence that recent Python versions warn about.
data_dir = os.path.join("titanic", "data")

# initialize datasets
gender_submission = None
test = None
train = None

# find file paths for and read in datasets
for subdir, dirs, files in os.walk(data_dir):
    for file in files:
        csv_path = os.path.join(subdir, file)

        # Files are matched by substring of the file name; the three checks
        # are independent, so a name matching several would be read for each.
        if 'gender_submission' in file:
            gender_submission = pd.read_csv(csv_path)

        if 'test' in file:
            test = pd.read_csv(csv_path)

        if 'train' in file:
            train = pd.read_csv(csv_path)

# Fail early with a readable message rather than an AttributeError on None
# when the walk above found nothing (wrong working directory, missing data).
missing = [name for name, frame in (("train", train), ("test", test)) if frame is None]
if missing:
    raise FileNotFoundError(
        f"Could not find dataset file(s) for {missing} under {data_dir!r}"
    )

# Replace every missing value with 0 so the models below never receive NaNs.
train = train.fillna(0)
test = test.fillna(0)

In [2]:
# Survival rate by sex, computed on the training set.
# NOTE: the printed values are fractions (survivors / passengers), not
# percentages, even though the labels say "%".
rate_label = "% of {} who survived: {}"

# women: select the 'Survived' column for female passengers in one .loc step
is_female = train.Sex == 'female'
women = train.loc[is_female, 'Survived']
rate_women = sum(women) / len(women)

print(rate_label.format("women", rate_women))


# men: same computation for male passengers
is_male = train.Sex == 'male'
men = train.loc[is_male, 'Survived']
rate_men = sum(men) / len(men)

print(rate_label.format("men", rate_men))

% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924


In [4]:
# random forests
from sklearn.ensemble import RandomForestClassifier

# Target: the 'Survived' column of the training set.
Y = train['Survived']

features = ['Pclass', 'Sex', 'SibSp', 'Parch']
X = pd.get_dummies(train[features])
# The test set is one-hot encoded independently, so its dummy columns can
# differ from the training ones (a category absent from one split, or a
# different column order). Reindex onto the training columns so the model
# always sees the layout it was fitted on; absent dummies become 0.
X_test = pd.get_dummies(test[features]).reindex(columns=X.columns, fill_value=0)

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, Y)
# Mean accuracy on the training data itself, not a held-out estimate.
print(model.score(X, Y))

predictions = model.predict(X_test)

# Write the submission file: one row per test passenger.
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('submission_RandomForests.csv', index=False)
print('output successful')

0.8159371492704826
output successful


In [45]:
# logistic regression
from sklearn.linear_model import LogisticRegression
logisticRegressionModel = LogisticRegression()

# Target: the 'Survived' column of the training set.
Y = train['Survived']

features = ['Pclass', 'Sex', 'SibSp', 'Parch']
X_train = pd.get_dummies(train[features])
# The test set is one-hot encoded independently, so its dummy columns can
# differ from the training ones (a category absent from one split, or a
# different column order). Reindex onto the training columns so the model
# always sees the layout it was fitted on; absent dummies become 0.
X_test = pd.get_dummies(test[features]).reindex(columns=X_train.columns, fill_value=0)

logisticRegressionModel.fit(X_train, Y)
# Mean accuracy on the training data itself, not a held-out estimate.
print(logisticRegressionModel.score(X_train, Y))

predictions = logisticRegressionModel.predict(X_test)

# Write the submission file: one row per test passenger.
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('submission_LogisticRegression.csv', index=False)
print('output successful')

0.8002244668911336
output successful


In [55]:
# decision tree regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Target: the 'Survived' column of the training set.
Y = train['Survived']
features = ['Pclass', 'Sex', 'SibSp', 'Parch']

X_train = pd.get_dummies(train[features])
# The test set is one-hot encoded independently, so its dummy columns can
# differ from the training ones (a category absent from one split, or a
# different column order). Reindex onto the training columns so the model
# always sees the layout it was fitted on; absent dummies become 0.
X_test = pd.get_dummies(test[features]).reindex(columns=X_train.columns, fill_value=0)

decisionTreeRegressorModel = DecisionTreeRegressor(random_state=1)
decisionTreeRegressorModel.fit(X_train, Y)

# For a regressor, score() is the R^2 coefficient on the training data, not
# classification accuracy, so this number is not directly comparable to the
# accuracies printed by the classifier cells.
print(decisionTreeRegressorModel.score(X_train, Y))

# The regressor returns continuous values; threshold at 0.5 to obtain class
# labels and cast to int so the CSV holds 0/1 rather than 0.0/1.0.
raw_predictions = np.asarray(decisionTreeRegressorModel.predict(X_test))
predictions = (raw_predictions >= 0.5).astype(int)

# Write the submission file: one row per test passenger.
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('submission_DecisionTreeRegression.csv', index=False)
print('output successful')

0.47112154431804354
output successful
