In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [2]:
final_test = pd.read_csv("../data/processed/final_test.csv")
final_train = pd.read_csv("../data/processed/final_train.csv")

In [3]:
feature_columns_to_use = ['Age','TravelAlone','Pclass_1','Pclass_2', 'Fare','Sex_male', 'IsMinor', 'Embarked_C', 'Embarked_S']

In [6]:
# Prepare the inputs for the model
y = final_train['Survived']
X = final_train.loc[:, feature_columns_to_use]

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=104, 
                                                           test_size=0.25, 
                                                           shuffle=True)


In [23]:
# define model
model = LogisticRegression()

In [68]:
# define search space
space = dict()
#solver{‘lbfgs’, ‘liblinear’, ‘newton-cg’, ‘newton-cholesky’, ‘sag’, ‘saga’}, default=’lbfgs’
space['solver'] = ['newton-cg', 'sag']
#penalty{‘l1’, ‘l2’, ‘elasticnet’, None}, default=’l2’
#space['penalty'] = ['l1', 'l2', 'elasticnet']


In [69]:
# define search
search = GridSearchCV(model, space)

In [70]:
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.7935032326909798
Best Hyperparameters: {'solver': 'newton-cg'}




In [71]:
model = LogisticRegression(solver='newton-cg', random_state=4).fit(train_X, train_y)

In [72]:
confusion_matrix(test_y, model.predict(test_X))

array([[120,  25],
       [ 20,  58]], dtype=int64)

In [73]:
final_X = final_test.loc[:, feature_columns_to_use]

In [74]:
predictions = model.predict(final_X)

In [75]:
# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
submission = pd.DataFrame({ 'PassengerId': final_test['PassengerId'],
                            'Survived': predictions })
submission.to_csv("../data/processed/submission.csv", index=False)