In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split

In [50]:
import numpy as np

In [44]:
# preprocessing / pipeline
from sklearn import preprocessing


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [31]:
final_test = pd.read_csv("../data/processed/final_test.csv")
final_train = pd.read_csv("../data/processed/final_train.csv")

In [55]:
x_array = np.array(final_test['Age'])
y_array = np.array(final_train['Age'])

In [75]:
normalizer = preprocessing.Normalizer()

In [87]:
final_test['Age'] = normalizer.fit_transform([x_array])[0]
final_train['Age'] = normalizer.fit_transform([y_array])[0]

In [89]:
x_array = np.array(final_test['Fare'])
y_array = np.array(final_train['Fare'])
final_test['Fare'] = normalizer.fit_transform([x_array])[0]
final_train['Fare'] = normalizer.fit_transform([y_array])[0]

In [90]:
final_test.head()

Unnamed: 0,PassengerId,Age,Fare,TravelAlone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male,IsMinor
0,892,0.052115,0.005788,1,0,0,1,0,1,0,1,0
1,893,0.070997,0.005175,0,0,0,1,0,0,1,0,0
2,894,0.093655,0.007162,1,0,1,0,0,1,0,1,0
3,895,0.040785,0.006404,1,0,0,1,0,0,1,1,0
4,896,0.033233,0.009084,0,0,0,1,0,0,1,0,0


In [91]:
feature_columns_to_use = ['Age','TravelAlone','Pclass_1','Pclass_2', 'Fare','Sex_male', 'IsMinor', 'Embarked_C', 'Embarked_S']

In [92]:
# Prepare the inputs for the model
y = final_train['Survived']
X = final_train.loc[:, feature_columns_to_use]

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=104, 
                                                           test_size=0.25, 
                                                           shuffle=True)


In [96]:
# define model
model = LogisticRegression()

In [107]:
# define search space
space = dict()
#solver{‘lbfgs’, ‘liblinear’, ‘newton-cg’, ‘newton-cholesky’, ‘sag’, ‘saga’}, default=’lbfgs’
space['solver'] = ['newton-cg', 'sag']
#penalty{‘l1’, ‘l2’, ‘elasticnet’, None}, default=’l2’
space['penalty'] = ['l2']


In [108]:
# define search
search = GridSearchCV(model, space)

In [109]:
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.7778042809616471
Best Hyperparameters: {'penalty': 'l2', 'solver': 'newton-cg'}


In [110]:
model = LogisticRegression(solver='newton-cg', penalty = 'l2', random_state=4).fit(train_X, train_y)

In [111]:
confusion_matrix(test_y, model.predict(test_X))

array([[113,  32],
       [ 23,  55]], dtype=int64)

In [112]:
final_X = final_test.loc[:, feature_columns_to_use]

In [113]:
predictions = model.predict(final_X)

In [114]:
# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
submission = pd.DataFrame({ 'PassengerId': final_test['PassengerId'],
                            'Survived': predictions })
submission.to_csv("../data/processed/submission.csv", index=False)

In [115]:
import kaggle

In [116]:
! kaggle competitions submit -c titanic -f ../data/processed/submission.csv -m "Logistic regression with normalisation applied"

Successfully submitted to Titanic - Machine Learning from Disaster


  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 29.8kB/s]
100%|##########| 3.18k/3.18k [00:01<00:00, 1.67kB/s]



