In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import seaborn as sns

from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

import os
# Show which files are available in the input directory.
print(os.listdir("../input"))

In [None]:
# Read the three CSV files from the input directory, in order:
# training set, test set, and the per-passenger file later joined onto `test`.
train, test, y_test_0 = map(
    pd.read_csv,
    ('../input/train.csv', '../input/test.csv', '../input/gender_submission.csv'),
)

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Summary statistics for the training set.
train.describe()

In [None]:
# Pairwise correlations between Survived and the SibSp, Parch, Age and Fare
# columns, drawn as an annotated heatmap (two decimals per cell).
numeric_corr = train[["Survived", "SibSp", "Parch", "Age", "Fare"]].corr()
sns.heatmap(numeric_corr, annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
# Attach the columns of y_test_0 to the test set, matching rows on PassengerId.
# Any 'Survived' column left over from a previous run of this cell is dropped
# first, so the cell can be re-executed without a column-overlap error.
test = (
    test
    .drop(columns='Survived', errors='ignore')
    .join(y_test_0.set_index('PassengerId'), on='PassengerId')
)
test.describe()

**Preprocessing**

1. Combining the train and test datasets so that categorical features are encoded consistently

In [None]:
# Stack train and test into one frame (train rows first, fresh 0..n-1 index)
# and remove the identifier column, which is not a feature.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally to DataFrame.drop is rejected by pandas 2.x.
data = (
    pd.concat([train, test], ignore_index=True, sort=False)
    .drop(columns='PassengerId')
)

# Feature frame and target column.
X = data.drop(columns='Survived')
y = data['Survived']

In [None]:
# (rows, columns) of the combined train + test frame.
data.shape

2. Drop rows with NaNs

In [None]:
# Shape that would remain if every row containing a NaN were dropped
# (the result is only displayed; `data` itself is not modified).
data.dropna().shape

*This method can't be used because we would lose a huge amount of data.*

3. Numerical and categorical columns preprocessing

In [None]:
# Columns treated as categorical; every other column of X is treated as numerical.
cat_cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# Keep the original column order of X. A set difference would return the
# columns in an arbitrary order that can change between interpreter runs,
# which makes the resulting feature matrix layout non-reproducible.
num_cols = [col for col in X.columns if col not in cat_cols]

In [None]:
# Numerical features: missing values are replaced by 0.
# NOTE(review): 0 is not a neutral value for columns such as Age; consider a
# median-based imputation if those columns contain gaps — confirm with the data.
X_num = X[num_cols].fillna(0)

# Categorical features: missing values become the literal category 'NA', then
# every value is cast to str. `.astype(str)` replaces the element-wise
# `.applymap(str)`, which is deprecated in recent pandas releases.
X_cat = X[cat_cols].fillna('NA').astype(str)

In [None]:
# Categorical features: one-hot encoding.
# Each row is turned into a {column: value} dict. `orient='records'` yields
# exactly one dict per row in row order, without transposing the frame and
# without depending on the index labels being unique.
encoder = DV(sparse=False)
X_cat_oh = encoder.fit_transform(X_cat.to_dict(orient='records'))

# Numerical features: standardisation.
# The mean/variance are estimated on the training rows only (the first
# len(train) rows of the combined frame) so that no test-set statistics leak
# into the preprocessing; the fitted scaler is then applied to all rows.
scaler = StandardScaler()
scaler.fit(X_num.iloc[:len(train)])
X_num_scaled = scaler.transform(X_num)

# Combine features: one-hot columns first, scaled numerical columns after.
# From here on X is a dense NumPy array rather than a DataFrame.
X = np.hstack((X_cat_oh, X_num_scaled))

In [None]:
# Sanity check: the target and the feature matrix should have the same number of rows.
print("Lenth of y:", len(y.values), "\nLenth of X:", len(X))

**Logistic regression**

In [None]:
# Convert the target to a flat 1-D NumPy array of shape (n_samples,).
# scikit-learn estimators expect a 1-D target and warn when given an (n, 1)
# column vector. `np.asarray(...).ravel()` also works if this cell is run
# again after y has already been converted.
y = np.asarray(y).ravel()

In [None]:
# The combined data was stacked with the training rows first, so the first
# len(train) rows are the training part and the rest are the test part.
n_train = len(train)
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [None]:
# Candidate values for the regularisation parameter C of logistic regression.
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
# Number of cross-validation folds used by the grid search.
cv = 3

clf = LR(random_state=0)

# Use the `cv` setting defined above instead of repeating the literal, and
# flatten the target so the estimator always receives a 1-D array, whether y
# is stored as (n,) or as an (n, 1) column vector.
grid_cv = GridSearchCV(clf, param_grid, cv=cv)
grid_cv.fit(X_train, np.ravel(y_train))

# Probability assigned to the second class (column 1) for every test row,
# taken from the estimator refitted with the best C.
predicted = grid_cv.best_estimator_.predict_proba(X_test)[:, 1]

# NOTE(review): y_test is the Survived column joined from gender_submission.csv.
# If that file is a sample submission rather than ground truth, this score
# measures agreement with it, not real predictive quality — confirm.
roc_auc = roc_auc_score(np.ravel(y_test), predicted)

In [None]:
# ROC AUC of the predicted probabilities against y_test.
roc_auc

In [None]:
# Hard class predictions for the test rows from the best grid-search estimator.
hard_labels = grid_cv.best_estimator_.predict(X_test)
test_Survived = pd.Series(hard_labels, name="Survived")

# Put the passenger ids and the predictions side by side (aligned on the row
# index) and write them to result.csv without the index column.
submission_parts = [y_test_0["PassengerId"], test_Survived]
result = pd.concat(submission_parts, axis="columns")
result.to_csv("result.csv", index=False)

In [None]:
# Display the submission table that was written to result.csv.
result