In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter("ignore")

In [2]:
train_data = pd.read_csv("train.csv")
pred_data = pd.read_csv("test.csv")

In [3]:
from sklearn.base import BaseEstimator,TransformerMixin

In [4]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [5]:
class AttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self,add_family_size = True):
        self.add_family_size = add_family_size
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        X['family_size'] = X['SibSp'] + X['Parch']
        return X

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
from sklearn.pipeline import Pipeline
try:
    from sklearn.impute import SimpleImputer
except:
    from sklearn.preprocessing import Imputer as SimpleImputer

In [8]:
from sklearn.pipeline import Pipeline
try:
    from sklearn.impute import SimpleImputer
except:
    from sklearn.preprocessing import Imputer as SimpleImputer

In [9]:
num_pipeline = Pipeline([
    ("select_numeric",DataFrameSelector(["Age","SibSp","Parch","Fare"])),
    ("att_add",AttributeAdder()),
    ("imputer",SimpleImputer(strategy="median")),
    ("scaling",StandardScaler())
])

In [10]:
num_pipeline.fit_transform(train_data)

array([[-0.56573646,  0.43279337, -0.47367361, -0.50244517,  0.05915988],
       [ 0.66386103,  0.43279337, -0.47367361,  0.78684529,  0.05915988],
       [-0.25833709, -0.4745452 , -0.47367361, -0.48885426, -0.56097483],
       ...,
       [-0.1046374 ,  0.43279337,  2.00893337, -0.17626324,  1.29942929],
       [-0.25833709, -0.4745452 , -0.47367361, -0.04438104, -0.56097483],
       [ 0.20276197, -0.4745452 , -0.47367361, -0.49237783, -0.56097483]])

In [11]:
class MostFrequentImputer(BaseEstimator,TransformerMixin):
    def fit(self, X , y = None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],index=X.columns)
        return self
    def transform(self, X , y = None):
        return X.fillna(self.most_frequent_)

In [12]:
rare_titles = ['Dr.','Rev.','Major.','Col.','Capt.','Don.','Sir.','Lady.']

In [13]:
class CatAttributeAdder(BaseEstimator,TransformerMixin):
    def __init__(self, add_att = True):
        self.add_att = add_att
    def fit(self, X , y = None):
        return self
    def transform(self,X ,y = None):
        X['Title'] = X.apply(lambda x:x['Name'].split(',')[1].split()[0],axis=1)
        X['Rare Title'] = X['Title'].apply(lambda x:1 if(x in rare_titles) else 0)
        X.drop(columns = ['Title','Name'],inplace=True)
        return X

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
cat_pipeline = Pipeline([
    ("select_cat",DataFrameSelector(["Name","Pclass","Sex","Embarked"])),
    ("cat_adder",CatAttributeAdder()),
    ("imputer",MostFrequentImputer()),
    ("cat_encoder",OneHotEncoder(sparse=False)),
])

In [16]:
cat_pipeline.fit_transform(train_data)

array([[0., 0., 1., ..., 1., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 1., 1., 0.],
       ...,
       [0., 0., 1., ..., 1., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [17]:
from sklearn.pipeline import FeatureUnion

In [18]:
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline",num_pipeline),
    ("cat_pipeline",cat_pipeline),
])

In [19]:
X_train = preprocess_pipeline.fit_transform(train_data)

In [20]:
y_train = train_data["Survived"]

In [21]:
X_test = preprocess_pipeline.fit_transform(pred_data)

## Logistic Regression

In [22]:
from sklearn.model_selection import cross_val_score

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
log_reg = LogisticRegression()

In [25]:
log_scores = cross_val_score(log_reg,X_train,y_train,cv=10)

In [26]:
log_scores.mean()

0.7991848257859493

In [27]:
from sklearn.model_selection import cross_val_predict

In [28]:
y_train_pred = cross_val_predict(log_reg,X_train,y_train,cv=10)

In [29]:
from sklearn.metrics import confusion_matrix,precision_recall_curve,roc_auc_score,roc_curve

In [30]:
confusion_matrix(y_train,y_train_pred)

array([[474,  75],
       [104, 238]], dtype=int64)

In [31]:
log_reg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
test_pred = log_reg.predict(X_test)

In [33]:
pred_data['Survived'] = test_pred

In [34]:
pred_data[['PassengerId','Survived']].to_csv("log_reg_submission.csv",index=False)

## Linear Discriminant Analysis

In [35]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [37]:
lda = LinearDiscriminantAnalysis(solver='lsqr',shrinkage='auto')

In [38]:
lda.fit(X_train,y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage='auto',
                           solver='lsqr', store_covariance=False, tol=0.0001)

In [39]:
lda_train_pred = lda.predict(X_train)

In [40]:
confusion_matrix(lda_train_pred,y_train)

array([[474, 104],
       [ 75, 238]], dtype=int64)

In [41]:
lda_predictions = lda.predict(X_test)

In [42]:
pred_data['Survived'] = lda_predictions

In [43]:
pred_data[['PassengerId','Survived']].to_csv("lda_submission.csv",index=False)

In [44]:
from sklearn.covariance import OAS

In [45]:
oa = OAS(store_precision=False,assume_centered=False)

In [47]:
lda_2 = LinearDiscriminantAnalysis(solver='lsqr',shrinkage='auto',store_covariance=oa)

In [48]:
lda_2.fit(X_train,y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage='auto',
                           solver='lsqr',
                           store_covariance=OAS(assume_centered=False,
                                                store_precision=False),
                           tol=0.0001)

In [54]:
lda_2_train_pred = lda_2.predict(X_train)

In [55]:
confusion_matrix(lda_2_train_pred,y_train)

array([[474, 104],
       [ 75, 238]], dtype=int64)

In [56]:
lda_2_predictions = lda_2.predict(X_test)

In [57]:
pred_data['Survived'] = lda_2_predictions

In [58]:
pred_data[['PassengerId','Survived']].to_csv("lda_submission_covariance.csv",index=False)