In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import seaborn as sns

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [6]:
train_data.shape,test_data.shape

((891, 12), (418, 11))

In [7]:
train_data.info(),test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float

(None, None)

In [8]:
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [9]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
from sklearn.base import BaseEstimator,TransformerMixin

In [11]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [15]:
class AttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self,add_family_size = True):
        self.add_family_size = add_family_size
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        X['family_size'] = X['SibSp'] + X['Parch']
        return X

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
from sklearn.pipeline import Pipeline
try:
    from sklearn.impute import SimpleImputer
except:
    from sklearn.preprocessing import Imputer as SimpleImputer

In [16]:
num_pipeline = Pipeline([
    ("select_numeric",DataFrameSelector(["Age","SibSp","Parch","Fare"])),
    ("att_add",AttributeAdder()),
    ("imputer",SimpleImputer(strategy="median")),
    ("scaling",StandardScaler())
])

In [17]:
num_pipeline.fit_transform(train_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


array([[-0.56573646,  0.43279337, -0.47367361, -0.50244517,  0.05915988],
       [ 0.66386103,  0.43279337, -0.47367361,  0.78684529,  0.05915988],
       [-0.25833709, -0.4745452 , -0.47367361, -0.48885426, -0.56097483],
       ...,
       [-0.1046374 ,  0.43279337,  2.00893337, -0.17626324,  1.29942929],
       [-0.25833709, -0.4745452 , -0.47367361, -0.04438104, -0.56097483],
       [ 0.20276197, -0.4745452 , -0.47367361, -0.49237783, -0.56097483]])

In [18]:
class MostFrequentImputer(BaseEstimator,TransformerMixin):
    def fit(self, X , y = None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],index=X.columns)
        return self
    def transform(self, X , y = None):
        return X.fillna(self.most_frequent_)

In [40]:
rare_titles = ['Dr.','Rev.','Major.','Col.','Capt.','Don.','Sir.','Lady.']

In [54]:
class CatAttributeAdder(BaseEstimator,TransformerMixin):
    def __init__(self, add_att = True):
        self.add_att = add_att
    def fit(self, X , y = None):
        return self
    def transform(self,X ,y = None):
        X['Title'] = X.apply(lambda x:x['Name'].split(',')[1].split()[0],axis=1)
        X['Rare Title'] = X['Title'].apply(lambda x:1 if(x in rare_titles) else 0)
        X.drop(columns = ['Title','Name'],inplace=True)
        return X

In [55]:
from sklearn.preprocessing import OneHotEncoder

In [56]:
cat_pipeline = Pipeline([
    ("select_cat",DataFrameSelector(["Name","Pclass","Sex","Embarked"])),
    ("cat_adder",CatAttributeAdder()),
    ("imputer",MostFrequentImputer()),
    ("cat_encoder",OneHotEncoder(sparse=False)),
])

In [57]:
cat_pipeline.fit_transform(train_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


array([[0., 0., 1., ..., 1., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 1., 1., 0.],
       ...,
       [0., 0., 1., ..., 1., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [58]:
from sklearn.pipeline import FeatureUnion

In [59]:
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline",num_pipeline),
    ("cat_pipeline",cat_pipeline),
])

In [60]:
X_train = preprocess_pipeline.fit_transform(train_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [61]:
y_train = train_data["Survived"]

In [62]:
X_test = preprocess_pipeline.fit_transform(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [63]:
from sklearn.ensemble import RandomForestClassifier

In [64]:
from sklearn.model_selection import cross_val_score

In [65]:
forest_clf = RandomForestClassifier(n_estimators=100,random_state=42)

In [66]:
forest_scores = cross_val_score(forest_clf,X_train,y_train,cv=10)

In [67]:
forest_scores.mean()

0.8070372261945297

In [68]:
from sklearn.model_selection import cross_val_predict

In [69]:
y_train_pred = cross_val_predict(forest_clf,X_train,y_train,cv=10)

In [70]:
from sklearn.metrics import confusion_matrix,precision_recall_curve,roc_auc_score,roc_curve

In [71]:
confusion_matrix(y_train,y_train_pred)

array([[465,  84],
       [ 88, 254]], dtype=int64)

In [72]:
forest_clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [73]:
test_pred = forest_clf.predict(X_test)

In [74]:
test_data['Survived'] = test_pred

In [75]:
test_data[['PassengerId','Survived']].to_csv("rf_submission.csv",index=False)

In [76]:
from sklearn.linear_model import LogisticRegression

In [77]:
log_clf = LogisticRegression()

In [78]:
log_clf.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [79]:
log_pred = log_clf.predict(X_test)

In [80]:
y_train_lof = log_clf.predict(X_train)

In [81]:
confusion_matrix(y_train,y_train_lof)

array([[478,  71],
       [102, 240]], dtype=int64)

In [82]:
test_data['Survived'] = log_pred

In [83]:
test_data[['PassengerId','Survived']].to_csv("log_submission.csv",index=False)