In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [2]:
# Read the labelled training set and the unlabelled test set from the working directory.
train, test = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]

In [3]:
# Count the missing values in every column of the training data.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Count the missing values in every column of the test data.
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [5]:
# Generate NameLength: the number of characters in each passenger's name.
# The same feature is added to both frames so they keep identical columns.
for frame in (train, test):
    frame['NameLength'] = frame['Name'].map(len)

In [6]:
# Encode Sex as an integer: female -> 0, male -> 1.
# NOTE: values that are not keys of the mapping become NaN, so this cell
# should be run only once on freshly loaded data.
sex_mapping = dict(female=0, male=1)
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_mapping)

In [7]:
#insert missing age values , using median value
from sklearn.preprocessing import Imputer
imr_age = Imputer(missing_values='NaN',strategy='median',axis=0)
imr_age.fit(train[['Age']].values)
train['Age'] = imr_age.transform(train[['Age']].values)
test['Age'] = imr_age.transform(test[['Age']].values)

In [8]:
#insert missing Fare values, using median value only test dataset
imr_fare = Imputer(missing_values='NaN',strategy='median',axis=0)
test['Fare'] = imr_fare.fit_transform(test[['Fare']].values)

In [9]:
# Columns fed to the models, in the order they appear in the feature matrix.
feature_column = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'NameLength',
    'SibSp',
    'Parch',
]
# Label column; kept as a one-element list.
target_column = ['Survived']

In [10]:
# Feature matrix: one row per passenger, one column per entry of feature_column.
X = train.loc[:, feature_column].values
# Label vector: the single target column as a 1-D array.
y = train[target_column[0]].values

In [11]:
# Hold out 30% of the labelled data for validation (fixed seed for reproducibility).
# The `sklearn.cross_validation` module has been removed from scikit-learn;
# `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Baseline model: standardise the features, then fit a logistic regression.
lr_steps = [
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(random_state=0)),
]
pipe_lr = Pipeline(steps=lr_steps)
pipe_lr.fit(X_train, y_train)

# Mean accuracy on the held-out split.
holdout_accuracy = pipe_lr.score(X_test, y_test)
print(holdout_accuracy)

0.80223880597


In [13]:
# Tune the regularisation strength (C) and penalty of the logistic regression
# with 10-fold cross-validated grid search on the training split.
# The `sklearn.grid_search` module has been removed from scikit-learn;
# `GridSearchCV` is provided by `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV
# Use a float base: NumPy refuses to raise an integer to a negative integer
# power, which is what `10**i` over `np.arange(-4, 5)` amounts to.
params_range = [10.0 ** i for i in range(-4, 5)]
# The solver is pinned to liblinear because it handles both 'l1' and 'l2';
# the current default solver (lbfgs) rejects the 'l1' penalty.
param_grid =[{'clf__C':params_range,'clf__penalty':['l1'],'clf__solver':['liblinear']},
            {'clf__C':params_range,'clf__penalty':['l2'],'clf__solver':['liblinear']}]

gs = GridSearchCV(estimator=pipe_lr,
                 param_grid=param_grid,
                 scoring='accuracy',
                 cv=10,n_jobs=-1)

gs = gs.fit(X_train,y_train)
# Best mean cross-validated accuracy and the parameter combination that achieved it.
print(gs.best_score_)
print(gs.best_params_)

0.800963081862
{'clf__penalty': 'l2', 'clf__C': 0.10000000000000001}


In [14]:
# Fit a random forest classifier on the training split.
from sklearn.ensemble import RandomForestClassifier
# Create the forest: 100 entropy-criterion trees, depth capped at 10,
# seeded for reproducibility and trained on all available cores.
forest = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    n_jobs=-1,
    random_state=0,
)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [15]:
# Compare random forest accuracy on the training split and the held-out split;
# a large gap between the two indicates over-fitting.
from sklearn.metrics import accuracy_score
pred_train = forest.predict(X_train)
pred_test = forest.predict(X_test)
# Format inside the print call: `print('...') % value` only works as a Python 2
# print statement; under Python 3 it prints the raw format string and then
# raises TypeError because print() returns None.
print('train accuracy: %.3f' % accuracy_score(y_train,pred_train))
print('test accuracy: %.3f' % accuracy_score(y_test,pred_test))

train accuracy: 0.931
test accuracy: 0.828


In [17]:
# Feature matrix for the unlabelled test passengers, with the same columns
# (and column order) as the matrix the models were trained on.
test_cal = test.loc[:, feature_column].values
# Passenger ids, kept so each prediction can be matched to its passenger.
test_index = test.loc[:, 'PassengerId'].values

In [19]:
# Predict a Survived label for every row of the test feature matrix using the random forest.
prediction = forest.predict(test_cal)

In [21]:
# Assemble the submission table: one row per test passenger, pairing the
# passenger id with its predicted survival label.
submission_columns = {'PassengerId': test_index, 'Survived': prediction}
submission = pd.DataFrame(submission_columns)

In [22]:
# Write the submission table to 'submission.csv'; index=False leaves the DataFrame's row index out of the file.
submission.to_csv('submission.csv',index=False)