In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

from sklearn.linear_model import LogisticRegression




In [2]:
data = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')

In [3]:
def prepareData(data, cabin=False):
    import numpy as np
    import pandas as pd

    #function for separating initials
#     def sepInitials(name):
#         return name.split(',')[1].split('.')[0].strip()
#     df_initials = pd.DataFrame({'Salutation':data['Name'].apply(sepInitials)})
#     df_initials = pd.DataFrame({'Salutation':data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())})
    data = pd.merge(data, pd.DataFrame({'Salutation':data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())}), left_index=True, right_index=True)


    def group_salutation(sa):
        if sa == 'Mr':
            return 'Mr'
        elif sa== 'Mrs':
            return 'Mrs'
        elif sa== 'Miss':
            return 'Miss'
        elif sa=='Master':
            return 'Master'
        else:
            return 'others'
    df_3 = pd.DataFrame({'g_salutation': data['Salutation'].apply(group_salutation)})
    data = pd.merge(data, df_3, left_index=True, right_index=True)

    table = data.pivot_table(values='Age', index=['g_salutation'], columns=['Pclass', 'Sex'], aggfunc=np.median)
    
    def fage(x):
        return table[x['Pclass']][x['Sex']][x['g_salutation']]
    data['Age'].fillna(data[data['Age'].isnull()].apply(fage, axis=1), inplace=True)

    data.drop('Name', axis=1, inplace=True)
    title_dumies = pd.get_dummies(data['g_salutation'], prefix='g_salutation')
    data = pd.concat([data, title_dumies], axis=1)
    data.drop('g_salutation', axis=1, inplace=True)


    data.Embarked.fillna(data.Embarked.max(), inplace=True)
    embarked_dummies = pd.get_dummies(data['Embarked'], prefix='Embarked')
    data = pd.concat([data, embarked_dummies], axis=1)
    data.drop('Embarked', axis=1, inplace=True)


    #processing gender
    data['Sex'] = data['Sex'].map({'male':1, 'female':0})



    #encoding and cleaning cabin
    data.Cabin.fillna('U', inplace=True)
    #mapping each cabin value with the cabin letter
    data['Cabin'] = data['Cabin'].map(lambda c:c[0])
    cabin_dummies = pd.get_dummies(data['Cabin'], prefix='Cabin')
    data = pd.concat([data, cabin_dummies], axis=1)
    data.drop('Cabin', axis=1, inplace=True)


    pclass_dummies = pd.get_dummies(data['Pclass'], prefix='Pclass')
    data = pd.concat([data, pclass_dummies], axis=1)
    data.drop('Pclass', axis=1, inplace=True)

    #creating a new feature called family size
    data['FamilySize'] = data['Parch'] + data['SibSp'] +1 
    data['Singleton'] = data['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    data['SmallFamily'] = data['FamilySize'].map(lambda s: 1 if 2<=s<=4 else 0)
    data['LargeFamily'] = data['FamilySize'].map(lambda s: 1 if 5<=s else 0)

    data.drop(['Salutation','Ticket', 'PassengerId'], axis=1, inplace=True)
    
    if cabin:
        data.drop(['Cabin_T'], axis=1, inplace=True)

    if (data.Age.isnull().sum() > 0):
        data_test.Age.fillna(data_test.Age.median(), inplace=True)
    if (data.Fare.isnull().sum() > 0):
        data_test.Fare.fillna(data_test.Fare.median(), inplace=True)


    return data


In [4]:
data = prepareData(data)

In [5]:
x = data.iloc[:,1:]
targets = data.iloc[:, 0]

In [6]:
def compute_score(clf, X, y, scoring='accuracy'):
    xval = cross_val_score(clf, X, y, cv = 2, scoring=scoring)
    return np.mean(xval)


In [7]:
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(x, targets)

In [8]:
features = pd.DataFrame()
features['feature'] = x.columns
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)

features.plot(kind='barh', figsize=(20,20))

<matplotlib.axes._subplots.AxesSubplot at 0x7f87ce9a14d0>

In [9]:
# #transforming the model, removing the poor performance features
model = SelectFromModel(clf, prefit=True, threshold=0.012120)
train_reduced = model.transform(x)
print train_reduced.shape

(891, 17)


In [10]:
x_train, x_test, y_train, y_test = train_test_split(train_reduced, targets)

In [11]:
# #grid search
# pipeline = Pipeline([
#     ('clf', RandomForestClassifier(criterion='entropy'))
# ])
# parameters = {
#     'clf__n_estimators': (5, 10, 20, 50),
#     'clf__max_depth': (50, 150, 250),
#     'clf__min_samples_split': (1, 2, 3),
#     'clf__min_samples_leaf': (1, 2, 3)
# }

In [13]:
# grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
# grid_search.fit(x_train, y_train)

# print 'Best score: %0.3f' % grid_search.best_score_
# print 'Best parameters set:'

# best_parameters = grid_search.best_estimator_.get_params()
# for param_name in sorted(parameters.keys()):
#     print '\t%s: %r' % (param_name, best_parameters[param_name])
    
# predictions = grid_search.predict(x_test)
# print classification_report(y_test, predictions)

In [14]:
parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}
    
model = RandomForestClassifier(**parameters)
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=6, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [15]:
predictions = model.predict(x_test)

In [16]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, predictions)

0.82511210762331844