## 2015-03-16

- ランダムフォレスト + ロジスティック回帰

In [253]:
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [254]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

In [255]:
df_train.drop('PassengerId', axis=1, inplace=True)
df_train.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C


In [256]:
def _extract_title(name):
    if name.find('Mr.') > 0:
        return 'Mr'
    elif name.find('Mrs.') > 0:
        return 'Mrs'
    elif name.find('Master.') > 0:
        return 'Master'
    elif name.find('Miss.') > 0:
        return 'Miss'
    else:
        return None
    
def extract_title(df):
    df['Title'] = df.Name.apply(lambda n: _extract_title(n))
    title_bin = pd.get_dummies(df.Title)
    title_bin.rename(columns=lambda x: 'title' + "_" + str(x), inplace=True)
    df = df.join(title_bin)
    return df

In [257]:
import math
def _fill_fare(row):
    fare = row[0]
    sibsp = row[1]
    parch = row[2]
    pclass = row[3]
    family = min(max(1, sibsp + parch), 4)
    
    if fare is None or fare == 0 or math.isnan(fare):
        if pclass == 1:
            fare = 86
        elif pclass == 2:
            fare = 21
        else:
            fare = 10
    else:
        pass
        #fare = fare/float(family)
    return np.log(fare)
    
    
def fill_fare(df):
    df['FareFill'] = df[['Fare', 'SibSp', 'Parch', 'Pclass']].apply(_fill_fare, axis=1)
    return df

In [262]:
def fill_age(df):
    df['AgeFill'] = df.Age
    df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 1)] = 40
    df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 2)] = 31
    df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 3)] = 26
    df.AgeFill[df.Age.isnull() & (df.Title == 'Master')] = 3.5
    
    df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 1)] = 41.5
    df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 2)] = 32
    df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 3)] = 31
    df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 1)] = 30
    df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 2)] = 24
    df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 3)] = 18
    df.AgeFill[df.AgeFill.isnull() & (df.Sex == 'female')] = 30
    return df

In [263]:
def extract_pclass(df):
    pclass_new = pd.get_dummies(df.Pclass)
    pclass_new.rename(columns=lambda x: 'pclass' + "_" + str(x), inplace=True)
    df = df.join(pclass_new)
    return df

In [264]:
def extract_parch(df):
    dm = pd.get_dummies(df.Parch.apply(lambda p: min(p, 5)))
    dm.rename(columns=lambda x: 'parch' + "_" + str(x), inplace=True)
    df = df.join(dm)
    return df

In [265]:
def extract_sibsp(df):
    dm = pd.get_dummies(df.SibSp.apply(lambda s: min(s, 5)))
    dm.rename(columns=lambda x: 'sibsp' + "_" + str(x), inplace=True)
    df = df.join(dm)
    return df

In [266]:
def convert_sex(df):
    df['male'] = df.Sex.apply(lambda s: 0 if s == 'male' else 1)
    df['female'] = df.Sex.apply(lambda s: 1 if s == 'male' else 0)
    return df

In [267]:
def extract_embarked(df):
    dm = pd.get_dummies(df.Embarked)
    dm.rename(columns=lambda x: 'emb' + "_" + str(x), inplace=True)
    df = df.join(dm)
    return df

In [281]:
def calc_inner_classifier(X, y):
    X = X[[
            'title_Master', 'title_Mrs', 'title_Miss', 'title_Master',
            'AgeFill',
            'pclass_1', 'pclass_2', 'pclass_3',
            'male', 'female',
            'FareFill',
            'emb_C'
            ]]
    clf = LogisticRegression()
    cross_val(X, y, 20, clf=clf)
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9, random_state=13)
    print('Num of Training Samples: {}'.format(len(X_train)))
    print('Num of Validation Samples: {}'.format(len(X_val)))

    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_val_pred = clf.predict(X_val)
    print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
    print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))
    
    return clf

In [282]:
#calc_new_estimated_feature(X_train, df_train.Survived)

In [283]:
def extract_feature(df):
    df = extract_title(df)
    df = fill_age(df)
    df = extract_pclass(df)
    df = extract_sibsp(df)
    df = extract_parch(df)
    df = convert_sex(df)
    df = fill_fare(df)
    df = extract_embarked(df)
    cols = df.columns
    drop_cols = set(cols).intersection(set(['PassengerId', 'Title', 'Name', 'SibSp', 'Ticket', 'Fare', 'Pclass', 'Survived', 'Parch', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked', 'CCabin']))
    return df.drop(drop_cols, axis=1)

In [284]:
def cross_val(X, y, K, random_state=0, clf=None, ):
    cv = KFold(len(y), K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=cv)
    #print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))
    return scores

In [287]:
def prepare_x(df, y=None, lclf=None):
    X_train = extract_feature(df)
    if y is not None:
        lclf = calc_inner_classifier(X_train, y)
    X_train['est'] = lclf.predict_proba(X_train[[
            'title_Master', 'title_Mrs', 'title_Miss', 'title_Master',
            'AgeFill',
            'pclass_1', 'pclass_2', 'pclass_3',
            'male', 'female',
            'FareFill',
            'emb_C'
            ]])[:,1]
    return X_train, y, lclf

In [288]:
X_train, y_train, lclf = prepare_x(df_train, df_train.Survived)

Mean Score: 0.814 (+/-0.071)
Num of Training Samples: 801
Num of Validation Samples: 90
Accuracy on Training Set: 0.814
Accuracy on Validation Set: 0.844


In [290]:
def grid_search_random_forest(df):
    X_train = extract_feature(df)
    y_train = df['Survived']
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.83, random_state=19)

    survived_weight = .75
    y_weights = np.array([survived_weight if s == 1 else 1 for s in y_train])
    sqrtfeat = int(np.sqrt(X_train.shape[1]))
    test_parameters = {
        'n_estimators': [100, 1000, 5000],
        'max_depth': [5, 6, 7, 8],
        'min_samples_leaf': [1, 2, 3],
        'max_features': np.rint(np.linspace(sqrtfeat, sqrtfeat+2, 3)).astype(int),
        'min_samples_split': np.rint(np.linspace(X_train.shape[0]*.01, X_train.shape[0]*.05, 3)).astype(int)
    }
    
    clf = GridSearchCV(
        RandomForestClassifier(oob_score=True, n_estimators=10000),
        test_parameters,
        cv=25,
        scoring='accuracy',
        n_jobs=-1,
        fit_params={'sample_weight': y_weights}
    )
    clf.fit(X_train, y_train)
    print clf.best_estimator_
    
    print"\n+ トレーニングデータでCVした時の平均スコア:\n"
    for params, mean_score, all_scores in clf.grid_scores_:
        print "{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params)

    print "\n+ テストデータでの識別結果:\n"
    y_true, y_pred = y_val, clf.predict(X_val)
    print classification_report(y_true, y_pred)
    return clf

In [291]:
clf = grid_search_random_forest(df_train)
print "============================================"
#grid_search_svc(df_train)
print "============================================"

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=7, max_features=6,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=7, n_estimators=1000, n_jobs=1,
            oob_score=True, random_state=None, verbose=0)

+ トレーニングデータでCVした時の平均スコア:

0.815 (+/- 0.035) for {'max_features': 5, 'min_samples_split': 7, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}
0.812 (+/- 0.036) for {'max_features': 5, 'min_samples_split': 7, 'n_estimators': 1000, 'max_depth': 5, 'min_samples_leaf': 1}
0.812 (+/- 0.035) for {'max_features': 5, 'min_samples_split': 7, 'n_estimators': 5000, 'max_depth': 5, 'min_samples_leaf': 1}
0.809 (+/- 0.033) for {'max_features': 5, 'min_samples_split': 22, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}
0.811 (+/- 0.035) for {'max_features': 5, 'min_samples_split': 22, 'n_estimators': 1000, 'max_depth': 5, 'min_samples_leaf': 1}
0.811 (+/- 0.035) for {

In [193]:
print "Logistic Regression"
cross_val(X_train, y_train, 20, clf=LogisticRegression(penalty='l2', tol=0.01))

print "Random Forest"
cross_val(X_train, y_train, 20, clf=RandomForestClassifier())
cross_val(X_train, y_train, 20, clf=RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features=5,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=37, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0))

print "SVN (grid best)"
cross_val(X_train, y_train, 20, clf=SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False))
print "SVN (L2)"
cross_val(X_train, y_train, 20, clf=LinearSVC(penalty='l2'))
print "SVN (L1)"
cross_val(X_train, y_train, 20, clf=LinearSVC(penalty='l1', dual=False))
print "SVN"
cross_val(X_train, y_train, 20, clf=SVC())
print "Decision Tree"
cross_val(X_train, y_train, 20, clf=DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=2))

Logistic Regression
Mean Score: 0.827 (+/-0.068)
Random Forest
Mean Score: 0.805 (+/-0.088)
Mean Score: 0.841 (+/-0.092)
SVN (grid best)
Mean Score: 0.828 (+/-0.059)
SVN (L2)
Mean Score: 0.801 (+/-0.172)
SVN (L1)
Mean Score: 0.825 (+/-0.073)
SVN
Mean Score: 0.808 (+/-0.089)
Decision Tree
Mean Score: 0.813 (+/-0.097)


array([ 0.86666667,  0.8       ,  0.82222222,  0.86666667,  0.86666667,
        0.82222222,  0.8       ,  0.8       ,  0.86666667,  0.8       ,
        0.91111111,  0.75      ,  0.81818182,  0.68181818,  0.77272727,
        0.81818182,  0.77272727,  0.81818182,  0.79545455,  0.81818182])

In [250]:
def calc_classifier(X_train, y_train, clf=None):
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=29)
    print('Num of Training Samples: {}'.format(len(X_train)))
    print('Num of Validation Samples: {}'.format(len(X_val)))
    survived_weight = .7
    y_weights = np.array([survived_weight if s == 1 else 1 for s in y_train])
    
    clf.fit(X_train, y_train, sample_weight=y_weights)
    y_train_pred = clf.predict(X_train)
    y_val_pred = clf.predict(X_val)
    print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
    print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))
    return clf

In [251]:
_clf = RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features=5,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=37, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0)
clf = calc_classifier(X_train, y_train, clf=_clf)

Num of Training Samples: 801
Num of Validation Samples: 90
Accuracy on Training Set: 0.853
Accuracy on Validation Set: 0.833


In [241]:
X_train.head()

Unnamed: 0,title_Master,title_Miss,title_Mr,title_Mrs,AgeFill,pclass_1,pclass_2,pclass_3,sibsp_0,sibsp_1,...,parch_2,parch_3,parch_4,male,female,FareFill,emb_C,emb_Q,emb_S,est
0,0,0,1,0,22,0,0,1,0,1,...,0,0,0,0,1,1.981001,0,0,1,0.099526
1,0,0,0,1,38,1,0,0,0,1,...,0,0,0,1,0,4.266662,1,0,0,0.937111
2,0,1,0,0,26,0,0,1,1,0,...,0,0,0,1,0,2.070022,0,0,1,0.565235
3,0,0,0,1,35,1,0,0,0,1,...,0,0,0,1,0,3.972177,0,0,1,0.946941
4,0,0,1,0,35,0,0,1,1,0,...,0,0,0,0,1,2.085672,0,0,1,0.071466


In [244]:
Y, _, _ = prepare_x(df_test, None, lclf)
df_test['Survived'] = clf.predict(Y)
submit_data = df_test[['PassengerId', 'Survived']]

In [246]:
Y.head()

Unnamed: 0,title_Master,title_Miss,title_Mr,title_Mrs,AgeFill,pclass_1,pclass_2,pclass_3,sibsp_0,sibsp_1,...,parch_2,parch_3,parch_4,male,female,FareFill,emb_C,emb_Q,emb_S,est
0,0,0,1,0,34.5,0,0,1,1,0,...,0,0,0,0,1,2.05786,0,1,0,0.07297
1,0,0,0,1,47.0,0,0,1,0,1,...,0,0,0,1,0,1.94591,0,0,1,0.592339
2,0,0,1,0,62.0,0,1,0,1,0,...,0,0,0,0,1,2.270836,0,1,0,0.110694
3,0,0,1,0,27.0,0,0,1,1,0,...,0,0,0,0,1,2.159003,0,0,1,0.083857
4,0,0,0,1,22.0,0,0,1,0,1,...,0,0,0,1,0,2.508582,0,0,1,0.689069


In [247]:
submit_data.to_csv('./submit_20150316_rf_03.csv', index=False)

In [32]:
!open .