## 2015-03-11

In [1]:
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
def plot_confusion_matrix(cm):
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title('Confusion Matrix')
    fig.colorbar(im)

    target_names = ['not survived', 'survived']

    tick_marks = np.arange(len(target_names))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(target_names, rotation=45)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(target_names)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    fig.tight_layout()

In [3]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

In [4]:
df_train.drop('PassengerId', axis=1, inplace=True)
df_train.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C


In [5]:
def _extract_title(name):
    if name.find('Mr.') > 0:
        return 'Mr'
    elif name.find('Mrs.') > 0:
        return 'Mrs'
    elif name.find('Master.') > 0:
        return 'Master'
    elif name.find('Miss.') > 0:
        return 'Miss'
    else:
        return None
    
def extract_title(df):
    df['Title'] = df.Name.apply(lambda n: _extract_title(n))
    title_bin = pd.get_dummies(df.Title)
    title_bin.rename(columns=lambda x: 'title' + "_" + str(x), inplace=True)
    df = df.join(title_bin)
    return df

In [6]:
import math

In [7]:
def _fill_fare(row):
    fare = row[0]
    sibsp = row[1]
    parch = row[2]
    pclass = row[3]
    family = max(1, sibsp + parch)
    
    if fare is None or fare == 0 or math.isnan(fare):
        if pclass == 1:
            fare = 86
        elif pclass == 2:
            fare = 21
        else:
            fare = 10
    #else:
    #    fare = fare/float(family)
    return np.log(fare)
    
    
def fill_fare(df):
    df['FareFill'] = df[['Fare', 'SibSp', 'Parch', 'Pclass']].apply(_fill_fare, axis=1)
    return df

In [8]:
def fill_age(df):
    df['AgeFill'] = df.Age
    df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 1)] = 40
    df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 2)] = 31
    df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 3)] = 26
    df.AgeFill[df.Age.isnull() & (df.Title == 'Master')] = 3.5
    
    df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 1)] = 41.5
    df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 2)] = 32
    df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 3)] = 31
    df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 1)] = 30
    df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 2)] = 24
    df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 3)] = 18
    df.AgeFill[df.AgeFill.isnull() & (df.Sex == 'female')] = 30
    return df

In [9]:
def extract_pclass(df):
    pclass_new = pd.get_dummies(df.Pclass)
    pclass_new.rename(columns=lambda x: 'pclass' + "_" + str(x), inplace=True)
    df = df.join(pclass_new)
    return df

In [10]:
def extract_parch(df):
    dm = pd.get_dummies(df.Parch.apply(lambda p: min(p, 4)))
    dm.rename(columns=lambda x: 'parch' + "_" + str(x), inplace=True)
    df = df.join(dm)
    return df

In [11]:
def extract_sibsp(df):
    dm = pd.get_dummies(df.SibSp.apply(lambda s: min(s, 4)))
    dm.rename(columns=lambda x: 'sibsp' + "_" + str(x), inplace=True)
    df = df.join(dm)
    return df

In [12]:
def convert_sex(df):
    df['male'] = df.Sex.apply(lambda s: 0 if s == 'male' else 1)
    df['female'] = df.Sex.apply(lambda s: 1 if s == 'male' else 0)
    return df

In [13]:
def extract_feature(df):
    df = extract_title(df)
    df = fill_age(df)
    df = extract_pclass(df)
    df = extract_sibsp(df)
    df = extract_parch(df)
    df = convert_sex(df)
    df = fill_fare(df)
    cols = df.columns
    drop_cols = set(cols).intersection(set(['PassengerId', 'Title', 'Name', 'SibSp', 'Ticket', 'Fare', 'Pclass', 'Survived', 'Parch', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked', 'CCabin']))
    return df.drop(drop_cols, axis=1)

In [14]:
def get_classifier():
    clf = LogisticRegression(C=100, penalty='l2', tol=0.01)
    #clf = RandomForestClassifier()
    #clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=2)
    return clf

In [111]:
def calc_classifier(df, clf=None):
    X_train = extract_feature(df)
    y_train = df['Survived']
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.99, random_state=19)
    print('Num of Training Samples: {}'.format(len(X_train)))
    print('Num of Validation Samples: {}'.format(len(X_val)))
    
    if clf is None:
        clf = get_classifier()
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_val_pred = clf.predict(X_val)
    print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
    print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))
    cm = confusion_matrix(y_val, y_val_pred)
    return clf

In [112]:
def cross_val(X, y, K, random_state=0, clf=None, ):
    if clf is None:
        clf = get_classifier()
    cv = KFold(len(y), K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=cv)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))
    return scores

In [113]:
X_train = extract_feature(df_train)
y_train = df_train.Survived

In [114]:
print "Logistic Regression"
cross_val(X_train, y_train, 5, clf=LogisticRegression(penalty='l2', tol=0.01))
cross_val(X_train, y_train, 5, clf=LogisticRegression(C=1, penalty='l2', tol=0.01))
cross_val(X_train, y_train, 5, clf=LogisticRegression(C=0.5, penalty='l2', tol=0.01))
cross_val(X_train, y_train, 5, clf=LogisticRegression(C=0.1, penalty='l2', tol=0.01))
cross_val(X_train, y_train, 5, clf=LogisticRegression(C=0.01, penalty='l2', tol=0.01))
cross_val(X_train, y_train, 5, clf=LogisticRegression(C=0.4, penalty='l2', tol=0.01))
cross_val(X_train, y_train, 5, clf=LogisticRegression(C=0.3, penalty='l2', tol=0.01))
cross_val(X_train, y_train, 5, clf=LogisticRegression(C=0.2, penalty='l2', tol=0.01))
print "Linear Regression"
cross_val(X_train, y_train, 5, clf=LinearRegression())
print "Random Forest"
cross_val(X_train, y_train, 5, clf=RandomForestClassifier())
print "SVN (L1 regression)"
cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l1', dual=False))
print "SVN (L2 regression and L1 loss)"
cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l2', loss='l1'))
print "SVN (L2)"
cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l2'))
print "SVN"
cross_val(X_train, y_train, 5, clf=LinearSVC())
print "Decision Tree"
cross_val(X_train, y_train, 5, clf=DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=2))

Logistic Regression
('Scores:', array([ 0.79329609,  0.80898876,  0.80898876,  0.83146067,  0.83707865]))
Mean Score: 0.816 (+/-0.032)
('Scores:', array([ 0.79329609,  0.80898876,  0.80898876,  0.83146067,  0.83707865]))
Mean Score: 0.816 (+/-0.032)
('Scores:', array([ 0.80446927,  0.80898876,  0.82022472,  0.83707865,  0.83146067]))
Mean Score: 0.820 (+/-0.025)
('Scores:', array([ 0.78212291,  0.78089888,  0.79213483,  0.8258427 ,  0.83707865]))
Mean Score: 0.804 (+/-0.047)
('Scores:', array([ 0.7877095 ,  0.76966292,  0.81460674,  0.80898876,  0.79775281]))
Mean Score: 0.796 (+/-0.032)
('Scores:', array([ 0.80446927,  0.80337079,  0.81460674,  0.83707865,  0.83146067]))
Mean Score: 0.818 (+/-0.028)
('Scores:', array([ 0.79329609,  0.79775281,  0.82022472,  0.84269663,  0.83707865]))
Mean Score: 0.818 (+/-0.040)
('Scores:', array([ 0.7877095 ,  0.79213483,  0.81460674,  0.8258427 ,  0.83707865]))
Mean Score: 0.811 (+/-0.038)
Linear Regression
('Scores:', array([ 0.44825484,  0.3810767

array([ 0.7877095 ,  0.80898876,  0.80898876,  0.80337079,  0.8258427 ])

In [115]:
clf = calc_classifier(df_train, clf=LogisticRegression(C=0.5, penalty='l2', tol=0.01))

Num of Training Samples: 882
Num of Validation Samples: 9
Accuracy on Training Set: 0.823
Accuracy on Validation Set: 0.778


In [116]:
X_train.head()

Unnamed: 0,title_Master,title_Miss,title_Mr,title_Mrs,AgeFill,pclass_1,pclass_2,pclass_3,sibsp_0,sibsp_1,...,sibsp_3,sibsp_4,parch_0,parch_1,parch_2,parch_3,parch_4,male,female,FareFill
0,0,0,1,0,22,0,0,1,0,1,...,0,0,1,0,0,0,0,0,1,1.981001
1,0,0,0,1,38,1,0,0,0,1,...,0,0,1,0,0,0,0,1,0,4.266662
2,0,1,0,0,26,0,0,1,1,0,...,0,0,1,0,0,0,0,1,0,2.070022
3,0,0,0,1,35,1,0,0,0,1,...,0,0,1,0,0,0,0,1,0,3.972177
4,0,0,1,0,35,0,0,1,1,0,...,0,0,1,0,0,0,0,0,1,2.085672


In [117]:
Y = extract_feature(df_test)
df_test['Survived'] = clf.predict(Y)
submit_data = df_test[['PassengerId', 'Survived']]

In [118]:
Y.head()

Unnamed: 0,title_Master,title_Miss,title_Mr,title_Mrs,AgeFill,pclass_1,pclass_2,pclass_3,sibsp_0,sibsp_1,...,sibsp_3,sibsp_4,parch_0,parch_1,parch_2,parch_3,parch_4,male,female,FareFill
0,0,0,1,0,34.5,0,0,1,1,0,...,0,0,1,0,0,0,0,0,1,2.05786
1,0,0,0,1,47.0,0,0,1,0,1,...,0,0,1,0,0,0,0,1,0,1.94591
2,0,0,1,0,62.0,0,1,0,1,0,...,0,0,1,0,0,0,0,0,1,2.270836
3,0,0,1,0,27.0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,1,2.159003
4,0,0,0,1,22.0,0,0,1,0,1,...,0,0,0,1,0,0,0,1,0,2.508582


In [119]:
submit_data.to_csv('./submit_20150312_10_logistic.csv', index=False)