In [9]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 1. Loading Data

In [55]:
# loading data
train_data = pd.read_csv("/Users/lgrcyanny/Codecookies/machine-learning-workspace/hands-on-ml-wp/hands-on-ml/jupyter-notebooks/koggle/datasets/titanic/train.csv")
test_data = pd.read_csv("/Users/lgrcyanny/Codecookies/machine-learning-workspace/hands-on-ml-wp/hands-on-ml/jupyter-notebooks/koggle/datasets/titanic/test.csv")

In [170]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [108]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [161]:
X_train = train_data.drop(["PassengerId", "Survived"], axis=1).copy()
y_train = train_data.loc[:, "Survived"]

In [61]:
X_test = test_data.drop(["PassengerId"], axis=1).copy()
# y_test = test_data.loc(["Survived"])

In [13]:
train_data.corr()["Survived"]

PassengerId   -0.005007
Survived       1.000000
Pclass        -0.338481
Age           -0.077221
SibSp         -0.035322
Parch          0.081629
Fare           0.257307
Name: Survived, dtype: float64

# 2. Preprocessing Data

In [None]:
#1. transform text

In [90]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

In [148]:
def process_text(text_data, col_name):
    pipeline = Pipeline([
        ("count_vec", CountVectorizer()),
        ("tf_idf", TfidfTransformer(norm='l2', use_idf=True, smooth_idf=False))])
    encoded = pipeline.fit_transform(text_data)
    encoded = encoded.toarray()
    num_rows = encoded.shape[0]
    encoded_df = pd.DataFrame(encoded, index=range(num_rows))
    encoded_cols = encoded_df.columns
    encoded_df.columns = ["{0}_{1}".format(col_name, col) for col in encoded_cols]
    return encoded_df

In [150]:
name_encoded = process_text(X_train["Name"], "Name")

(891, 1509)

In [152]:
ticket_encoded = process_text(X_train["Ticket"], "Ticket")

In [None]:
#2. transform category

In [78]:
def process_cat(cat_cols, data):
    cat_encoded = []
    for col_name in cat_cols:
        encoded_data = pd.get_dummies(data[col_name])
        encoded_cols = encoded_data.columns
        encoded_data.columns = ["{0}_{1}".format(col_name, col) for col in encoded_cols]
        cat_encoded.append(encoded_data)
    return cat_encoded

In [175]:
cat_cols = ["Sex", "Embarked", "Cabin"]
cat_encoded = process_cat(cat_cols, X_train)
cat_encoded = pd.concat(cat_encoded, axis=1)

In [156]:
original_cat = X_train.loc[:, ["Pclass", "SibSp", "Parch"]]

In [86]:
#3. process number columns

In [134]:
def process_num(data, cols):
    num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])
    num_transformed = num_pipeline.fit_transform(data)
    num_rows = num_transformed.shape[0]
    transformed_df = pd.DataFrame(num_transformed, index=range(num_rows))
    transformed_df.columns = cols
    return transformed_df

In [138]:
num_cols = ["Age", "Fare"]
num_data = train_data.loc[:, num_cols]
transformed_num_df = process_num(num_data, num_cols)

In [157]:
X_processed_train = pd.concat([transformed_num_df, original_cat, name_encoded, ticket_encoded, cat_encoded], axis=1)

# 3. Train and Evaluate data, Select Model

In [211]:
# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [183]:
rf_clf = RandomForestClassifier()
y_pred = cross_val_predict(rf_clf, X_processed_train, y_train, cv=3, method="predict_proba")

In [181]:
accuracies = cross_val_score(rf_clf, X_processed_train, y_train, cv=3, scoring="accuracy")
accuracies

array([ 0.80808081,  0.83838384,  0.81481481])

In [185]:
auc_score = roc_auc_score(y_train, y_pred[:, 1])
auc_score

0.84574558740506389

In [238]:
def train_and_eval(clf, X_train, y_train, method="predict_proba"):
    y_pred = cross_val_predict(clf, X_train, y_train, cv=3, method=method)
    if len(y_pred.shape) > 1:
        y_pred = y_pred[:, 1]
    accuracies = cross_val_score(clf, X_train, y_train, cv=3, scoring="accuracy")
    print("accuracies")
    print accuracies
    mean_accuracy = np.mean(accuracies)
    print("mean accuracy", mean_accuracy)
    try:
        auc_score = roc_auc_score(y_train, y_pred)
        print("auc", auc_score)
        f1 = f1_score(y_pred, y_train, average="macro")
        print("f1", f1)
        precision = precision_score(y_pred, y_train)
        print("precision", precision)
        recall = recall_score(y_pred, y_train)
        print("recall", recall)
    except Exception as e:
        pass

In [191]:
rf_clf = RandomForestClassifier()
train_and_eval(rf_clf, X_processed_train, y_train)

accuracies
[ 0.8047138   0.83164983  0.83164983]
('mean accuracy', 0.82267115600448937)
('auc', 0.83977513607942145)


In [190]:
lr_clf = LogisticRegression()
train_and_eval(lr_clf, X_processed_train, y_train)

accuracies
[ 0.7979798   0.82491582  0.81818182]
('mean accuracy', 0.8136924803591471)
('auc', 0.8680375802895216)


In [210]:
tree_clf = DecisionTreeClassifier()
train_and_eval(tree_clf, X_processed_train, y_train)

accuracies
[ 0.79461279  0.81818182  0.82491582]
('mean accuracy', 0.8125701459034792)
('auc', 0.79663982360272267)
('f1', 0.79968228539118602)
('precision', 0.72807017543859653)
('recall', 0.77089783281733748)


In [239]:
gbdt_clf = GradientBoostingClassifier(max_depth=6)
train_and_eval(gbdt_clf, X_processed_train, y_train)

accuracies
[ 0.81144781  0.84175084  0.82828283]
('mean accuracy', 0.8271604938271605)
('auc', 0.86547044599963785)


In [209]:
sgd_clf = SGDClassifier()
train_and_eval(sgd_clf, X_processed_train, y_train, method="predict")

accuracies
[ 0.81144781  0.47474747  0.74074074]
('mean accuracy', 0.6756453423120089)
('auc', 0.77010833093663122)
('f1', 0.76311591634172282)
('precision', 0.76608187134502925)
('recall', 0.67875647668393779)


In [213]:
svm_clf = svm.SVC(kernel="polynomial")
train_and_eval(sgd_clf, X_processed_train, y_train, method="predict")

accuracies
[ 0.73400673  0.76430976  0.54882155]
('mean accuracy', 0.68237934904601572)
('auc', 0.74729172658422016)
('f1', 0.74094278245337275)
('precision', 0.73684210526315785)
('recall', 0.65454545454545454)


In [215]:
# the best model is GBDT
gbdt_clf.fit(X_processed_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [244]:
from xgboost.sklearn import XGBClassifier
import xgboost
xgbt_clf = XGBClassifier(max_depth=8)
train_and_eval(xgbt_clf, X_processed_train, y_train, method="predict")

accuracies
[ 0.81818182  0.83501684  0.83838384]
('mean accuracy', 0.83052749719416374)
('auc', 0.8128654970760234)
('f1', 0.81774081023049772)
('precision', 0.73684210526315785)
('recall', 0.805111821086262)


# 4. Do prediction

## 4.1 process test data

In [220]:
def preprocess_data(X_train):
    name_encoded = process_text(X_train["Name"], "Name")
    ticket_encoded = process_text(X_train["Ticket"], "Ticket")
    cat_cols = ["Sex", "Embarked", "Cabin"]
    cat_encoded = process_cat(cat_cols, X_train)
    cat_encoded = pd.concat(cat_encoded, axis=1)
    original_cat = X_train.loc[:, ["Pclass", "SibSp", "Parch"]]
    num_cols = ["Age", "Fare"]
    num_data = train_data.loc[:, num_cols]
    transformed_num_df = process_num(num_data, num_cols)
    X_processed_train = pd.concat([transformed_num_df, original_cat, name_encoded, ticket_encoded, cat_encoded], axis=1)
    return X_processed_train
X_processed_test = preprocess_data(X_test)
X_processed_test.shape

(891, 1285)

## 4.2 do prediction

In [223]:
# y_pred = gbdt_clf.predict(X_processed_test)