# ADABoost 

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df = df[['age', 'pclass', 'sex', 'survived']]
df.dropna(inplace=True)
df.shape

(714, 4)

In [5]:
X = df[['pclass', 'sex', 'age']]
y = df['survived']

In [6]:
from sklearn.preprocessing import LabelBinarizer
le = LabelBinarizer()
X['sex'] = le.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
X.head()

Unnamed: 0,pclass,sex,age
0,3,1,22.0
1,1,0,38.0
2,3,0,26.0
3,1,0,35.0
4,3,1,35.0


In [8]:
X.describe()

Unnamed: 0,pclass,sex,age
count,714.0,714.0,714.0
mean,2.236695,0.634454,29.699118
std,0.83825,0.481921,14.526497
min,1.0,0.0,0.42
25%,1.0,0.0,20.125
50%,2.0,1.0,28.0
75%,3.0,1.0,38.0
max,3.0,1.0,80.0


In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 3 columns):
pclass    714 non-null int64
sex       714 non-null int64
age       714 non-null float64
dtypes: float64(1), int64(2)
memory usage: 42.3 KB


## Fit model

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [12]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
#     print accuracy score, classification report, confusion metrics
    if train:
#         training performance
        print('Train Result : \n')
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report : \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy : {0:.4f}\n'.format(np.mean(res)))
        print('Accuracy SD : {0:.4f}\n'.format(np.std(res)))
        
    elif train == False:
#         test performance
        print('Test Result : \n')
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report : \n {}\n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))
        

## AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier

In [14]:
ada_clf = AdaBoostClassifier()

In [15]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [16]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = True)

Train Result : 

Accuracy Score 0.8319

Classification Report : 
              precision    recall  f1-score   support

          0       0.88      0.83      0.85       337
          1       0.77      0.84      0.80       234

avg / total       0.84      0.83      0.83       571
 

Confusion Metrics : 
 [[279  58]
 [ 38 196]] 

Average Accuracy : 0.8073

Accuracy SD : 0.0346



In [17]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = False)

Test Result : 

Accuracy Score 0.7063

Classification Report : 
              precision    recall  f1-score   support

          0       0.79      0.70      0.74        87
          1       0.61      0.71      0.66        56

avg / total       0.72      0.71      0.71       143


Confusion Metrics : 
 [[61 26]
 [16 40]] 




## Adaboost with Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
ada_clf = AdaBoostClassifier(RandomForestClassifier())

In [20]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [21]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = True)

Train Result : 

Accuracy Score 0.9072

Classification Report : 
              precision    recall  f1-score   support

          0       0.90      0.95      0.92       337
          1       0.92      0.85      0.88       234

avg / total       0.91      0.91      0.91       571
 

Confusion Metrics : 
 [[319  18]
 [ 35 199]] 

Average Accuracy : 0.8073

Accuracy SD : 0.0587



In [22]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = False)

Test Result : 

Accuracy Score 0.7622

Classification Report : 
              precision    recall  f1-score   support

          0       0.81      0.79      0.80        87
          1       0.69      0.71      0.70        56

avg / total       0.76      0.76      0.76       143


Confusion Metrics : 
 [[69 18]
 [16 40]] 



## Gradient Boosting Machine

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

In [24]:
gb_clf = GradientBoostingClassifier()

In [25]:
gb_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [26]:
print_score(gb_clf, X_train, X_test, y_train, y_test, train = True)

Train Result : 

Accuracy Score 0.8722

Classification Report : 
              precision    recall  f1-score   support

          0       0.88      0.91      0.89       337
          1       0.86      0.82      0.84       234

avg / total       0.87      0.87      0.87       571
 

Confusion Metrics : 
 [[306  31]
 [ 42 192]] 

Average Accuracy : 0.8303

Accuracy SD : 0.0559



In [27]:
print_score(gb_clf, X_train, X_test, y_train, y_test, train = False)

Test Result : 

Accuracy Score 0.7483

Classification Report : 
              precision    recall  f1-score   support

          0       0.80      0.78      0.79        87
          1       0.67      0.70      0.68        56

avg / total       0.75      0.75      0.75       143


Confusion Metrics : 
 [[68 19]
 [17 39]] 

