In [1]:
#importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# loading data using pandas
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# looking for data types of train data
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
# checking for null values
train_data.apply(lambda x: sum(x.isnull()))

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
full_data = train_data.append(test_data,sort=False)

# Fetaure Selection

In [6]:
# dropping the features that do not play any role in prediction
drop_columns = ["Name", "Age", "SibSp", "Ticket", "Cabin", "Parch", "Embarked"]
full_data.drop(labels=drop_columns, axis=1, inplace=True)

In [7]:
#converting the Sex column into dummy variable
full_data = pd.get_dummies(full_data, columns=["Sex"])
full_data.fillna(value=0.0, inplace=True)

In [8]:
full_data.apply(lambda x: sum(x.isnull()))

PassengerId    0
Survived       0
Pclass         0
Fare           0
Sex_female     0
Sex_male       0
dtype: int64

In [9]:
# Setting the target variable
df_target = full_data["Survived"]

In [10]:
# dropping the target from the data frame 
df = full_data.drop("Survived", axis=1)

In [11]:
df_target.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

In [12]:
df.dtypes

PassengerId      int64
Pclass           int64
Fare           float64
Sex_female       uint8
Sex_male         uint8
dtype: object

In [13]:

#Splitting the data into train and test
state = 12  
test_size = 0.30  
  
X_train, X_test, y_train, y_test = train_test_split(df, df_target,  
    test_size=0.30, random_state=12)


# Regular Gradient Boosting Classifier 

In [15]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf1 = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf1.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf1.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf1.score(X_test, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.835
Accuracy score (validation): 0.832
Learning rate:  0.075
Accuracy score (training): 0.853
Accuracy score (validation): 0.860
Learning rate:  0.1
Accuracy score (training): 0.847
Accuracy score (validation): 0.835
Learning rate:  0.25
Accuracy score (training): 0.867
Accuracy score (validation): 0.832
Learning rate:  0.5
Accuracy score (training): 0.882
Accuracy score (validation): 0.852
Learning rate:  0.75
Accuracy score (training): 0.893
Accuracy score (validation): 0.824
Learning rate:  1
Accuracy score (training): 0.903
Accuracy score (validation): 0.832


From above,it is clear that a learning rate of 0.5 gives us the best performance on the validation set and good performance on the training set.

Now we can evaluate the classifier by checking its accuracy and creating a confusion matrix. Let's create a new classifier and specify the best learning rate we discovered.

In [16]:

gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[256  26]
 [ 32  79]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.89      0.91      0.90       282
         1.0       0.75      0.71      0.73       111

    accuracy                           0.85       393
   macro avg       0.82      0.81      0.81       393
weighted avg       0.85      0.85      0.85       393



# XGBoost Classifier

In [17]:
# XGBoost actually stands for "eXtreme Gradient Boosting"
from xgboost import XGBClassifier
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [18]:
score = xgb_clf.score(X_test, y_test)
print(score)

0.8727735368956743
