In [24]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [25]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

In [26]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [27]:
train_data.shape

(891, 12)

In [28]:
y_train = train_data["Survived"]
train_data.drop(labels="Survived", axis=1, inplace=True)

In [29]:
full_data = train_data.append(test_data)

In [30]:
full_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
full_data.shape

(1309, 11)

In [32]:
drop_columns = ["Name", "Age", "SibSp", "Ticket", "Cabin", "Parch", "Embarked"]
full_data.drop(labels=drop_columns, axis=1, inplace=True)

In [33]:
full_data = pd.get_dummies(full_data, columns=["Sex"])
full_data.fillna(value=0.0, inplace=True)

In [34]:
full_data.head()

Unnamed: 0,PassengerId,Pclass,Fare,Sex_female,Sex_male
0,1,3,7.25,0,1
1,2,1,71.2833,1,0
2,3,3,7.925,1,0
3,4,1,53.1,1,0
4,5,3,8.05,0,1


In [36]:
X_train = full_data.values[0:891]

In [42]:
X_train

array([[  1.    ,   3.    ,   7.25  ,   0.    ,   1.    ],
       [  2.    ,   1.    ,  71.2833,   1.    ,   0.    ],
       [  3.    ,   3.    ,   7.925 ,   1.    ,   0.    ],
       ...,
       [889.    ,   3.    ,  23.45  ,   1.    ,   0.    ],
       [890.    ,   1.    ,  30.    ,   0.    ,   1.    ],
       [891.    ,   3.    ,   7.75  ,   0.    ,   1.    ]])

In [44]:
X_test = full_data.values[891:]
X_test

array([[8.92000e+02, 3.00000e+00, 7.82920e+00, 0.00000e+00, 1.00000e+00],
       [8.93000e+02, 3.00000e+00, 7.00000e+00, 1.00000e+00, 0.00000e+00],
       [8.94000e+02, 2.00000e+00, 9.68750e+00, 0.00000e+00, 1.00000e+00],
       ...,
       [1.30700e+03, 3.00000e+00, 7.25000e+00, 0.00000e+00, 1.00000e+00],
       [1.30800e+03, 3.00000e+00, 8.05000e+00, 0.00000e+00, 1.00000e+00],
       [1.30900e+03, 3.00000e+00, 2.23583e+01, 0.00000e+00, 1.00000e+00]])

In [45]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
X_train

array([[0.        , 1.        , 0.01415106, 0.        , 1.        ],
       [0.0011236 , 0.        , 0.13913574, 1.        , 0.        ],
       [0.00224719, 1.        , 0.01546857, 1.        , 0.        ],
       ...,
       [0.99775281, 1.        , 0.04577135, 1.        , 0.        ],
       [0.9988764 , 0.        , 0.0585561 , 0.        , 1.        ],
       [1.        , 1.        , 0.01512699, 0.        , 1.        ]])

In [47]:
X_test

array([[1.0011236 , 1.        , 0.01528158, 0.        , 1.        ],
       [1.00224719, 1.        , 0.01366309, 1.        , 0.        ],
       [1.00337079, 0.5       , 0.01890874, 0.        , 1.        ],
       ...,
       [1.46741573, 1.        , 0.01415106, 0.        , 1.        ],
       [1.46853933, 1.        , 0.01571255, 0.        , 1.        ],
       [1.46966292, 1.        , 0.0436405 , 0.        , 1.        ]])

In [55]:
state = 12  
test_size = 0.30  
  
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,  
    test_size=test_size, random_state=state)

In [57]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_val, y_val)))

Learning rate:  0.05
Accuracy score (training): 0.826
Accuracy score (validation): 0.702
Learning rate:  0.075
Accuracy score (training): 0.826
Accuracy score (validation): 0.702
Learning rate:  0.1
Accuracy score (training): 0.826
Accuracy score (validation): 0.702
Learning rate:  0.25
Accuracy score (training): 0.852
Accuracy score (validation): 0.710
Learning rate:  0.5
Accuracy score (training): 0.875
Accuracy score (validation): 0.679
Learning rate:  0.75
Accuracy score (training): 0.889
Accuracy score (validation): 0.702
Learning rate:  1
Accuracy score (training): 0.905
Accuracy score (validation): 0.679


In [58]:
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))

print("Classification Report")
print(classification_report(y_val, predictions))

Confusion Matrix:
[[68  7]
 [35 21]]
Classification Report
              precision    recall  f1-score   support

           0       0.66      0.91      0.76        75
           1       0.75      0.38      0.50        56

    accuracy                           0.68       131
   macro avg       0.71      0.64      0.63       131
weighted avg       0.70      0.68      0.65       131

