# Bài tập MLP

## Sử dụng MLPClassifier để phân nhóm cho tập Titanic dataset
- Huỳnh Minh Trí
- Lớp Cao học HTTT-TDMU
- dataset: 'Titanic_train.csv', 'Titanic_test.csv'


In [186]:
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

%matplotlib inline
# Suppress warnings
warnings.filterwarnings("ignore")


In [187]:
# Load the Titanic_train dataset
train_raw = pd.read_csv('Titanic_train.csv')
test_raw = pd.read_csv('Titanic_test.csv')
test_ids = test_raw['PassengerId'].values

# Kết hợp dữ liệu train và test.
train_raw['train'] = 1
test_raw['train'] = 0
data = train_raw.append(test_raw, sort=False)

## 1) Khám Phá dữ liệu

In [188]:
data.shape

(1309, 13)

In [189]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


## 2)Tiền Xử Lý

In [190]:
features = ['Age', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp']
target = 'Survived'

data = data[features + [target] + ['train']]
# Chuyển các cột qua giá trị số
data['Sex'] = data['Sex'].replace(["female", "male"], [0, 1])
data['Embarked'] = data['Embarked'].replace(['S', 'C', 'Q'], [1, 2, 3])
data['Age'] = pd.qcut(data['Age'].rank(method='first'), 10, labels=False)

In [191]:
data.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,train
0,2.0,1.0,7.25,0,3,1,1,0.0,1
1,7.0,2.0,71.2833,0,1,0,1,1.0,1
2,4.0,1.0,7.925,0,3,0,0,1.0,1
3,6.0,1.0,53.1,0,1,0,1,1.0,1
4,6.0,1.0,8.05,0,3,1,0,0.0,1


In [192]:
# Chia data thành tập train và test
train = data.query('train == 1')
test = data.query('train == 0')

# Bỏ các giá trị bị thiếu ra khỏi tập train
train.dropna(axis=0, inplace=True) ### X
labels = train[target].values ### Y

In [193]:
train.shape

(712, 9)

In [194]:
train.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,train
0,2.0,1.0,7.25,0,3,1,1,0.0,1
1,7.0,2.0,71.2833,0,1,0,1,1.0,1
2,4.0,1.0,7.925,0,3,0,0,1.0,1
3,6.0,1.0,53.1,0,1,0,1,1.0,1
4,6.0,1.0,8.05,0,3,1,0,0.0,1


In [195]:
# Split the data into 70% training data and 30% test data
X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.3, random_state=0)

In [196]:
X_train[:5]

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,train
202,6.0,1.0,6.4958,0,3,1,0,0.0,1
439,5.0,1.0,10.5,0,2,1,0,0.0,1
102,2.0,1.0,77.2875,1,1,1,0,0.0,1
118,3.0,2.0,247.5208,1,1,1,0,0.0,1
625,9.0,1.0,32.3208,0,1,1,0,0.0,1


## 3) Chuẩn bị dữ liệu

In [197]:
sc = StandardScaler()
sc.fit(X_train)

StandardScaler()

In [198]:
# Apply the scaler to the X training data
X_train_std = sc.transform(X_train)

# Apply the SAME scaler to the X test data
X_test_std = sc.transform(X_test)

In [199]:
X_train_std[:5]

array([[ 0.58024708, -0.51287764, -0.50843012, -0.50118147,  0.88208282,
         0.73931309, -0.54307331, -0.81581361,  0.        ],
       [ 0.23209883, -0.51287764, -0.43584459, -0.50118147, -0.32472279,
         0.73931309, -0.54307331, -0.81581361,  0.        ],
       [-0.81234592, -0.51287764,  0.77483576,  0.64899536, -1.5315284 ,
         0.73931309, -0.54307331, -0.81581361,  0.        ],
       [-0.46419767,  1.42206983,  3.86071427,  0.64899536, -1.5315284 ,
         0.73931309, -0.54307331, -0.81581361,  0.        ],
       [ 1.62469184, -0.51287764, -0.04029132, -0.50118147, -1.5315284 ,
         0.73931309, -0.54307331, -0.81581361,  0.        ]])

In [200]:
X_test_std[:5]

array([[-0.11604942, -0.51287764, -0.36514792,  0.64899536,  0.88208282,
        -1.35260691,  0.49712096, -0.81581361,  0.        ],
       [ 0.23209883, -0.51287764, -0.39052621, -0.50118147, -0.32472279,
         0.73931309, -0.54307331, -0.81581361,  0.        ],
       [-1.50864242, -0.51287764,  2.12101805,  1.79917218, -1.5315284 ,
         0.73931309,  0.49712096,  1.22577019,  0.        ],
       [ 0.58024708,  1.42206983, -0.39279213, -0.50118147, -0.32472279,
         0.73931309, -0.54307331, -0.81581361,  0.        ],
       [ 1.27654358, -0.51287764, -0.49475847, -0.50118147,  0.88208282,
         0.73931309, -0.54307331, -0.81581361,  0.        ]])

## 4) Xây dựng model Perceptron

In [201]:
# Create a perceptron object 
perceptron = Perceptron()

In [202]:
# Train the perceptron
perceptron.fit(X_train_std, y_train)

Perceptron()

In [203]:
# Apply the trained perceptron on the X data to make predicts for the y test data
y_pred = perceptron.predict(X_test_std)

In [177]:
score = perceptron.score(X_train, y_train)
print("Accuracy score (train): %.2f" % score)

Accuracy score (train): 0.83


In [178]:
# View the accuracy of the model, which is: 1 - (observations predicted wrong / total observations)
print('Accuracy score (test): %.2f' % accuracy_score(y_test, y_pred))

Accuracy score (test): 1.00


## 5) Xây dựng model MLPClassifier

In [179]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(8,5 ), activation='tanh', solver='sgd', 
                    alpha=0.0000001, batch_size=4, learning_rate='constant', learning_rate_init=0.005, 
                    power_t=0.5, max_iter=500, shuffle=True, random_state=11, tol=0.00001, 
                    verbose=True, warm_start=False, momentum=0.8, nesterovs_momentum=True, 
                    early_stopping=False, validation_fraction=0.2, 
                    beta_1=0.9, beta_2=0.999, epsilon=1e-08)

In [180]:
# Train the perceptron
clf.fit(X_train_std, y_train)

Iteration 1, loss = 0.35359073
Iteration 2, loss = 0.10952470
Iteration 3, loss = 0.05548211
Iteration 4, loss = 0.03542505
Iteration 5, loss = 0.02547519
Iteration 6, loss = 0.01966151
Iteration 7, loss = 0.01590180
Iteration 8, loss = 0.01328377
Iteration 9, loss = 0.01137288
Iteration 10, loss = 0.00991996
Iteration 11, loss = 0.00878333
Iteration 12, loss = 0.00786857
Iteration 13, loss = 0.00711669
Iteration 14, loss = 0.00649078
Iteration 15, loss = 0.00596318
Iteration 16, loss = 0.00551071
Iteration 17, loss = 0.00511935
Iteration 18, loss = 0.00477610
Iteration 19, loss = 0.00447619
Iteration 20, loss = 0.00421056
Iteration 21, loss = 0.00397316
Iteration 22, loss = 0.00376009
Iteration 23, loss = 0.00356847
Iteration 24, loss = 0.00339443
Iteration 25, loss = 0.00323599
Iteration 26, loss = 0.00309150
Iteration 27, loss = 0.00295853
Iteration 28, loss = 0.00283646
Iteration 29, loss = 0.00272370
Iteration 30, loss = 0.00261927
Iteration 31, loss = 0.00252241
Iteration 32, los

MLPClassifier(activation='tanh', alpha=1e-07, batch_size=4,
              hidden_layer_sizes=(8, 5), learning_rate_init=0.005, max_iter=500,
              momentum=0.8, random_state=11, solver='sgd', tol=1e-05,
              validation_fraction=0.2, verbose=True)

In [181]:
clf.classes_ 

array([0., 1.])

In [182]:
clf.n_layers_ 

4

In [183]:
clf.n_outputs_ 

1

In [184]:
clf.out_activation_ 

'logistic'

In [185]:
# Evaluate acuracy on test data
score = clf.score(X_test_std,y_test)
print("Acuracy (on test set) = ", score)

Acuracy (on test set) =  1.0
