In [4]:
import os
import pickle

import cv2
import numpy as np
import pandas as pd
from google.colab import drive
from google.colab.patches import cv2_imshow
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [7]:
# Connect Google Drive
# Mount the Drive under /content/drive, then list the project folder; the
# listing is the value this cell displays.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/test_openCV/'
os.listdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['image_100.jpg',
 'image_120.jpg',
 'image_140.jpg',
 'image_160.jpg',
 'image_180.jpg',
 'image_200.jpg',
 'image_220.jpg',
 'image_240.jpg',
 'image_260.jpg',
 'image_300.jpg',
 'image_280.jpg',
 'image_320.jpg',
 'image_360.jpg',
 'image_340.jpg',
 'image_380.jpg',
 'image_400.jpg',
 'image_420.jpg',
 'image_440.jpg',
 'image_460.jpg',
 'image_480.jpg',
 'image_500.jpg',
 'image_540.jpg',
 'image_520.jpg',
 'image_560.jpg',
 'image_580.jpg',
 'image_600.jpg',
 'image_620.jpg',
 'image_640.jpg',
 'image_660.jpg',
 'image_680.jpg',
 'image_720.jpg',
 'image_700.jpg',
 'image_740.jpg',
 'image_760.jpg',
 'image_780.jpg',
 'image_800.jpg',
 'image_840.jpg',
 'image_820.jpg',
 'image_860.jpg',
 'image_880.jpg',
 'image_900.jpg',
 'image_920.jpg',
 'image_940.jpg',
 'image_960.jpg',
 'image_980.jpg',
 'image_1000.jpg',
 'image_1020.jpg',
 'image_1040.jpg',
 'image_1080.jpg',
 'image_1060.jpg',
 'image_1100.jpg',
 'image_1140.jpg',
 'image_1120.jpg',
 'image_1160.jpg',
 'image_1200.jpg',


In [8]:
# 1. Data

In [9]:
# Load data from pickle file
# The handle is opened in a `with` block so it is closed as soon as the
# payload has been read instead of lingering until garbage collection.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if its origin is trusted.
with open("/content/drive/MyDrive/test_openCV/data_face_features.pickle", mode="rb") as feature_file:
  data = pickle.load(feature_file)

In [10]:
# Pull the feature vectors and their labels out of the loaded dictionary as
# NumPy arrays, then echo both for a quick visual check.
# NOTE(review): the printed labels appear to contain person identifiers —
# consider clearing this output before sharing the notebook.
x, y = (np.array(data[field]) for field in ('data', 'label'))

print(x, y, sep='\n')

[[[ 0.0196759   0.16772693  0.1007451  ... -0.03204957  0.11536875
    0.07170223]]

 [[ 0.00048526  0.16548792  0.09117572 ... -0.03187338  0.11723722
    0.09059   ]]

 [[ 0.04667978  0.16546842  0.00638199 ... -0.0629909   0.1310076
    0.09049864]]

 ...

 [[ 0.07341399  0.06518976 -0.10309623 ... -0.0834958   0.08147261
    0.0745829 ]]

 [[ 0.07607125  0.05514786 -0.07174882 ... -0.02091191  0.10496895
    0.12454553]]

 [[ 0.15304564 -0.07706067 -0.08126479 ... -0.11379657  0.00884555
    0.00933749]]]
['1721031618_TruongThanhLuan' '1721031618_TruongThanhLuan'
 '1721031618_TruongThanhLuan' ... '162000309_LeHoangQuan'
 '162000309_LeHoangQuan' '162000309_LeHoangQuan']


In [11]:
# Shape of the feature array as loaded, before it is flattened to 2-D.
x.shape

(4507, 1, 128)

In [12]:
# Shape of the label array; its length should match the first axis of x.
y.shape

(4507,)

In [13]:
# Collapse the singleton middle axis so every sample is one flat row of
# 128 values, the 2-D layout scikit-learn estimators expect.
x = x.reshape(-1, 128)

In [14]:
# Confirm the features are now two-dimensional.
x.shape

(4507, 128)

In [15]:
# Split the data: 90% for training, 10% held out for testing, with a fixed
# seed so the partition is repeatable.
# NOTE(review): the split is not stratified by label — confirm that is
# intended given how far macro-F1 trails accuracy further down.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=1)

In [16]:
# Shapes of the four split arrays, in the order x_train, x_test, y_train, y_test.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((4056, 128), (451, 128), (4056,), (451,))

In [17]:
# 2. Train machine learning

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [19]:
def get_report(model, x_train, y_train, x_test, y_test):
  """Print accuracy and macro-averaged F1 for a fitted model on both splits.

  Args:
    model: fitted estimator exposing ``predict``.
    x_train: training features passed to ``model.predict``.
    y_train: true labels for ``x_train``.
    x_test: held-out features passed to ``model.predict``.
    y_test: true labels for ``x_test``.

  Returns:
    None. Four lines are written to stdout: train accuracy, test accuracy,
    train F1 and test F1, each formatted to two decimals.
  """
  # Predict once per split and reuse the predictions for both metrics.
  train_pred, test_pred = (model.predict(features) for features in (x_train, x_test))

  # Rows are listed in the order they are printed.
  rows = (
      ('Accurency Train = %0.2f', accuracy_score(y_train, train_pred)),
      ('Accurency Test = %0.2f', accuracy_score(y_test, test_pred)),
      ('F1 Score Train = %0.2f', f1_score(y_train, train_pred, average="macro")),
      ('F1 Score Test = %0.2f', f1_score(y_test, test_pred, average="macro")),
  )
  for template, value in rows:
    print(template % value)

In [20]:
# Logistic regression
# Baseline linear classifier using scikit-learn's default hyper-parameters.
model_logistic = LogisticRegression()
model_logistic.fit(x_train, y_train) # training

In [21]:
# Train/test accuracy and macro-F1 for the logistic-regression model.
get_report(model_logistic, x_train, y_train, x_test, y_test)

Accurency Train = 0.72
Accurency Test = 0.70
F1 Score Train = 0.51
F1 Score Test = 0.52


In [22]:
# Support Vector Machines
# probability=True makes the fitted SVC expose class-probability estimates
# (predict_proba) in addition to hard predictions.
model_svc = SVC(probability=True)
model_svc.fit(x_train, y_train)

In [23]:
# Train/test accuracy and macro-F1 for the SVM model.
get_report(model_svc, x_train, y_train, x_test, y_test)

Accurency Train = 0.91
Accurency Test = 0.87
F1 Score Train = 0.82
F1 Score Test = 0.80


In [24]:
# Random forest
# 20 trees. random_state is pinned (same seed as the train/test split) so the
# forest's internal randomness is repeatable and a Restart & Run All
# reproduces the same scores.
model_rf = RandomForestClassifier(n_estimators=20, random_state=1)
model_rf.fit(x_train, y_train)

In [25]:
# Train/test accuracy and macro-F1 for the random-forest model.
get_report(model_rf, x_train, y_train, x_test, y_test)

Accurency Train = 1.00
Accurency Test = 0.77
F1 Score Train = 1.00
F1 Score Test = 0.66


In [26]:
# Voting Classifier
# Soft-voting ensemble of the three model families used above; the weights
# apply in estimator order (logistic=2, svm=3, rf=1).
# Soft voting combines predicted class probabilities, which is why the SVC
# needs probability=True. The SVC's probability calibration and the forest
# are both randomised, so their seeds are pinned to keep the ensemble — and
# any search built on top of it — reproducible.
model_voting = VotingClassifier(estimators=[
    ('logistic', LogisticRegression()),
    ('svm', SVC(probability=True, random_state=1)),
    ('rf', RandomForestClassifier(n_estimators=20, random_state=1)),
], voting='soft', weights=[2,3,1])

In [27]:
# Fit the voting ensemble on the training split.
model_voting.fit(x_train, y_train)

In [28]:
# Train/test accuracy and macro-F1 for the voting ensemble.
get_report(model_voting, x_train, y_train, x_test, y_test)

Accurency Train = 0.97
Accurency Test = 0.85
F1 Score Train = 0.96
F1 Score Test = 0.80


In [29]:
# 3. Parameter Tuning
# Exhaustive grid search over the voting ensemble: SVM C and gamma, random
# forest size and depth, and the voting mode. `<name>__<param>` keys address
# the estimator registered under <name> inside the VotingClassifier.
# 4 * 3 * 3 * 3 * 2 = 216 candidates, each scored by 3-fold cross-validated
# accuracy (648 fits in total).
# GridSearchCV is imported in the notebook's top import cell.
model_grid = GridSearchCV(model_voting, param_grid={
    'svm__C': [3,5,7,10],
    'svm__gamma': [0.1, 0.3, 0.5],
    'rf__n_estimators': [5, 10,20],
    'rf__max_depth': [3, 5, 7],
    'voting': ['soft', 'hard']
}, scoring='accuracy', cv=3, n_jobs=1, verbose=2)

In [30]:

# Run the grid search on the training split (long-running: 648 fits, one at a time).
model_grid.fit(x_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits




[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=soft; total time=   8.3s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=soft; total time=   9.2s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=soft; total time=   7.0s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=hard; total time=   9.2s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=hard; total time=   6.2s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.1, voting=hard; total time=   9.2s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.3, voting=soft; total time=   6.3s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.3, voting=soft; total time=   8.5s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gamma=0.3, voting=soft; total time=   6.1s
[CV] END rf__max_depth=3, rf__n_estimators=5, svm__C=3, svm__gam

In [31]:
# Keep the ensemble configured with the best-scoring parameter combination.
model_best_estimator = model_grid.best_estimator_

In [32]:
# Mean cross-validated accuracy achieved by the best candidate.
model_grid.best_score_

0.8456607495069033

In [33]:
# 4. Save Model
# Serialise the tuned ensemble to Google Drive. Writing inside a `with` block
# guarantees the file is flushed and closed when the cell finishes; an
# unclosed handle can leave a truncated or empty model file behind.
with open("/content/drive/MyDrive/test_openCV/ml_face_person_identity.pkl", mode='wb') as model_file:
  pickle.dump(model_best_estimator, model_file)