In [1]:
# 人臉識別

In [2]:
import os
import pathlib
import glob   
import cv2

In [3]:
from sklearn.decomposition import PCA

In [4]:
import pandas as pd
import numpy as np

In [5]:
import itertools
import re

In [6]:
# 設定專案位置
os.getcwd()
os.chdir("D:/PKU/python/hw/faceDB/" )

In [7]:
# 讀取資料

In [8]:
# 列出database資料夾下的所有檔案
files = os.listdir("database/") 
print(files[:5])
print(files[100:])

['1', '10', '100', '101', '102']
['87', '88', '89', '9', '90', '91', '92', '93-', '94', '95', '96', '97', '98', '99', 'readme.txt']


In [9]:
# 儲存串列中的串列形式
# 注意串列的順序與實際順序並不相同
face = []
for i in range(len(files)-1): # 不需要存儲最後一筆資料，即 readme.txt
    face.append([])
    for img in glob.glob("database/" + files[i] + "/*.jpg"):
        n = cv2.imread(img, 0) ## 圖片默認為rgb型式，設定為儲存成灰階
        face[i].append(n)

In [10]:
y = files[0:114]
for i in range(0, 114):
    y[i] = int(re.sub("\D", "", y[i]))
print(y[:5])
print(y[100:])

[1, 10, 100, 101, 102]
[87, 88, 89, 9, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [11]:
# 檢查資料

In [12]:
# 確認串列長度與欲讀取資料夾個數相同       
len(face)

114

In [13]:
# 確認資料夾中人臉圖片數不為7的個數是否正確
count = 0
for i in range(len(files)-1):
    if len(face[i]) != 7:
        count += 1    
print(count) # 正確

4


In [14]:
# 開啟圖片

In [15]:
# 單張圖片維度
face[0][1].shape

(480, 640)

In [16]:
cv2.imshow('image', face[0][1]); cv2.waitKey(); cv2.destroyAllWindows()

In [17]:
# 訓練模型

In [18]:
# 取前五張照片為訓練集
train = [item[0:5] for item in face]
test = [item[5:len(item)] for item in face]
y_train = list(itertools.chain.from_iterable(itertools.repeat(i, 5) for i in y))
rep = []
for i in range(len(files)-1):  
    if len(face[i]) == 7:
        rep.append(2)
    else:
        rep.append(len(face[i])-5)
y_test = list(y[i] for i in range(len(y)) for j in range(rep[i]))

In [19]:
print(len(train)) # 共有114位受試者
print(len(train[0])) # 第一位受試者的圖片張數
print(train[0][1].shape) # 人臉圖片的格式
print(len(test))
print(len(test[2]))
print(len(test[23]))  # file 44+

114
5
(480, 640)
114
2
4


In [20]:
# 降維

In [27]:
def DR(list_of_imgs, n):
    data = []
    for i in range(len(list_of_imgs)):
        r, c = list_of_imgs[i].shape # 取出圖像的列與行
        temp_mat = list_of_imgs[i].reshape(r*c)  # 轉換為一行
        temp_mat = temp_mat -  np.mean(temp_mat) # 中心化
        data.append(temp_mat) # 疊
    pca = PCA(n_components = n)
    newdata = pca.fit_transform(data)
    return(newdata)

由於pca的p無法大於n，故即使設定了主成分個數，仍舊最多只能提出5個，因此先將list of list重新整理。

In [22]:
DR(train[0], 30) # wrong
# for i in range(len(train)):
#    train_dr.append(DR(train[i]))

array([[ 1.39870788e+04,  9.07522081e+03, -1.33736389e+03,
         1.08922670e+03,  1.42794683e-10],
       [ 4.31045650e+03, -9.06760311e+03, -3.85746655e+03,
        -5.75611133e+03,  1.42794683e-10],
       [ 1.30663256e+03, -8.20434983e+03,  4.47508038e+03,
         6.25614845e+03,  1.42794683e-10],
       [-7.48331477e+03,  4.79158016e+03,  8.23927523e+03,
        -3.85147279e+03,  1.42794683e-10],
       [-1.21208531e+04,  3.40515196e+03, -7.51952517e+03,
         2.26220897e+03,  1.42794683e-10]])

In [28]:
train2 = [val for sublist in train for val in sublist]
test2 = [val for sublist in test for val in sublist]   

train_dr = []    
train_dr = DR(train2, 100) 
train_dr.shape
test_dr = []    
test_dr = DR(test2, 100)
train_dr.shape

(570, 100)

In [29]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [30]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4, 0.01, 0.001, ], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(SVC(), tuned_parameters)
clf.fit(train_dr, y_train)
#  列印最佳參數
print("Best parameters set found on development set: ", clf.best_params_)
print(clf.best_params_)

y_true, y_pred = y_test, clf.predict(test_dr)
print("ACC : ", accuracy_score(y_true, y_pred))

Best parameters set found on development set:  {'C': 1, 'kernel': 'linear'}
{'C': 1, 'kernel': 'linear'}
ACC :  0.0


In [None]:
## 重新讀取資料

In [31]:
x = []
y = []
for i in range(len(files)-1):
    count = 0
    for img in glob.glob("database/" + files[i] + "/*.jpg"):
        n = cv2.imread(img, 0)    
        r, c = n.shape
        n = n.reshape(r*c)  #.astype("float32")
        n = n - np.mean(n)      
        x.append(n)
        count += 1
    y.append([files[i]] * count)
x = np.array(x)
x.shape
y  = [val for sublist in y for val in sublist]
import re
for i in range(len(y)):
    y[i] = int(re.sub("\D", "", y[i]))

pca = PCA(n_components = 100)
newdata = pca.fit_transform(x)
newdata.shape

(797, 100)

In [32]:
from sklearn.model_selection import KFold

In [33]:
def SVM(X, y, k):
    kf = KFold(n_splits = k, shuffle = True)
    precision_average = 0.0
    tuned_parameters = [{'kernel': ['rbf'], 
                         'gamma': [0.001, 0.01, 0.05 , 0.1],
                         'C': [1, 2, 3, 5, 10, 100]},
                        {'kernel': ['linear'], 'C': [1, 2, 3, 5, 10, 100]}]
    
    clf = GridSearchCV(SVC(), tuned_parameters)
    X = np.array(X)
    y = np.array(y)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = clf.fit(X_train, y_train)
        test_pred = clf.predict(X_test)
        # 計算平均準確率
        precision = 0
        for i in range(0, len(y_test)):
            if (y_test[i] == test_pred[i]):
                precision = precision + 1
        precision_average = precision_average + float(precision)/len(y_test)
    precision_average = precision_average / k    
    return precision_average

In [34]:
SVM(newdata, y, 10)

0.3575949367088608