# Homework 2
此部分作业要求使用PCA技术对给定的人脸数据集进行处理
本次使用的人脸数据集是 ORL 人脸数据集，共包含 40 个不同人的 400 张图像。数据集下有 40 个目录，每个目录对应一个不同的人，其中包含 10 张图像。所有图像均以 PGM 格式存储，为灰度图，宽度为 92 像素，高度为 112 像素。

# 用到的包和辅助函数
我的代码包含两个 .py 文件：part2 和 utils。其中 part2 为主框架代码，utils 存放所用到的辅助函数。

In [3]:
# utils所用包
import os
import numpy as np
import skimage.io
from Homework1.part1 import linalg
from Homework1.part1 import imageManip
import matplotlib.pyplot as plt

# part2所用包
from Homework1.part1 import linalg
from Homework1.part1 import imageManip

import os
import numpy as np
from skimage import io
import utils
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# utils中所有的函数

In [None]:
def collect_images(root_path):
    """
    Recursively load every image found under ``root_path``.

    Entries are visited in sorted name order so that the result (and any
    train/test split derived from it) is reproducible; ``os.listdir`` on its
    own returns entries in arbitrary order.

    :param root_path: root directory of the data set
    :return: a list with one item per directory entry: the flattened result
        of ``skimage.io.imread(..., as_gray=True)`` for a file, or a nested
        list built the same way for anything that is not a regular file
    """
    images = []
    for entry in sorted(os.listdir(root_path)):
        # Build the path with os.path.join instead of a hard-coded '\\' so
        # the loader is not tied to Windows path separators.
        entry_path = os.path.join(root_path, entry)
        if os.path.isfile(entry_path):
            img = skimage.io.imread(entry_path, as_gray=True)
            # Flatten so that each image becomes a single feature vector.
            images.append(img.flatten())
        else:
            # Not a regular file: treat it as a directory and recurse,
            # keeping one nested list per directory.
            images.append(collect_images(entry_path))
    return images


def cal_eigenvalues_and_eigenvectors(data_set, dimension):
    """
    Compute eigenvalues and eigenvectors for every image in the data set.

    :param data_set: face data set, a sequence of per-person image collections
    :param dimension: number of eigenvalues / eigenvectors requested per image
    :return: nested list mirroring ``data_set`` in which every image is
        replaced by ``[eigenvalues, eigenvectors]``
    """

    def _eigen_pair(image):
        # Delegate to the shared linear-algebra helper and repackage the
        # two results as a list.
        values, vectors = linalg.get_eigen_values_and_vectors(image, dimension)
        return [values, vectors]

    return [
        [_eigen_pair(image) for image in person_images]
        for person_images in data_set
    ]


def covert_3d_to_2d(data_set):
    """
    Collapse the first two axes of a three-dimensional data set into one.

    :param data_set: array whose ``shape`` has exactly three entries
    :return: the same data reshaped to ``(dim0 * dim1, dim2)``
    """
    group_count, per_group, feature_len = data_set.shape
    flat_shape = (group_count * per_group, feature_len)
    return data_set.reshape(flat_shape)


def compare_original_and_reconstructed_image(original_data_set, reconstructed_data_set, img_num, title,
                                             img_shape=(112, 92)):
    """
    Show original and reconstructed images side by side for visual comparison.

    One row is drawn per image: the original in the left column and the
    reconstruction in the right column.

    :param original_data_set: original data set, indexable by image number;
        each item is reshaped to ``img_shape``
    :param reconstructed_data_set: reconstructed data set, indexed the same way
    :param img_num: number of images (rows) to display
    :param title: overall figure title
    :param img_shape: (height, width) each flattened image is reshaped to;
        defaults to (112, 92)
    :return: None; the figure is displayed with ``plt.show()``
    """
    # squeeze=False keeps `axes` two-dimensional even when img_num == 1;
    # otherwise subplots returns a 1-D array and axes[index, 0] would fail.
    fig, axes = plt.subplots(img_num, 2, figsize=(10, 10), squeeze=False)
    for index in range(img_num):
        original_img = original_data_set[index].reshape(img_shape)
        reconstructed_img = reconstructed_data_set[index].reshape(img_shape)

        # Left column: original image
        axes[index, 0].imshow(original_img, cmap='gray')
        axes[index, 0].set_title("Original")

        # Right column: reconstructed image
        axes[index, 1].imshow(reconstructed_img, cmap='gray')
        axes[index, 1].set_title("Reconstructed")
    # Tidy the spacing between subplots
    plt.tight_layout()
    # Leave room at the top of the figure for the overall title
    fig.subplots_adjust(top=0.9)
    fig.suptitle(title)
    plt.show()


# 步骤1
将数据集划分为80%的训练集和20%的测试集。在训练集上使用PCA将特征维度降为100，即得到100个特征值及其对应的特征向量，并使用训练得到的PCA将测试集的维度同样压缩到100。输出：压缩后的训练集维度和测试集维度，以及经过PCA得到的特征向量维度。

In [None]:
# Root directory of the data set
data_path = r'E:\大三下科目\计算机视觉\实验\实验1\Homework1\part2\data'
# Load the data set; the loader returns one nested list per sub-directory
images_divided_by_people = utils.collect_images(data_path)

# Split every person's images into a training part and a test part
training_rate = 0.8
train_set = []
test_set = []
for cur_people_images in images_divided_by_people:
    train_length = int(len(cur_people_images) * training_rate)
    train_set.append(cur_people_images[:train_length])
    test_set.append(cur_people_images[train_length:])

train_set = np.array(train_set)
test_set = np.array(test_set)

# The data is grouped per person, so both arrays have three axes here.
# Unpacking the shapes also fails fast if either array is not 3-D.
train_people_index, train_img_num, train_img_size = train_set.shape
test_people_index, test_img_num, test_img_size = test_set.shape
# Merge the first two axes so that each row is one flattened image
train_set = utils.covert_3d_to_2d(train_set)
test_set = utils.covert_3d_to_2d(test_set)
print("train data set dimensions: ", train_set.shape)
print("test data set dimensions: ", test_set.shape)

# Fit PCA on the training set only, then project both sets with it
pca = PCA(n_components=100)
train_set_compressed = pca.fit_transform(train_set)
test_set_compressed = pca.transform(test_set)
print("train set compressed shape: ", train_set_compressed.shape)
print("test set compressed shape: ", test_set_compressed.shape)
# The task also asks for the dimensions of the eigenvectors found by PCA
print("eigenvectors shape: ", pca.components_.shape)

# 步骤2
