In [6]:
#putting matlab data into pandas DF
from scipy.io import loadmat
import pandas as pd
import numpy as np
from skimage.io import imread, imsave
from sklearn import svm, metrics
from skimage.transform import resize
import shutil
import os
from matplotlib import pyplot as plt
import math
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

#Reads mat file and returns a dataframe with matlab IMDb data
def readmat(mat_file, key='imdb'):
    """Load a MATLAB metadata struct from a .mat file into a DataFrame.

    The struct stored under ``key`` is unpacked field by field. Only fields
    holding exactly as many elements as the ``dob`` field are kept, so every
    kept field supplies one value per record.

    Args:
        mat_file: Path of the .mat file, handed to ``scipy.io.loadmat``.
        key: Name of the struct variable inside the file. Defaults to
            ``'imdb'``, the only name the original code supported.

    Returns:
        A pandas DataFrame with one column per kept field and one row per
        record. All columns come from a single stacked array, so they share
        that array's dtype.

    Raises:
        KeyError: If ``key`` is not a variable in the file, or the struct
            has no ``dob`` field.
    """
    mdata = loadmat(mat_file)[key]
    # Each field is unwrapped with [0, 0]
    # (assumes loadmat returns the struct as a 1x1 array -- TODO confirm).
    ndata = {n: mdata[n][0, 0] for n in mdata.dtype.names}

    # 'dob' defines the record count; fields of any other size are dropped.
    n_records = ndata['dob'].size
    columns = [n for n, v in ndata.items() if v.size == n_records]

    # Stack the kept fields as rows, then turn every row into a column so the
    # result is laid out as (records, fields).
    stacked = np.concatenate([ndata[c] for c in columns], axis=0)
    table = np.column_stack(stacked)
    return pd.DataFrame(table, columns=columns)

#Crops and saves images according to the face location in the mat file.
def face_crop(df):
    """Crop every photo to its face box and save the crop under the same name.

    Photos are read from ``<cwd>/1000_Photos`` and the crops are written to
    ``<cwd>/Cropped_1000``, which is created if it does not exist yet.

    Args:
        df: DataFrame with a ``full_path`` column (array-wrapped file path)
            and a ``face_location`` column whose first element holds the
            box as ``[x0, y0, x1, y1]``.
    """
    photo_wd = os.getcwd() + '/1000_Photos'  # location of photos
    new_wd = os.getcwd() + '/Cropped_1000'  # folder for new photos
    # imsave cannot write into a folder that does not exist.
    os.makedirs(new_wd, exist_ok=True)

    for _, row in df.iterrows():
        # Read the path straight from the cell instead of parsing NumPy's
        # printed repr of it. The first two characters are dropped, leaving
        # '/<file name>' (NOTE(review): presumably a two-character
        # sub-folder prefix -- confirm against the dataset layout).
        path = str(np.ravel(row['full_path'])[0])[2:]
        img = imread(photo_wd + path)

        x0, y0, x1, y1 = row['face_location'][0].astype(int)[:4]
        # The first image axis is indexed by the y range, the second by x.
        face_image = img[y0:y1, x0:x1]
        imsave(new_wd + path, face_image)
               
#Zips photos in a folder
def zip_photos(zipfile_name='', folder=None):
    """Pack a folder into a zip archive.

    Args:
        zipfile_name: Archive path without the ``.zip`` extension. Defaults
            to the empty name the original placeholder used.
        folder: Directory whose contents are archived. Defaults to the
            current working directory.
    """
    if folder is None:
        folder = os.getcwd()  # WD of files
    shutil.make_archive(zipfile_name, 'zip', folder)
    
#Unpacks an archive
def unpack(archive, extract_dir=''):
    """Extract an archive and report the outcome on stdout.

    Args:
        archive: Path of the archive file.
        extract_dir: Target directory handed to ``shutil.unpack_archive``.
            Defaults to ``''``, the value the original code always used.

    Raises:
        shutil.ReadError: If the archive exists but cannot be unpacked
            (unknown or corrupt format).
    """
    try:
        shutil.unpack_archive(archive, extract_dir=extract_dir)
    except FileNotFoundError:
        print('File or folder not found')
    except shutil.ReadError:
        # A missing .zip is reported as ReadError rather than
        # FileNotFoundError; only a genuinely unreadable archive is re-raised.
        if os.path.exists(archive):
            raise
        print('File or folder not found')
    else:
        print('Archive extracted')
    
#Crops, resizes and appends images, gender to list
def prep_images(df):
    """Build flattened 100x100 grayscale face crops and their gender labels.

    Each photo is read as grayscale from ``<cwd>/1000_Photos``, cropped to
    its face box, resized to 100x100 and flattened into one feature vector.
    Progress is printed whenever the row index is a multiple of 100.

    Args:
        df: DataFrame with ``full_path`` (array-wrapped file path),
            ``face_location`` (first element holds ``[x0, y0, x1, y1]``)
            and ``gender`` columns.

    Returns:
        Tuple ``(flat_data, genders)``: a list of flattened face arrays and
        a list of the matching ``gender`` values, both in row order.
    """
    flat_data = []
    genders = []
    cwd = os.getcwd() + '/1000_Photos'

    for index, row in df.iterrows():
        # Read the path straight from the cell instead of parsing NumPy's
        # printed repr of it. The first two characters are dropped, leaving
        # '/<file name>' (NOTE(review): presumably a two-character
        # sub-folder prefix -- confirm against the dataset layout).
        path = str(np.ravel(row['full_path'])[0])[2:]
        img = imread(cwd + path, as_gray=True)

        x0, y0, x1, y1 = row['face_location'][0].astype(int)[:4]
        # The first image axis is indexed by the y range, the second by x.
        face_image = resize(img[y0:y1, x0:x1], (100, 100),
                            anti_aliasing=True, mode='reflect')

        flat_data.append(face_image.flatten())
        genders.append(row['gender'])

        if index % 100 == 0:
            print(index, ' completed')

    return flat_data, genders

def svm_clf(genders, flat_data):
    """Train an SVM gender classifier and report its accuracy.

    The data is split 70/30 with a fixed random state, an ``SVC`` with
    ``gamma=0.001`` and balanced class weights is fitted on the training
    part, and a classification report plus both accuracies are printed.

    Args:
        genders: Sequence of numeric labels, one per sample; NaN marks a
            missing label. The input itself is not modified.
        flat_data: Sequence of equally sized 1-D feature vectors.

    Returns:
        Tuple ``(test_acc, train_acc)`` with the accuracy on the held-out
        and on the training samples.
    """
    flat_data = np.array(flat_data)

    # Coerce to float so the NaN check also works on object-typed input.
    genders = np.array(genders, dtype=float)
    # NOTE(review): samples with a missing label are relabelled 1.0 (the
    # value counted as 'Males' below), not dropped -- confirm this is wanted.
    genders[np.isnan(genders)] = 1.0

    print('Males: ', int(np.count_nonzero(genders == 1.0)))
    print('Females: ', int(np.count_nonzero(genders == 0)))

    x_train, x_test, y_train, y_test = train_test_split(
        flat_data, genders, test_size=0.3, random_state=100)

    svc = svm.SVC(gamma=0.001, class_weight='balanced')
    clf = svc.fit(x_train, y_train)

    y_pred = clf.predict(x_test)

    print("Classification report for - \n{}:\n{}\n".format(
        clf, metrics.classification_report(y_test, y_pred)))

    test_acc = accuracy_score(y_test, y_pred)
    train_acc = clf.score(x_train, y_train)

    print('Testing Accuracy: ', test_acc)
    print('Training Accuracy: ', train_acc)

    return test_acc, train_acc

#mat_file = 'imdb_1000.mat'
#df = readmat(mat_file)
#flat_data, genders = prep_images(df)
# NOTE(review): `genders` and `flat_data` are not assigned by any active code
# above. With the three lines above commented out they presumably come from
# an earlier run in the same session -- confirm before running this cell alone.
test_acc_1000, train_acc_1000 = svm_clf(genders, flat_data)

Males:  721
Females:  279
Classification report for - 
SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

        0.0       0.40      0.67      0.50        76
        1.0       0.86      0.67      0.75       224

avg / total       0.74      0.67      0.69       300


Testing Accuracy:  0.6666666666666666
Training Accuracy:  0.8128571428571428
