# ML Face Image Classification

## 1. Baseline Model--Boosted Decision Stump

### Import necessary libraries
On Windows, set `os.environ["R_USER"]` to your user name before importing rpy2.

In [53]:
import pandas as pd
import numpy as np
import pyreadr
import import_ipynb
import time

In [54]:
import os

# R_USER is set to the local Windows user name; change the value for your machine.
os.environ.update({"R_USER": "Jiyoung Sim"})

In [55]:
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri

### Step 0: Provide directories for training/testing images. Images and fiducial points will be in different subfolders. 

In [56]:
# Root folder of the "training" inputs; edit this line to switch data sets.
# NOTE(review): this root points at test_set while the label file below is read
# from train_set -- confirm that the mismatch is intentional.
train_dir = './data/test_set/'
# Images and fiducial points live in sibling sub-folders of the root.
train_image_dir, train_pt_dir = (train_dir + leaf for leaf in ('images/', 'points/'))
train_label_path = './data/train_set/label.csv'

In [57]:
# Root folder of the test inputs; edit this line to switch data sets.
test_dir = './data/test_set/'
# Images, fiducial points and the label file all hang off the same root.
test_image_dir, test_pt_dir, test_label_path = (
    test_dir + leaf for leaf in ('images/', 'points/', 'label.csv')
)

### Step 1: set up controls for evaluation experiments.
Set `baseline_tt_split = True` to split the data from a single directory into training and test sets with `train_test_split`. Set it to `False` when the test data live in a separate directory.

In [58]:
# Switches for the baseline experiment.
baseline_feature_train = True  # build features for the training set
baseline_feature_test = True   # build features for the test set
baseline_train = False         # fit the model; when False the training step is skipped
baseline_cv = False            # hyperparameter tuning by GridSearchCV when training
baseline_test = True           # run evaluation on an independent test set
baseline_tt_split = False      # True: split one label file 80/20; False: separate test directory

### Step 2: import data and train-test split if wanted

In [59]:
# The training label table is always read from train_label_path.
info = pd.read_csv(train_label_path)

if baseline_tt_split:
    # Hold out 20% of the rows of the single label file; both index lists
    # refer to rows of `info`, so the test table is the same object.
    from sklearn.model_selection import train_test_split
    train_idx_py, test_idx_py = train_test_split(range(len(info)), test_size=0.2, random_state = 0)
    info_test = info
else:
    # Separate test set: use every row of each label file.
    info_test = pd.read_csv(test_label_path)
    train_idx_py = list(range(len(info)))
    test_idx_py = list(range(len(info_test)))

# The *_r lists are the same positions shifted by one (1-based) for the R side.
train_idx_r = [pos + 1 for pos in train_idx_py]
test_idx_r = [pos + 1 for pos in test_idx_py]

### Step 3: construct features and responses

In [60]:
# Import feature.R
# Runs ./lib/feature.R inside the embedded R session and keeps element [0] of
# the result; that object is called later as feature(points, indices, info).
# NOTE(review): presumably [0] is the value of the last expression in the
# script (the R function it defines) -- confirm against feature.R.
feature = robjects.r(
    '''
    source('./lib/feature.R')
    '''
)[0]

In [61]:
# function to read fiducial points
def readMat(index, pt_dir=None):
    """Load the fiducial points of one image and return them as an R matrix.

    Parameters
    ----------
    index : int
        File number; the file read is ``pt_dir + '{:04n}.mat'.format(index)``.
    pt_dir : str, optional
        Folder (with trailing slash) holding the ``.mat`` files. Defaults to
        ``train_pt_dir`` so existing one-argument calls keep working.

    Returns
    -------
    The rounded coordinate array converted with ``robjects.r.matrix``,
    keeping the row/column counts of the stored array.

    Raises
    ------
    KeyError
        If the file contains neither 'faceCoordinatesUnwarped' nor
        'faceCoordinates2'.
    """
    import scipy.io
    numpy2ri.activate()
    if pt_dir is None:
        pt_dir = train_pt_dir
    # Read the file once, then take whichever of the two known keys is present.
    contents = scipy.io.loadmat(pt_dir + '{:04n}.mat'.format(index))
    try:
        coords = contents['faceCoordinatesUnwarped']
    except KeyError:
        coords = contents['faceCoordinates2']
    mat = np.round(coords)
    nr, nc = mat.shape
    mat_r = robjects.r.matrix(mat, nrow=nr, ncol=nc)
    # The matrix is also bound to the name "mat" in the R session.
    robjects.r.assign("mat", mat_r)
    return mat_r

#load fiducial points
# Files are assumed to be numbered 1..n_files, where n_files is the number of
# entries in the folder.
start = time.time()
n_files = len(os.listdir(train_pt_dir))
fiducial_pt_list = [readMat(index, train_pt_dir) for index in range(1, n_files+1)]
end = time.time()
tm_fid_pt_train = end-start

if baseline_tt_split:
    # Train and test rows come from the same files, so reuse the loaded list.
    fiducial_pt_test = fiducial_pt_list
    tm_fid_pt_test = tm_fid_pt_train
else:
    # Read the test points from the test folder itself; the file count and the
    # files read now come from the same directory.
    start = time.time()
    n_files = len(os.listdir(test_pt_dir))
    fiducial_pt_test = [readMat(index, test_pt_dir) for index in range(1, n_files+1)]
    end = time.time()
    tm_fid_pt_test = end-start

In [62]:
# convert pandas dataframe to R dataframe
from rpy2.robjects import pandas2ri

pandas2ri.activate()
# Both label tables are passed to R functions later, so convert each one once.
info_rdf, info_test_rdf = (pandas2ri.py2ri(frame) for frame in (info, info_test))

In [63]:
# extract features from fiducial points
as_factor = robjects.r('''as.factor''')


def _baseline_features(points, idx_r, info_r):
    """Run feature.R on one data set.

    Returns a 3-tuple: the R data frame (its last column converted with
    as.factor), the pandas copy taken *before* that conversion, and the
    seconds spent inside the R feature call alone.
    """
    t0 = time.time()
    dat_r = feature(points, idx_r, info_r)
    elapsed = time.time() - t0
    # Convert to pandas first; the factor conversion only touches the R object.
    dat_py = pandas2ri.ri2py_dataframe(dat_r)
    dat_r[-1] = as_factor(dat_r[-1])
    return dat_r, dat_py, elapsed


if baseline_feature_train:
    dat_train_r, dat_train_py, _elapsed = _baseline_features(fiducial_pt_list, train_idx_r, info_rdf)
    # Reported time = feature construction + reading the fiducial points.
    tm_feature_train_baseline = _elapsed + tm_fid_pt_train
    # optional cache: dat_train_py.to_csv('dat_train_py.csv', index=False)

if baseline_feature_test:
    dat_test_r, dat_test_py, _elapsed = _baseline_features(fiducial_pt_test, test_idx_r, info_test_rdf)
    tm_feature_test_baseline = _elapsed + tm_fid_pt_test
    # optional cache: dat_test_py.to_csv('dat_test_py.csv', index=False)

  res = PandasDataFrame.from_items(items)


In [64]:
# Preview the first rows of the test label table.
info_test.head()

Unnamed: 0,Index,Baseline,Advanced
0,1,0,0
1,2,0,0
2,3,0,0
3,4,0,0
4,5,0,0


### Step 4: Train a classification model with training features and responses

In [65]:
# File the fitted baseline model is dumped to when training, and that is
# handed to the test routine.  Alternative file: 'baseline_train_alldata.sav'
baseline_dir = 'baseline_train_main.sav'

# train baseline model
if baseline_train:
    import train_baseline
    tm_train_baseline, baseline = train_baseline.gbm_fn(
        dat_train_py.iloc[:, :-1],  # every column except the last
        dat_train_py.iloc[:, -1],   # last column
        baseline_cv,
    )

    from sklearn.externals import joblib
    joblib.dump(baseline, baseline_dir)  # save the model to disk

# test
if baseline_test:
    import test_baseline_2
    start = time.time()
    baseline_preds = test_baseline_2.test_clf(dat_test_py, baseline_dir=baseline_dir)
    end = time.time()
    tm_test_baseline = end - start  # wall-clock seconds spent in test_clf

In [66]:
# Store the baseline predictions in the 'Baseline' column of the test label table.
info_test['Baseline'] = baseline_preds

In [69]:
# Display the test label table after the 'Baseline' column has been filled in.
info_test

Unnamed: 0,Index,Baseline,Advanced
0,1,18,0
1,2,18,0
2,3,4,0
3,4,1,0
4,5,7,0
5,6,4,0
6,7,12,0
7,8,8,0
8,9,5,0
9,10,15,0


In [29]:
# test1 = info_test['Baseline']

### Step 5: Summarize Running Time and Accuracy

In [70]:
# Report the timings collected for the baseline model.
# Not reported here: tm_feature_train_baseline, tm_train_baseline (only set
# when baseline_train is True) and baseline_acc (not computed in the visible cells).
for template, seconds in (
    ('testing feature extraction took: {}', tm_feature_test_baseline),
    ('model testing took: {}', tm_test_baseline),
):
    print(template.format(seconds))

testing feature extraction took: 5.018312931060791
model testing took: 2.250392198562622


## 2. Improved Model--Voting Classifier (Combines Light GBM (dart), Logistic Regression, Linear SVM, and Random Forest)
### Step 1: set up controls for evaluation experiments.
Set `voting_tt_split = True` to split the data from a single directory into training and test sets. Set it to `False` when the test data live in a separate directory.

In [71]:
# Switches for the voting-classifier experiment.
# NOTE(review): voting_feature_train and voting_feature_test are not read by
# any of the visible cells -- confirm whether they are still needed.
voting_feature_train = True  # process features for training set
voting_feature_test = True   # process features for test set
voting_train = False         # fit the model; when False the training step is skipped
voting_cv = False            # hyperparameter tuning by GridSearchCV when training
voting_test = True           # run evaluation on an independent test set
voting_tt_split = False      # True: reuse the train/test row indices; False: separate test directory

### Step 2: import data and train-test split if wanted--identical to Step 2 in previous part
### Step 3: construct features and responses

In [72]:
# import myfeature2.R
# Runs ./lib/myfeature2.R inside the embedded R session and keeps element [0]
# of the result; that object is called later as myfeature2(info, points).
# NOTE(review): presumably [0] is the value of the last expression in the
# script (the R function it defines) -- confirm against myfeature2.R.
myfeature2 = robjects.r(
    '''
    source('./lib/myfeature2.R')
    '''
)[0]

In [73]:
# Display the R data-frame copy of the training label table.
info_rdf

Unnamed..0,Index,identity,emotion_idx,emotion_cat,type
1,1,111,1,'Neutral','simple'
2,2,114,1,'Neutral','simple'
3,3,115,1,'Neutral','simple'
4,4,116,1,'Neutral','simple'
...,...,...,...,...,...
2497,2497,386,22,'Sadly di...,'compound'
2498,2498,391,22,'Sadly di...,'compound'
2499,2499,392,22,'Sadly di...,'compound'
2500,2500,393,22,'Sadly di...,'compound'


In [74]:
# extract features from fiducial points
def _voting_features(info_r, points):
    """Run myfeature2.R on one data set.

    Returns a 3-tuple: the R data frame (its last column converted with
    as.factor), the pandas copy taken before that conversion, and the seconds
    spent on the R call plus both conversions.
    """
    t0 = time.time()
    feat_r = myfeature2(info_r, points)
    feat_py = pandas2ri.ri2py_dataframe(feat_r)
    feat_r[-1] = as_factor(feat_r[-1])
    return feat_r, feat_py, time.time() - t0


myfeature_train_r, myfeature_train_py, _secs = _voting_features(info_rdf, fiducial_pt_list)
# Reported time = feature construction + reading the fiducial points.
tm_feature_train_voting = _secs + tm_fid_pt_train

if voting_tt_split:
    # One feature table serves both sides; slice it by the row positions from Step 2.
    train_df = myfeature_train_py.iloc[train_idx_py].reset_index(drop=True)
    test_df = myfeature_train_py.iloc[test_idx_py].reset_index(drop=True)
    tm_feature_test_voting = tm_feature_train_voting
else:
    train_df = myfeature_train_py

    # NOTE(review): the test features are built with info_rdf (the training
    # label table), whereas the baseline section passes info_test_rdf for its
    # test features -- confirm which table myfeature2.R expects here.
    myfeature_test_r, myfeature_test_py, _secs = _voting_features(info_rdf, fiducial_pt_test)
    tm_feature_test_voting = _secs + tm_fid_pt_test
    test_df = myfeature_test_py

  res = PandasDataFrame.from_items(items)


In [75]:
# Drop the last column of the test features before prediction.
# NOTE(review): presumably this is the label column appended by myfeature2 -- confirm.
# Re-running this cell removes one more column each time it is executed.
test_df = test_df.iloc[:,:-1]
test_df.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature89,feature90,feature91,feature92,feature93,feature94,feature95,feature96,feature97,feature98
0,55.0,118.0,58.0,63.0,3.0,60.0,28.0,19.0,12.0,9.0,...,20.0,86.0,169.0,302.0,106.0,189.0,322.0,83.0,216.0,133.0
1,66.0,124.0,63.0,58.0,3.0,61.0,16.0,1.0,10.0,15.0,...,5.0,74.0,170.0,313.0,69.0,165.0,308.0,96.0,239.0,143.0
2,68.0,139.0,69.0,71.0,1.0,70.0,8.0,7.0,10.0,1.0,...,4.0,129.0,215.0,345.0,125.0,211.0,341.0,86.0,216.0,130.0
3,64.0,131.0,64.0,67.0,0.0,67.0,16.0,16.0,15.0,0.0,...,20.0,79.0,162.0,291.0,99.0,182.0,311.0,83.0,212.0,129.0
4,62.0,130.0,66.0,68.0,4.0,64.0,24.0,18.0,7.0,6.0,...,12.0,69.0,152.0,278.0,81.0,164.0,290.0,83.0,209.0,126.0


### Step 4: Train a classification model with training features and responses

In [76]:
# File the fitted voting model is dumped to when training, and that is handed
# to the test routine.  Alternatives: 'voting_train_alldata.sav', 'voting_train_main.sav'
voting_dir = 'voting_train_main2.sav'

# train improved model
if voting_train:
    import train_voting
    tm_train_voting, voting = train_voting.train_fn(
        train_df.iloc[:, :-1],  # every column except the last
        train_df.iloc[:, -1],   # last column
        voting_cv,
    )

    from sklearn.externals import joblib
    joblib.dump(voting, voting_dir)  # save the model to disk

# test the model
if voting_test:
    import test_voting_2
    start = time.time()
    voting_preds = test_voting_2.test_fn(test_df, voting_dir)
    end = time.time()
    tm_test_voting = end - start  # wall-clock seconds spent in test_fn

In [77]:
# Store the voting-classifier predictions in the 'Advanced' column of the test label table.
info_test['Advanced'] = voting_preds

In [78]:
# Display the test label table after the 'Advanced' column has been filled in.
info_test

Unnamed: 0,Index,Baseline,Advanced
0,1,18,18
1,2,18,18
2,3,4,4
3,4,1,3
4,5,7,7
5,6,4,13
6,7,12,12
7,8,8,8
8,9,5,5
9,10,15,15


In [79]:
# Write the test label table to new_test_labels.csv, leaving out the row index.
info_test.to_csv('new_test_labels.csv', index = False)

### Step 5: Summarize Running Time and Accuracy

In [57]:
# Report the timings collected for the voting model.
# Not reported here: tm_feature_train_voting, tm_train_voting (only set when
# voting_train is True) and voting_acc (not computed in the visible cells).
for template, seconds in (
    ('testing feature extraction took: {}', tm_feature_test_voting),
    ('model testing took: {}', tm_test_voting),
):
    print(template.format(seconds))

testing feature extraction took: 5.673455238342285
model testing took: 6.759052038192749
