In [2]:
import numpy as np
from numpy.random import RandomState
from glob import glob
from PIL import Image
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

STANDARD_SIZE = (100,120) #Chose a relatively small one to speed up the process

#To trasform the file it into a numpy array of RGB pixels
def img_to_array(filename): 
    img = Image.open(filename)
    img = img.resize(STANDARD_SIZE)
    img = list(img.getdata())
    img = map(list, img)
    img = np.array(img)
    s = img.shape[0] * img.shape[1]
    img_wide = img.reshape(1, s)
    return img_wide[0]

man_files = glob('Sample/Man/*.JPEG')
woman_files = glob('Sample/Woman/*.JPEG')

process_file = img_to_array
raw_data = [(process_file(filename),'woman',filename) for filename in woman_files] + \
           [(process_file(filename),'man',filename) for filename in man_files]

# pull out the features and the labels
data = np.array([cd for (cd,_y,f) in raw_data])
gender = np.array([_y for (cd,_y,f) in raw_data])
labels = [1 if label == 'man' else 0 for label in gender]

In [3]:
print data.shape

(600, 36000)


**1.Build a simple linear classifier using the original pixel data. What is your error rate on the training data? What is your error rate on your testing data?**

In [69]:
#logreg
train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.2, random_state = 123)
logreg = LogisticRegression()
logreg.fit(train_x,train_y)

predictions_train = logreg.predict(train_x)
print metrics.classification_report(test_y,predictions)

predictions_test = logreg.predict(test_x)
print metrics.classification_report(test_y,predictions_test)

print logreg.score(test_x,predictions_test)

             precision    recall  f1-score   support

          0       0.65      0.65      0.65        60
          1       0.65      0.65      0.65        60

avg / total       0.65      0.65      0.65       120

             precision    recall  f1-score   support

          0       0.60      0.63      0.62        60
          1       0.61      0.58      0.60        60

avg / total       0.61      0.61      0.61       120

1.0


**2.Train the same linear model as in question 1, but now on the reduced representation that you created using PCA. What is your error rate on the training data? What is your error rate on your testing data?**

In [71]:
#PCA
N_COMPONENTS = 10
pca = PCA(n_components=N_COMPONENTS, random_state=0)
X_pca = pca.fit_transform(data)

pca_train_x, pca_test_x, pca_train_y, pca_test_y = train_test_split(X_pca, labels, test_size=0.2, random_state = 234)
logreg_pca = LogisticRegression()
logreg_pca.fit(pca_train_x,pca_train_y)

predictions_pca_train = logreg_pca.predict(pca_train_x)
print metrics.classification_report(pca_train_y, predictions_pca_train)

predictions_pca_test = logreg_pca.predict(pca_test_x)
print metrics.classification_report(pca_test_y, predictions_pca_test)

print logreg_pca.score(pca_test_x,predictions_pca_test)

             precision    recall  f1-score   support

          0       0.66      0.67      0.67       242
          1       0.66      0.65      0.65       238

avg / total       0.66      0.66      0.66       480

             precision    recall  f1-score   support

          0       0.61      0.66      0.63        58
          1       0.66      0.61      0.63        62

avg / total       0.63      0.63      0.63       120

1.0


**3.Train the same linear model as in question 1, but now on the reduced representation that you created using LDA. What is your error rate on the training data? What is your error rate on your testing data?**

In [72]:
#LDA
N = 10
lda = LinearDiscriminantAnalysis(n_components=N)
X_lda = lda.fit_transform(data,labels)

lda_train_x, lda_test_x, lda_train_y, lda_test_y = train_test_split(
    X_lda, labels, test_size=0.2, random_state = 123)

lda.fit(lda_train_x,lda_train_y)

logreg_lda = LogisticRegression()
logreg_lda.fit(lda_train_x,lda_train_y)

predictions_lda_train = logreg_lda.predict(lda_train_x)
print metrics.classification_report(lda_train_y, predictions_lda_train)

predictions_lda_test = logreg_lda.predict(lda_test_x)
print metrics.classification_report(lda_test_y, predictions_lda_test)

print "LDA score",lda.score(lda_test_x,lda_test_y)

             precision    recall  f1-score   support

          0       0.93      0.92      0.93       240
          1       0.92      0.93      0.93       240

avg / total       0.93      0.93      0.93       480

             precision    recall  f1-score   support

          0       0.93      0.92      0.92        60
          1       0.92      0.93      0.93        60

avg / total       0.93      0.93      0.92       120

LDA score 0.916666666667


**4.Write three paragraphs, describing and interpreting your results from questions 1, 2, and 3. Make a recommendation on which classifier you would prefer, and why.**


Overall, the results show that with this dataset specifically, LDA achieve the highest accuracy, PCA and logistic regression attain similar accuracies with PCA being slightly better. Since the original dataset was very large, I randomly chose 300 pictures for both men and women to run the code above. 

PCA is an unsupervised method that reduces dimensions by looking for the variables that have the most variances, which in some cases, are not necessarily the most predictive one. This could explain why logistic regression on data processed by PCA didn't perform significantly better than simply running logistic regression. 

In contrast, LDA is a supervised method and takes the lable of data into account, which makes it more idea in this case since we have the labels available. It yields a nice outcome, with mean accuracy of 91.6% and high precisions for the test dataset.

In choosing classification methods, we need to first see if the class of data is available, as LDA requires that information and PCA doesn't. LDA and PCA can be used on datasets with more than two classes versus Logistical Regression can only work with datasets with 2 classes. I would prefer LDA most of the time, especially if the constraints are met.


*Reference: For some parts of the image processing, I used the code in eigenfashion blogpost for class: https://github.com/joelgrus/shirts/blob/master/visuals.py*