In [1]:
import graphlab
from graphlab import SFrame
from __future__ import division
import numpy

A newer version of GraphLab Create (v1.9) is available! Your current version is v1.8.5.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.


### Load the SFrames

In [3]:
chinese_sframe = graphlab.load_sframe('data/normalized/asian_sframe_new/');
white_sframe = graphlab.load_sframe('data/normalized/non_asian_sframe_new/');
hispanic_sframe = graphlab.load_sframe('data/normalized/hispanic_sframe_new/');

print("Chinese: "+str(len(chinese_sframe)));
print("White: "+str(len(white_sframe)));
print("Hispanic: "+str(len(hispanic_sframe)));
print(str(len(chinese_sframe)+len(white_sframe)+len(hispanic_sframe)) + " images in total");

Chinese: 188
White: 110
Hispanic: 48
346 images in total


In [3]:
data_sframe = chinese_sframe.append(white_sframe)
data_sframe = data_sframe.append(hispanic_sframe)

In [4]:
chinese_train, chinese_test = chinese_sframe.random_split(0.8, seed=0)
print(len(chinese_train));
print(len(chinese_test));

159
29


In [5]:
white_train, white_test = white_sframe.random_split(0.8, seed=0)
print(len(white_train));
print(len(white_test));

95
15


In [6]:
hispanic_train, hispanic_test = hispanic_sframe.random_split(0.8, seed=0)
print(len(hispanic_train));
print(len(hispanic_test));

41
7


In [7]:
train_sframe = chinese_train.append(white_train)
train_sframe = train_sframe.append(hispanic_train)

test_sframe = chinese_test.append(white_test)
test_sframe = test_sframe.append(hispanic_test)

### Features array

In [8]:
features = ['chin',
 'chin_mouth',
 'eye',
 'eye_inner',
 'eye_outer',
 'eyebrow_inner',
 'eyebrow_outer',
 'mouth',
 'nose',
 'nose_mouth']

# Chinese Classifier

## SVM

In [9]:
svm_chinese = graphlab.svm_classifier.create(train_sframe, features=features, target='chinese', validation_set=None, max_iterations=30)

### Accuracy of this model against the Chinese test set

In [55]:
svm_chinese.evaluate(chinese_test)

{'accuracy': 1.0, 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 1
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        1        |   29  |
 +--------------+-----------------+-------+
 [1 rows x 3 columns], 'f1_score': 1.0, 'precision': 1.0, 'recall': 1.0}

### Accuracy of this model against the White and Hispanic

In [11]:
print("Accuracy against white test set: "+str(svm_chinese.evaluate(white_test)['accuracy']))
print("Accuracy against hispanic test set: "+str(svm_chinese.evaluate(hispanic_test)['accuracy']))

Accuracy against white test set: 0.666666666667
Accuracy against hispanic test set: 0.714285714286


## Logistic Classifier

In [12]:
logistic_chinese = graphlab.logistic_classifier.create(train_sframe, features=features, target='chinese', validation_set=None,
                                                      l1_penalty=0, l2_penalty=0)

### Accuracy of this model against the Chinese test set

In [49]:
logistic_chinese.evaluate(chinese_test)['accuracy']

0.9655172413793104

### Accuracy of this model against the White and Hispanic test set

In [14]:
print("Accuracy against white test set: "+str(logistic_chinese.evaluate(white_test)['accuracy']))
print("Accuracy against hispanic test set: "+str(logistic_chinese.evaluate(hispanic_test)['accuracy']))

Accuracy against white test set: 0.866666666667
Accuracy against hispanic test set: 0.857142857143


This is a better model than the one using SVM.

## Neural Net Classifier

In [38]:
neural_chinese = graphlab.neuralnet_classifier.create(train_sframe, features=features, target='chinese', validation_set=None)

Using network:

### network layers ###
layer[0]: FullConnectionLayer
  init_sigma = 0.01
  init_random = gaussian
  init_bias = 0
  num_hidden_units = 10
layer[1]: SigmoidLayer
layer[2]: FullConnectionLayer
  init_sigma = 0.01
  init_random = gaussian
  init_bias = 0
  num_hidden_units = 2
layer[3]: SoftmaxLayer
### end network layers ###

### network parameters ###
learning_rate = 0.001
momentum = 0.9
### end network parameters ###



### Accuracy of this model against the Chinese test set

In [52]:
neural_chinese.evaluate(chinese_test)['accuracy']

1.0

### Accuracy of this model against the White and Hispanic test set

In [40]:
print("Accuracy against white test set: "+str(neural_chinese.evaluate(white_test)['accuracy']))
print("Accuracy against hispanic test set: "+str(neural_chinese.evaluate(hispanic_test)['accuracy']))

Accuracy against white test set: 0.0
Accuracy against hispanic test set: 0.0


This model simply predicts all the examples as Chinese. Neural nets require tens of thousands of data to train them to a good degree of accuracy.

# Hispanic Classifier

## SVM Classifier

In [24]:
svm_hispanic = graphlab.svm_classifier.create(train_sframe, features=features, target='hispanic', validation_set=None, max_iterations=30)

### Accuracy of this model against the Hispanic test set

In [25]:
svm_hispanic.evaluate(hispanic_test)['accuracy']

1.0

### Accuracy of this model against the Chinese and White test set

In [26]:
print("Accuracy against chinese test set: "+str(svm_hispanic.evaluate(chinese_test)['accuracy']))
print("Accuracy against white test set: "+str(svm_hispanic.evaluate(white_test)['accuracy']))

Accuracy against chinese test set: 0.0
Accuracy against white test set: 0.0


As we can see, the SVM Classifier on trained on recognizing hispanic features is not good as it wrongly predicts all the examples in the White and Chinese test set as Hispanics. This can be attributed to having only around 38 training samples to train the SVM classifier. Let us now try logistic regression on the Hispanic test set

## Logistic Classifier

In [27]:
logistic_hispanic = graphlab.logistic_classifier.create(train_sframe, features=features, target='hispanic', validation_set=None,
                                                       l1_penalty=0, l2_penalty=0)

### Accuracy of this model against the Hispanic test set

In [28]:
logistic_hispanic.evaluate(hispanic_test)['accuracy']

0.5714285714285714

### Accuracy of this model against the White and Chinese test set

In [29]:
print("Accuracy against chinese test set: "+str(logistic_hispanic.evaluate(chinese_test)['accuracy']))
print("Accuracy against white test set: "+str(logistic_hispanic.evaluate(white_test)['accuracy']))

Accuracy against chinese test set: 1.0
Accuracy against white test set: 0.866666666667


As we can see, because of the lack of data, this model does not have a good success rate in predicting Hispanics. But it does a good job in predicting **non**-hispanics

## Neural Net Classifier

In [30]:
neural_hispanic = graphlab.neuralnet_classifier.create(train_sframe, features=features, target='hispanic', validation_set=None)

Using network:

### network layers ###
layer[0]: FullConnectionLayer
  init_sigma = 0.01
  init_random = gaussian
  init_bias = 0
  num_hidden_units = 10
layer[1]: SigmoidLayer
layer[2]: FullConnectionLayer
  init_sigma = 0.01
  init_random = gaussian
  init_bias = 0
  num_hidden_units = 2
layer[3]: SoftmaxLayer
### end network layers ###

### network parameters ###
learning_rate = 0.001
momentum = 0.9
### end network parameters ###



### Accuracy of this model against the Hispanic test set

In [46]:
neural_hispanic.evaluate(hispanic_test)

{'accuracy': 0.0, 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 1
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        0        |   7   |
 +--------------+-----------------+-------+
 [1 rows x 3 columns]}

### Accuracy of this model against the White and Chinese test set

In [45]:
print("Accuracy against chinese test set: "+str(neural_hispanic.evaluate(chinese_test)['accuracy']))
print("Accuracy against white test set: "+str(neural_hispanic.evaluate(white_test)['accuracy']))

Accuracy against chinese test set: 1.0
Accuracy against white test set: 1.0


# White Classifier

## SVM Classifier

In [56]:
svm_white = graphlab.svm_classifier.create(train_sframe, features=features, target='white', validation_set=None, max_iterations=30)

### Accuracy of this model agains the White test set

In [60]:
svm_white.evaluate(chinese_test)

{'accuracy': 0.0, 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 1
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        0        |   95  |
 +--------------+-----------------+-------+
 [1 rows x 3 columns], 'f1_score': 0.0, 'precision': None, 'recall': 0.0}

### Accuracy of this model against the Hispanic and Chinese test sets

In [62]:
print("Accuracy against hispanic test set: "+str(svm_white.evaluate(hispanic_test)['accuracy']))
print("Accuracy against chinese test set: "+str(svm_white.evaluate(chinese_test)['accuracy']))

Accuracy against hispanic test set: 1.0
Accuracy against chinese test set: 1.0


## Logistic Classifier

In [9]:
logistic_white = graphlab.logistic_classifier.create(train_sframe, features=features, target='white', validation_set=None,
                                                    l1_penalty=0, l2_penalty=0)

### Accuracy of this model against the White test set

In [10]:
logistic_white.evaluate(white_test)

{'accuracy': 0.6666666666666666, 'auc': 0.0, 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 2
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        1        |   10  |
 |      1       |        0        |   5   |
 +--------------+-----------------+-------+
 [2 rows x 3 columns], 'f1_score': 0.8, 'log_loss': 0.5524594737085454, 'precision': 1.0, 'recall': 0.6666666666666666, 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+----+---+
 | threshold | fpr | tpr | p  | n |
 +-----------+-----+-----+----+---+
 |    0.0    | nan | 1.0 | 15 | 0 |
 |   1e-05   | nan | 1.0 | 15 | 0 |
 |   2e-05   | nan | 1.0 | 15 | 0 |
 |   3e-05   | nan | 1.0 | 15 | 0 |
 |   4e-05   | nan | 1.0 | 15 | 0 |
 |   5e-05   | nan | 1.0 | 15 | 0 |
 |   6e-05   | nan | 1.0 | 15 | 0 |
 |  

### Accuracy of this model against the Chinese and Hispanic test sets

In [11]:
print("Accuracy against hispanic test set: "+str(logistic_white.evaluate(hispanic_test)['accuracy']))
print("Accuracy against chinese test set: "+str(logistic_white.evaluate(chinese_test)['accuracy']))

Accuracy against hispanic test set: 0.571428571429
Accuracy against chinese test set: 1.0
