<a href="https://colab.research.google.com/github/jfogarty/machine-learning-intro-workshop/blob/master/LearnLatentDirections.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Originally from https://github.com/Puzer/stylegan-encoder/blob/master/Learn_direction_in_latent_space.ipynb
Modifications and additions by Kevin Sikorski

# Learn Latent Space Directions

### Imports

In [0]:
# Move to /content (presumably the Colab working directory) and delete any existing
# `stylegan` directory there, so the clone in the next cell starts from scratch.
%cd /content
%rm -rf stylegan

In [0]:
#!git clone https://github.com/NVlabs/stylegan  # doesn't have encoder or the prepackaged latent directions.
# Clone the Puzer fork instead of the NVlabs repository above, then change into the
# checkout (presumably so that `config`, `dnnlib` and `encoder` are importable from it).
!git clone https://github.com/Puzer/stylegan
%cd stylegan

import os
import pickle
import config
import dnnlib
import gzip
import json
import numpy as np
from tqdm import tqdm_notebook
import warnings
import matplotlib.pylab as plt
%matplotlib inline
warnings.filterwarnings("ignore")

### Load the data

In [0]:
# URL of the gzip-compressed pickle that holds the training data.
LATENT_TRAINING_DATA = 'https://drive.google.com/uc?id=1xMM3AFq0r014IIhBLiMCjKJJvbhLUQ9t'
    
# Fetch the file through dnnlib (using the configured cache directory), gunzip it and
# unpickle the three objects it contains.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only run
# this against a download source you trust.
with dnnlib.util.open_url(LATENT_TRAINING_DATA, cache_dir=config.cache_dir) as f:
    qlatent_data, dlatent_data, labels_data = pickle.load(gzip.GzipFile(fileobj=f))

    
# Display the first label record to see which attributes are available.
labels_data[0] 

### Prepare features and labels

In [0]:
# One flat feature vector of length 18*512 per sample.
X_data = dlatent_data.reshape((-1, 18 * 512))

# Pull the attribute record out of every label once, then derive both targets from it.
face_attributes = [record['faceAttributes'] for record in labels_data]
y_age_data = np.array([attrs['age'] for attrs in face_attributes])
y_gender_data = np.array([attrs['gender'] == 'male' for attrs in face_attributes])  # True marks 'male'

# Features and both targets must describe the same samples.
assert len(X_data) == len(y_age_data) == len(y_gender_data)
len(X_data)

In [0]:
# Overlay the age histograms of the two gender groups on a single set of axes.
fig, ax = plt.subplots()
ax.hist(y_age_data[y_gender_data], bins=30, color='blue', alpha=0.5, label='male')
ax.hist(y_age_data[~y_gender_data], bins=30, color='red', alpha=0.5, label='female')
ax.legend()
ax.set_title('Distribution of age within gender')
ax.set_xlabel('Age')
ax.set_ylabel('Population')
plt.show()


### Training a linear model for obtaining gender direction in latent space

In [0]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

In [0]:
%%time

# Fit a class-balanced logistic regression on the flattened latents. Its weight
# vector, viewed as (18, 512), is used as the gender direction.
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_data, y_gender_data)
gender_direction = clf.coef_.reshape((18, 512))

In [0]:
%%time
# SGD-trained logistic model, used instead of LogisticRegression for the sake of speed.
# NOTE(review): newer scikit-learn releases name this loss 'log_loss' - confirm
# against the installed version.
clf = SGDClassifier(loss='log', class_weight='balanced')
scores = cross_val_score(clf, X_data, y_gender_data, scoring='accuracy', cv=5)

# Refit on the full data set so `clf` is a trained model after this cell.
clf.fit(X_data, y_gender_data)

print(scores)
print('Mean: ', scores.mean())

### Effect of age on gender detection errors

In [0]:

# Number of samples per age bin, and number of misclassified samples per bin
# counted with the same bin edges.
# NOTE(review): the predictions are made on X_data itself; if `clf` was fitted on
# that same data this is a training-set error rate, not a held-out one.
bins, bin_edges = np.histogram(y_age_data, bins=30)
errors = np.histogram(y_age_data[clf.predict(X_data) != y_gender_data], bin_edges)[0]

# Plot against the centre of each age bin so the x axis is in years; plotting the
# ratio alone would put the bin index (0..29) under an axis labelled 'Age'.
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# Bins without any samples give 0/0 = NaN, which matplotlib draws as a gap.
plt.plot(bin_centers, errors / bins)
plt.title('Dependency of gender detection errors on age')
plt.ylabel('Gender detection error rate')
plt.xlabel('Age')
plt.show()

### Effect of training data size on accuracy

In [0]:
%%time

nb_folds = 5
splits = 20
max_train_samples = 1000  # only the first 1000 training samples of each fold are used
scores = np.zeros((splits, nb_folds))  # scores[split_id][fold_id] = test accuracy
dataset_size = list()

# shuffle and random_state are passed by keyword: recent scikit-learn releases
# declare them keyword-only and reject the positional form.
folds = StratifiedKFold(n_splits=nb_folds, shuffle=True, random_state=42)

for fold_id, (train_idx, test_idx) in enumerate(folds.split(X_data, y_gender_data)):
    X_train, X_test = X_data[train_idx][:max_train_samples], X_data[test_idx]
    y_train, y_test = y_gender_data[train_idx][:max_train_samples], y_gender_data[test_idx]

    # Train on a growing prefix of the training pool: 1/splits, 2/splits, ... of it.
    for split_id in range(splits):
        nb_samples = int((len(X_train)/splits) * (split_id+1))
        dataset_size.append(nb_samples)
        clf = SGDClassifier('log', class_weight='balanced').fit(X_train[:nb_samples], y_train[:nb_samples])
        scores[split_id][fold_id] = accuracy_score(y_test, clf.predict(X_test))


# dataset_size holds one entry per (fold, split); the first `splits` entries are
# the sizes used in the first fold and serve as the x axis.
plt.plot(dataset_size[:splits], scores.mean(axis=1))
plt.title('Dependency of accuracy on training data size')
plt.xlabel('Dataset size')
plt.ylabel('Accuracy')
plt.show()

### Find the importance of features in each layer

In [0]:
# View the flat features as (samples, 18, 512) once, so one layer can be sliced per iteration.
layered_latents = X_data.reshape((-1, 18, 512))

# Cross-validated gender accuracy when the classifier sees a single layer only.
scores = list()
for layer in tqdm_notebook(range(18)):
    clf = SGDClassifier(loss='log', class_weight='balanced')
    layer_scores = cross_val_score(clf, layered_latents[:, layer], y_gender_data, scoring='accuracy', cv=5)
    scores.append(layer_scores.mean())

plt.plot(np.arange(0, 18), scores)
plt.xlabel('Layer')
plt.ylabel('Accuracy')
plt.show()

### Visualization of gender transformation

In [0]:
%%time
# Train the gender direction once more before visualising it: a class-balanced
# logistic regression on the flattened latents, with its weights viewed as (18, 512).
flat_latents = X_data.reshape((-1, 18*512))
clf = LogisticRegression(class_weight='balanced').fit(flat_latents, y_gender_data)
gender_direction = clf.coef_.reshape((18, 512))

In [0]:
import PIL.Image
import dnnlib.tflib as tflib
import encoder
from encoder.generator_model import Generator

# URL of the pickle that holds the three networks unpacked below.
URL_FFHQ = 'https://drive.google.com/uc?id=1MEGjdvVpUsu1jB4zrXZN7Y4kBBOzizDQ'

# Initialise dnnlib's TensorFlow helper library before loading the networks.
tflib.init_tf()
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only run
# this against a download source you trust.
with dnnlib.util.open_url(URL_FFHQ, cache_dir=config.cache_dir) as f:
    generator_network, discriminator_network, Gs_network = pickle.load(f)

In [0]:
# Print a description of each loaded network.
for network in (Gs_network, generator_network, discriminator_network):
    print(network)

# Wrap Gs in the encoder's Generator, one image per call and with noise randomisation off.
generator = Generator(Gs_network, batch_size=1, randomize_noise=False)

In [0]:
def generate_image(latent_vector):
    """Render one latent code with the global ``generator``.

    ``latent_vector`` is reshaped to ``(1, 18, 512)``, so it must hold 18*512
    values. Returns the first generated image as an RGB PIL image resized to
    256x256.
    """
    dlatents = latent_vector.reshape((1, 18, 512))
    generator.set_dlatents(dlatents)
    first_image = generator.generate_images()[0]
    return PIL.Image.fromarray(first_image, 'RGB').resize((256, 256))

def move_and_show(latent_vector, direction, coeffs):
    """Show a row of faces obtained by shifting a latent code along ``direction``.

    For every value in ``coeffs`` a copy of ``latent_vector`` is made and its
    first 8 entries along axis 0 are replaced by the matching entries of
    ``latent_vector + coeff * direction``; the rest of the copy is unchanged.
    Each copy is rendered with ``generate_image`` and drawn in its own
    subplot titled with the coefficient. ``latent_vector`` is not modified.

    Args:
        latent_vector: latent code of one sample (callers in this notebook
            pass an array of shape (18, 512)).
        direction: array that broadcasts against ``latent_vector``.
        coeffs: sequence of step sizes, one subplot per value.
    """
    # squeeze=False always returns a 2-D array of axes, so a single coefficient
    # works as well (otherwise plt.subplots returns a bare Axes that cannot be indexed).
    fig, axes = plt.subplots(1, len(coeffs), figsize=(15, 10), dpi=80, squeeze=False)
    for axis, coeff in zip(axes[0], coeffs):
        new_latent_vector = latent_vector.copy()
        new_latent_vector[:8] = (latent_vector + coeff*direction)[:8]
        axis.imshow(generate_image(new_latent_vector))
        axis.set_title('Coeff: %0.1f' % coeff)
        axis.axis('off')
    plt.show()

In [0]:
# These faces were generated with untruncated sampling.
gender_coeffs = [-5, -1.5, 0, 1.5, 5]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, gender_direction, gender_coeffs)

## Glasses?

In [0]:
# Flattened latents and ages, rebuilt from the loaded data for this section.
X_data = dlatent_data.reshape((-1, 18 * 512))
face_attributes = [record['faceAttributes'] for record in labels_data]
y_age_data = np.array([attrs['age'] for attrs in face_attributes])

# NOTE: True means the face is labelled 'NoGlasses', i.e. the positive class is "no glasses".
y_glasses_data = np.array([attrs['glasses'] == 'NoGlasses' for attrs in face_attributes])

assert len(X_data) == len(y_age_data) == len(y_glasses_data)
len(X_data)

In [0]:
# y_glasses_data is True for faces labelled 'NoGlasses', so the masked group is the
# one without glasses and the inverted mask selects the faces with glasses.
plt.hist(y_age_data[y_glasses_data], bins=30, color='blue', alpha=0.5, label='no glasses')
plt.hist(y_age_data[~y_glasses_data], bins=30, color='red', alpha=0.5, label='glasses')
plt.legend()
plt.title('Distribution of age within glasses')
plt.xlabel('Age')
plt.ylabel('Population')
plt.show()

In [0]:
%%time

# Class-balanced logistic regression on the glasses labels; the weights, viewed as
# (18, 512), are kept as the direction and written to disk.
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_data, y_glasses_data)
glasses_direction = clf.coef_.reshape((18, 512))
np.save("/content/glasses.npy", glasses_direction)

In [0]:
# Shift the first ten faces along the glasses direction.
glasses_coeffs = [-4, -1, -0.5, 0, 0.5, 1, 4]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, glasses_direction, glasses_coeffs)

## 'Stache?

In [0]:
# Flattened latents and ages, rebuilt from the loaded data for this section.
X_data = dlatent_data.reshape((-1, 18 * 512))
face_attributes = [record['faceAttributes'] for record in labels_data]
y_age_data = np.array([attrs['age'] for attrs in face_attributes])

# NOTE: True means the moustache score is below 0.5, i.e. the positive class is "no moustache".
y_stache_data = np.array([attrs['facialHair']['moustache'] < 0.5 for attrs in face_attributes])

assert len(X_data) == len(y_age_data) == len(y_stache_data)
len(X_data)

In [0]:
# y_stache_data is True where the moustache score is below 0.5, so the masked group
# is the one without a moustache and the inverted mask selects the faces with one.
plt.hist(y_age_data[y_stache_data], bins=30, color='blue', alpha=0.5, label='no moustache')
plt.hist(y_age_data[~y_stache_data], bins=30, color='red', alpha=0.5, label='moustache')
plt.legend()
plt.title('Distribution of age within stache')
plt.xlabel('Age')
plt.ylabel('Population')
plt.show()

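It is interesting to see how moustaches evolve over time.  It seems only male children have them, and at an impressive frequency, too.  They seem to peak in popularity among men from 20-33 years.  They remain at a rather constant low frequency in women from age 30 and on.  Note, however, that the histogram above splits the population by the moustache label (`y_stache_data`, i.e. a moustache score below 0.5 versus 0.5 and above), not by gender, so its two series should be read as "no moustache" and "moustache" rather than as men and women.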

In [0]:
%%time

# Class-balanced logistic regression on the moustache labels; the weights, viewed as
# (18, 512), are kept as the direction and written to disk.
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_data, y_stache_data)
stache_direction = clf.coef_.reshape((18, 512))
np.save("/content/moustache.npy", stache_direction)

In [0]:
# Shift the first ten faces along the moustache direction.
stache_coeffs = [-2, -1.3, -1, -0.5, 0, 1]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, stache_direction, stache_coeffs)

As one might expect, the presence of a moustache is highly correlated with beardliness, gender and age. More moustache implies a beard, male, and older.  Less moustache implies no beard, female, and young.

## Hair Color

In [0]:
def fancy_hair_scan(data, wanted_color):
    """Return the confidence recorded for ``wanted_color`` in one label record.

    Args:
        data: label record; ``data['faceAttributes']['hair']['hairColor']`` must be
            an iterable of mappings with ``'color'`` and ``'confidence'`` keys.
        wanted_color: colour name to look up, e.g. ``'brown'``.

    Returns:
        The ``'confidence'`` value of the first entry whose ``'color'`` equals
        ``wanted_color``, or ``0.0`` when no entry matches.
    """
    # Named hair_colors rather than `dict` so the builtin is not shadowed.
    hair_colors = data['faceAttributes']['hair']['hairColor']
    matches = (entry['confidence'] for entry in hair_colors if entry['color'] == wanted_color)
    return next(matches, 0.0)

#fancy_hair_scan(labels_data[0], 'brown')

In [0]:
# Minimum confidence for a face to count as having a given hair colour.
HAIR_CONFIDENCE_THRESHOLD = 0.8

X_data = dlatent_data.reshape((-1, 18*512))
y_age_data = np.array([x['faceAttributes']['age'] for x in labels_data])

# One boolean label array per hair colour: True where that colour's confidence
# exceeds the threshold.
y_brownhair_data = np.array([fancy_hair_scan(x, 'brown') > HAIR_CONFIDENCE_THRESHOLD for x in labels_data])
y_grayhair_data = np.array([fancy_hair_scan(x, 'gray') > HAIR_CONFIDENCE_THRESHOLD for x in labels_data])
y_blondhair_data = np.array([fancy_hair_scan(x, 'blond') > HAIR_CONFIDENCE_THRESHOLD for x in labels_data])
y_blackhair_data = np.array([fancy_hair_scan(x, 'black') > HAIR_CONFIDENCE_THRESHOLD for x in labels_data])
y_redhair_data = np.array([fancy_hair_scan(x, 'red') > HAIR_CONFIDENCE_THRESHOLD for x in labels_data])
y_otherhair_data = np.array([fancy_hair_scan(x, 'other') > HAIR_CONFIDENCE_THRESHOLD for x in labels_data])

# Validate the arrays built in this cell (not the moustache labels from an earlier
# section) against the feature matrix.
hair_label_arrays = (
    y_brownhair_data, y_grayhair_data, y_blondhair_data,
    y_blackhair_data, y_redhair_data, y_otherhair_data,
)
assert len(X_data) == len(y_age_data)
assert all(len(hair_labels) == len(X_data) for hair_labels in hair_label_arrays)
len(X_data)

In [0]:
# y_brownhair_data is True where the 'brown' confidence exceeds 0.8, so the masked
# group is the brown-haired one and the inverted mask selects everyone else.
plt.hist(y_age_data[y_brownhair_data], bins=30, color='blue', alpha=0.5, label='brown hair')
plt.hist(y_age_data[~y_brownhair_data], bins=30, color='red', alpha=0.5, label='not brown hair')
plt.legend()
plt.title('Distribution of age within brown hair')
plt.xlabel('Age')
plt.ylabel('Population')
plt.show()

In [0]:
%%time

def _fit_direction(labels, save_path):
    """Fit a class-balanced logistic regression of ``labels`` on ``X_data``.

    The learned weights are viewed as (18, 512) and written to ``save_path``
    with ``np.save``.

    Returns:
        (model, direction): the fitted classifier and the (18, 512) weight array.
    """
    model = LogisticRegression(class_weight='balanced').fit(X_data, labels)
    direction = model.coef_.reshape((18, 512))
    np.save(save_path, direction)
    return model, direction


# One direction per hair colour. `clf` is rebound on every line, so after the cell
# it is the classifier fitted last (the 'other' colour).
clf, hair_brown_direction = _fit_direction(y_brownhair_data, "/content/brownhair.npy")
clf, hair_gray_direction = _fit_direction(y_grayhair_data, "/content/grayhair.npy")
clf, hair_blond_direction = _fit_direction(y_blondhair_data, "/content/blondhair.npy")
clf, hair_black_direction = _fit_direction(y_blackhair_data, "/content/blackhair.npy")
clf, hair_red_direction = _fit_direction(y_redhair_data, "/content/redhair.npy")
clf, hair_other_direction = _fit_direction(y_otherhair_data, "/content/otherhair.npy")

### Brown Hair People

In [0]:
# Shift the first ten faces along the brown-hair direction.
brown_coeffs = [-6, -4, -2, 0, 2, 4, 6]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, hair_brown_direction, brown_coeffs)

### Gray Hair People

In [0]:
# Shift the first ten faces along the gray-hair direction.
gray_coeffs = [-6, -4, -2, 0, 2, 4, 6]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, hair_gray_direction, gray_coeffs)

### Blond Hair People

In [0]:
# Shift the first ten faces along the blond-hair direction.
blond_coeffs = [-6, -4, -2, 0, 2, 4, 6]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, hair_blond_direction, blond_coeffs)

### Black Hair People

In [0]:
# Shift the first ten faces along the black-hair direction.
black_coeffs = [-4, -2, -1, 0, 1, 2, 4]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, hair_black_direction, black_coeffs)

### Red Hair People

In [0]:
# Shift the first ten faces along the red-hair direction.
red_coeffs = [-4, -2, -1, 0, 1, 2, 4]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, hair_red_direction, red_coeffs)

### Other Hair People

In [0]:
# Shift the first ten faces along the 'other' hair-colour direction.
other_coeffs = [-3, -2, -1, 0, 1, 2, 3]
for sample_idx in range(10):
    sample_latent = X_data.reshape((-1, 18, 512))[sample_idx]
    move_and_show(sample_latent, hair_other_direction, other_coeffs)