Merge branch 'release/v0.1dev' into main

leap-ec · Oct 14, 2021 · 0ebe928 · 0ebe928
2 parents 2f0517d + 13952f0
commit 0ebe928
Show file tree

Hide file tree

Showing 16 changed files with 862 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,83 @@
+/tmp.*
+/local
+*.log
+*.pyc
+__pycache__
+docker-nom/build/
+\#*\#
+.\#*
+venv/
+*.egg-info/
+docs/build/
+.coverage*
+htmlcov/
+.ipynb_checkpoints/
+.pytest_cache
+build/
+dist/
+
+# Subversion
+.svn
+
+
+# Vim (and some others)
+*~
+*.swp
+
+# Eclipse
+.cache
+.classpath
+.project
+.settings
+build-eclipse
+
+# Gradle
+.gradle
+gradle-app.setting
+
+# Sublime Text
+*.sublime-workspace
+
+# NetBeans
+.netbeans
+catalog.xml
+generated
+nb-configuration.xml
+
+# Mac OS
+.DS_Store
+__MACOSX
+
+# Intellij
+.idea/
+.idea/workspace.xml
+.idea/libraries
+.idea/kotlinc.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/scopes
+.idea/artifacts
+.idea/uiDesigner.xml
+.idea/dataSources.local.xml
+.idea/dataSources.local.xml
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources
+.idea/kotlinc.xml
+.idea/sonar*
+*.iws
+*.iml
+.idea/compiler.xml
+.idea/kotlinc.xml
+.idea/inspectionProfiles/profiles_settings.xml
+.idea/misc.xml
+.idea/modules.xml
+
+# vscode
+.vscode
+
+# dask
+dask-worker-space/
+
+# Generated data
+*.csv
diff --git a/CHANGELOG b/CHANGELOG
@@ -0,0 +1,9 @@
+Version 0.1dev Migrated to github, 10/14/2021
+
+This version moved from the code.ornl.gov repository to GitHub to facilitate
+use as an open-source project.
+
+Version 0.0 Migrated from internal repository, 7/13/2021
+
+Migrated from internal git repository to code.ornl.gov, and generalized source to be more
+readily applicable to new problems.
diff --git a/README.md b/README.md
@@ -0,0 +1,60 @@
+# Gremlin
+
+Gremlin is a machine learning model evaluator. Find out where your model performs poorly.
+
+## Requires
+* Python 3.7 or 3.8
+* LEAP (https://github.com/AureumChaos/LEAP)
+
+## How it works
+Gremlin uses an adversarial evolutionary algorithm (EA) to find features where a model
+performs poorly. The intent is for the user to leverage that information to tune training
+data for subsequent model retraining to improve performance in those poorly performing situations.
+
+## Configuration
+At a bare minimum, Gremlin needs an algorithm, a `Problem`, and a `Representation`. The
+`Problem` and `Representation` should inherit from `leap_ec.problem.Problem` and
+`leap_ec.representation.Representation`, respectively. The model to evaluate should be
+handled within the custom `Problem` class.
+
+Example configuration:
+
+```
+evolution:
+    name: leap_ec.algorithm.generational_ea *or* custom_generator_function
+    params:
+        max_generations: 50
+        pop_size: 25
+        problem:
+            name: leap_ec.problem.Problem *or* custom_class
+            params:
+                maximize: False
+        representation:
+            name: leap_ec.representation.Representation *or* custom_class
+            params:
+                initialize:
+                    name: curried_initializer_function (see leap_ec.int_rep.create_int_vector)
+                    params: {}
+analysis:
+    name: analysis_function
+```
+
+The `name:` field specifies the function or class to import. If this field is followed
+by `params:`, Gremlin will attempt to call the function or instantiate the class with the
+arguments that follow before running the evolutionary algorithm.
+
+## Example
+Example code and configuration for a real problem can be found in `examples/MNIST`.
+This problem involves Gremlin evolving patterns of occlusion (graying-out pixels of an
+image) in order to cause a convolutional neural network to perform poorly on digit
+recognition.
+
+To run it, change into the `examples/MNIST` directory and execute:
+
+```
+$ gremlin MNIST_config.yml
+```
+
+## Sub-directories
+* `gremlin/` -- main `gremlin` code
+* `examples/` -- examples for using gremlin
diff --git a/examples/MNIST/MNIST_config.yml b/examples/MNIST/MNIST_config.yml
@@ -0,0 +1,60 @@
+# Configuration file for Gremlin
+# Usage:
+#   $ gremlin MNIST_config.yml
+
+# these variables are defined once here and referenced in multiple places below
+pop_size: 50
+bounds: [[0, 55], [0, 55], [0, 55], [0, 55], [0, 55], [0, 55]]
+
+
+# this defines the evolutionary algorithm and its parameters
+evolution:
+  name: leap_ec.algorithm.generational_ea
+  params:
+    max_generations: 30
+    pop_size: ${pop_size}
+    problem:
+      name: gremlin.problem.DatasetProblem
+      params:
+        maximize: True
+        model:
+          name: MNIST_example.LeNet
+          params:
+            checkpoint_path: ./data/model.pt
+        metric:
+          name: torch.nn.functional.cross_entropy
+        generator:
+          name: MNIST_example.MNISTRowColOcclusionGenerator
+          params:
+            batch_size: 500
+    representation:
+      name: leap_ec.representation.Representation
+      params:
+        initialize:
+          name: leap_ec.int_rep.initializers.create_int_vector
+          params:
+            bounds: ${bounds}
+        decoder:
+          name: MNIST_example.RowColDecoder
+          params: {}
+    pipeline:
+      - name: leap_ec.ops.tournament_selection
+        params:
+          k: 5
+      - name: leap_ec.ops.clone
+        params: {}
+      - name: leap_ec.ops.uniform_crossover
+        params: {}
+      - name: leap_ec.int_rep.ops.mutate_binomial
+        params:
+          std: 2.5
+          bounds: ${bounds}
+          expected_num_mutations: 3
+      - name: leap_ec.ops.evaluate
+        params: {}
+      - name: leap_ec.ops.pool
+        params:
+          size: ${pop_size}
+
+analysis:
+  name: MNIST_example.MNIST_heatmap
diff --git a/examples/MNIST/MNIST_example.py b/examples/MNIST/MNIST_example.py
@@ -0,0 +1,138 @@
+'''
+MNIST_example.py
+
+MNIST occlusion problem.
+
+Gremlin will find patterns of row/column occlusion that
+cause the model to perform poorly.
+
+This file defines the model, decoder, generator, and analyzer
+that are dynamically imported, instantiated, and used by the
+Gremlin interface.
+
+Training the model is separate from Gremlin.
+'''
+import copy
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.datasets import MNIST
+from torchvision.transforms import ToTensor
+from leap_ec.decoder import Decoder
+
+
+class RowColDecoder(Decoder):
+    '''
+    Decide which rows and columns are grayed out
+    '''
+    def decode(self, genome, *args, **kwargs):
+        row_indices = genome[genome < 28]
+        col_indices = genome[genome >= 28]
+        col_indices = col_indices - 28
+        return [row_indices, col_indices]
+
+
+class LeNet(nn.Module):
+    def __init__(self, checkpoint_path=None):
+        super(LeNet, self).__init__()
+        self.conv1 = nn.Conv2d(1, 20, 5, 1)
+        self.conv2 = nn.Conv2d(20, 50, 5, 1)
+        self.lin1 = nn.Linear(4*4*50, 500)
+        self.lin2 = nn.Linear(500, 10)
+        if checkpoint_path is not None:
+            ckpt = torch.load(checkpoint_path)
+            self.load_state_dict(ckpt['model_state_dict'])
+            self.eval()
+
+    def forward(self, xx):
+        xx = F.relu(self.conv1(xx))
+        xx = F.max_pool2d(xx, 2, 2)
+        xx = F.relu(self.conv2(xx))
+        xx = F.max_pool2d(xx, 2, 2)
+        xx = xx.view(-1, 4*4*50)
+        xx = F.relu(self.lin1(xx))
+        return self.lin2(xx)
+
+
+def MNIST_heatmap(population):
+    '''
+    Generate heatmaps of the population
+    genome using Gremlin's output
+    '''
+    genomes = [ind.genome for ind in population]
+    genomes = np.stack(genomes)
+    rows = genomes[genomes < 28]
+    cols = genomes[genomes >= 28] - 28
+    heatmap = np.zeros((28, 28))
+    for row in rows:
+        heatmap[row, :] += 1
+    for col in cols:
+        heatmap[:, col] += 1
+    plt.imshow(heatmap, cmap='hot')
+    plt.title('Population Occlusion Frequency')
+    plt.savefig('MNIST_heatmap.png')
+    plt.show()
+
+
+class MNISTRowColOcclusionGenerator:
+    '''
+    Modifies a dataset of images by "graying-out"
+    rows and columns of an image
+
+    The dataset must be of the form (N, C, H, W)
+    where N is the number of images, C is the number
+    of channels (only supports 1 and 3),
+    H is the height of the image,
+    and W is the width of the image.
+
+    Attributes
+    ----------
+    dataset : np.array
+        set of images to alter
+        required dimensions (N, C, H, W)
+
+    Methods
+    -------
+    transform(image, rows, columns)
+        grey out a row/column of an image
+    '''
+    def __init__(self, batch_size, **kwargs):
+        dataset = MNIST('./data/', transform=ToTensor(),
+                        train=False, download=True)
+        loader = torch.utils.data.DataLoader(dataset=dataset,
+                                             batch_size=batch_size,
+                                             shuffle=True)
+        self.images, self.labels = next(iter(loader))
+
+    def transform(self, image, rows, columns):
+        # supports grayscale, rgb, rgba
+        if image.shape[0] in [1, 3, 4]:
+            for c in range(image.shape[0]):
+                image[c, rows, :] = 0.5
+                image[c, :, columns] = 0.5
+        else:
+            raise ValueError(
+                f'Unsupported image dimensions {image.shape}')
+        return image
+
+    def __call__(self, features):
+        '''
+        Generate a new dataset modifying by features
+
+        Parameters
+        ----------
+        features : list
+           which rows and columns to obfuscate
+           features[0] has rows
+           features[1] has columns
+        '''
+        # transform images in the dataset
+        images = copy.deepcopy(self.images)
+        for i in range(len(images)):
+            images[i] = self.transform(images[i],
+                                       features[0],
+                                       features[1])
+        return images