In [1]:
# Notebook metadata: author list and version label.
__authors__ = "Anton Gochev, Jaro Habr, Yan Jiang, Samuel Kahn"
__version__ = "XCS224u, Stanford, Spring 2021"

## Contents

1. [Setup](#Setup)
1. [Dataset](#Dataset)
1. [Baseline-System](#Baseline-System)
1. [Transformers](#Transformers)
1. [Few-Shot-Learning](#Few-Shot-Learning)

## Setup

In [2]:
from colors import ColorsCorpusReader
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch_color_describer import ContextualColorDescriber, create_example_dataset
import utils
import matplotlib.pyplot as plt
import matplotlib.patches as mpatch
import numpy as np
from baseline import BaselineTokenizer

## Dataset

In [3]:
# Relative path to the filtered colors corpus CSV file.
COLORS_SRC_FILENAME = os.path.join("data", "colors", "filteredCorpus.csv")

In [4]:
# Corpus reader over the CSV at COLORS_SRC_FILENAME. The keyword options are
# forwarded unchanged to ColorsCorpusReader (see the `colors` module for their
# exact semantics).
reader_options = dict(word_count=None, normalize_colors=True)
corpus = ColorsCorpusReader(COLORS_SRC_FILENAME, **reader_options)

In [None]:
# Materialize every example yielded by the corpus reader so it can be reused below.
examples = [*corpus.read()]

In [None]:
# Display how many examples were read from the corpus.
len(examples)

In [None]:
def _examples_with_condition(condition):
    """Return the examples (in corpus order) whose ``condition`` equals ``condition``."""
    return [candidate for candidate in examples if candidate.condition == condition]


# Partition the corpus by the condition label attached to each example.
close_examples = _examples_with_condition("close")
split_examples = _examples_with_condition("split")
far_examples = _examples_with_condition("far")

In [None]:
# Report the number of examples per condition.
# The third line previously repeated the "close" count, so the "far" count was
# never shown; it now reports `far_examples`.
print(f" close: {len(close_examples)}")
print(f" split: {len(split_examples)}")
print(f"   far: {len(far_examples)}")

In [None]:
# Bar chart of the number of examples in each condition.
condition_labels = ['close', 'split', 'far']
condition_counts = [len(close_examples), len(split_examples), len(far_examples)]
plt.bar(
    np.arange(len(condition_labels)),
    height=condition_counts,
    tick_label=condition_labels
)

### Number of words used for description

In [None]:
tokenizer = BaselineTokenizer()

# Histogram mapping description length (in tokens) -> number of examples.
description_words = dict()
for example in examples:
    words = tokenizer.encode(example.contents)  # use content tokenizer
    description_length = len(words) - 2  # don't count start and end symbols
    description_words[description_length] = description_words.get(description_length, 0) + 1

# Same histogram with keys inserted in ascending order of description length.
description_words_sorted = {
    length: description_words[length] for length in sorted(description_words)
}

# Sanity check: every example has been counted exactly once.
assert sum(description_words_sorted.values()) == len(examples)

In [None]:
# Show the description-length histogram (length -> example count), ordered by length.
description_words_sorted

In [None]:
# Bar chart of how many examples have each description length.
lengths = list(description_words_sorted.keys())
counts = list(description_words_sorted.values())

plt.figure(figsize=(20, 4.8))
plt.bar(np.arange(len(lengths)), height=counts, tick_label=lengths)
plt.xticks(rotation=45)
plt.show()

## Baseline-System

In [None]:
# baseline system based on assignment 4 (LSTM, GRU)

## Transformers

In [None]:
# transformers experiments

## Few-Shot-Learning

In [None]:
# maybe some more experiments with GPT-3 to compare with few-shot learning

# conditioning sequence:
#I am a highly intelligent question answering bot that can describe colors with human readable text as precisely as possible. Given an array of three context colors with their RGB (red, green, blue) values, describe the target color in such a way that a human could recognize the target color even if the colors were reordered. The target color is the last color in the array.

# Print the first ten examples (split examples first, then close ones) as
# question/answer pairs and display each example without its contents.
prompt_examples = (split_examples + close_examples)[:10]
for example in prompt_examples:
    question = f"\nQ: Describe the target color of {example.rgb_colors()}."
    print(question)
    answer = f"A: {example.contents}."
    print(answer)
    example.display(print_contents=False)