## Import Library

In [1]:
from utils.colors import ColorsCorpusReader
from nltk.translate.bleu_score import corpus_bleu
from collections import Counter
import re
import torch.nn as nn
import numpy as np
import os, sys
from sklearn.model_selection import train_test_split
from scipy.fft import fft
import colorsys
from itertools import product
from utils.torch_color_describer import (ContextualColorDescriber, create_example_dataset)
import utils
from utils.utils import UNK_SYMBOL
import utils.model_utils as mu
%load_ext autoreload

In [2]:
from baseline.model import (
    BaselineTokenizer, BaselineColorEncoder,
    BaselineEmbedding, BaselineDescriber, GloVeEmbedding
)
from experiment.word_embeddings.helper import Embedding, EmbeddingType

from transformers import (
    BertTokenizer, BertModel,
    XLNetTokenizer, XLNetModel,
    RobertaTokenizer, RobertaModel,
    ElectraTokenizer, ElectraModel,    
)

## Dataset

In [3]:
utils.utils.fix_random_seeds()
COLORS_SRC_FILENAME = os.path.join(
    "data", "colors", "filteredCorpus.csv")

dev_corpus = ColorsCorpusReader(
    COLORS_SRC_FILENAME,
    word_count=None,
    normalize_colors=True)

dev_examples = list(dev_corpus.read())
dev_rawcols, dev_texts = zip(*[[ex.colors, ex.contents] for ex in dev_examples])
dev_rawcols_train, dev_rawcols_test, dev_texts_train, dev_texts_test = \
    train_test_split(dev_rawcols, dev_texts)

In [4]:
len(dev_rawcols), len(dev_texts)

(46994, 46994)

## Color Representation

In [5]:
import torch
import colorsys

def load_model_feature_extractor(model_arch='vgg11'):
    model = torch.hub.load('pytorch/vision:v0.6.0', model_arch, pretrained=True)
    model.classifier = model.classifier[:-3]
    feature_extractor = model
    return feature_extractor

feature_extractor = load_model_feature_extractor()
feature_extractor.eval()

Using cache found in /Users/yanjiang/.cache/torch/hub/pytorch_vision_v0.6.0


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
 

### Convert Color Rep

In [2]:
def convert_color_to_rgb(color):
    # Convert from HLS to RGB
    rgb = colorsys.hls_to_rgb(color[0],color[1],color[2])
    return rgb

def convert_to_imagenet_input(hsl):
    rgb = convert_color_to_rgb(hsl)
    r = torch.full((224,224),rgb[0]).unsqueeze(2)
    g = torch.full((224,224),rgb[1]).unsqueeze(2)
    b = torch.full((224,224),rgb[2]).unsqueeze(2)
    expanded_rep = torch.cat((r,g,b),2)    
#     print(type(expanded_rep), expanded_rep.shape)
    expanded_rep = expanded_rep.permute(2,1,0).unsqueeze(0)
    return expanded_rep

def convert_color_tuple(colors):
    converted_colors = [[convert_to_imagenet_input(col) for col in cols] for cols in colors ] 
#     print (len(converted_colors), len(converted_colors[0]), converted_colors[0][0].shape)
    return converted_colors

In [7]:
def extract_features_from_batch(extractor, examples):
    output = extractor(examples)
    shape = output.shape
    output = output.reshape((shape[0],shape[1]))
#     print("shape", shape, "reshape", output.shape)
    return output

In [8]:
def get_color_representation(rawcols):

    converted_data = convert_color_tuple(rawcols)
    print("input:", len(rawcols), "length of converted_data:", len(converted_data))
    
    extracted_features  = []
    with torch.no_grad():
        for colors in converted_data:
            # Convert to 3x224x224 matrix
            cols_batch = torch.cat((colors[0],
                                colors[1],
                                colors[2]))
#             print("cols_batch:", cols_batch.shape)

            # Run color through the feature extractor
            batch_extraction = extract_features_from_batch(feature_extractor, cols_batch)
#             print("batch_extraction:", batch_extraction.shape)

            # convert to numpy array
            batch_extraction_np = batch_extraction.numpy()
#             print("batch_extraction_np", len(batch_extraction_np), type(batch_extraction_np[0]), batch_extraction_np[0].size)

            # append to list
            extracted_features.append(batch_extraction)

            # Print some stats 
            length = len(extracted_features)
            print(length)
            if length%100==0:
                total_size = sys.getsizeof(extracted_features)
                print(f"Running batch number: {length}, Size of array: {total_size/(1024**2)} Megabytes")

    return extracted_features

In [9]:
len(dev_rawcols[:3])

3

In [10]:
x = get_color_representation(dev_rawcols[:3])

input: 3 length of converted_data: 3
1
2
3


In [11]:
len(x), x[0].shape

(3, torch.Size([3, 4096]))

In [12]:
# len(extracted_features), len(extracted_features[0]), len(extracted_features[0][0]), type(extracted_features[0][0])

In [13]:
import pickle
pickle.dump( extracted_features, open( "data/colors/vgg11_color_embeddings_test.pickle", "wb" ) )

NameError: name 'extracted_features' is not defined

In [None]:
objects = []
with (open("data/colors/resnet18_color_embeddings.pickle", "rb")) as openfile:
    while True:
        try:
            objects.append(pickle.load(openfile))
        except EOFError:
            break

### Full data

In [None]:
color_encoder = BaselineColorEncoder()

In [None]:
def create_data(tokenizer, include_position=False):    
    rawcols, texts = zip(*[[ex.colors, ex.contents] for ex in examples])

    raw_colors_train, raw_colors_test, texts_train, texts_test = \
        train_test_split(rawcols, texts)

    tokens_train = [
        mu.tokenize_colour_description(text, tokenizer, include_position) for text in texts_train
    ]
#     colors_train = [
#         color_encoder.encode_color_context(colors) for colors in raw_colors_train
#     ]
    colors_train = [
        get_color_representation(colors) for colors in raw_colors_train
    ]

    return colors_train, tokens_train, raw_colors_test, texts_test

In [None]:
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base') 
colors_train, tokens_train, raw_colors_test, texts_test = create_data(roberta_tokenizer)

In [None]:
len(colors_train), len(colors_train[0]), len(colors_train[0][0]), type(colors_train[0][0])

In [None]:
%time roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(texts_test, roberta_model, roberta_tokenizer)

In [None]:
def get_roberta_embedding(dev_texts, if_contextual=False):
    roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    roberta_model = RobertaModel.from_pretrained('roberta-base')    
    
    roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(dev_texts, roberta_model, roberta_tokenizer)
    roberta_contextual_embeddings, roberta_contextual_vocab = mu.extract_positional_embeddings(dev_texts, roberta_model, roberta_tokenizer)
    
    if not if_contextual:
        return (roberta_vocab, roberta_embeddings)
    else:
        return roberta_contextual_vocab, roberta_contextual_embeddings

In [None]:
baseline_model = BaselineDescriber(
    roberta_vocab,
    embedding=roberta_embeddings,
    early_stopping=True
)

In [None]:
%time _ = baseline_model.fit(colors_train[:5], tokens_train[:5])

In [None]:
def evaluate(trained_model, tokenizer, color_seqs_test, texts_test):
    tok_seqs = [mu.tokenize_colour_description(text, tokenizer) for text in texts_test]
    col_seqs = [color_encoder.encode_color_context(colors) for colors in color_seqs_test]

    return trained_model.evaluate(col_seqs, tok_seqs)

In [None]:
evaluate(baseline_model, roberta_tokenizer, raw_colors_test, texts_test)