## Import Library

In [11]:
from utils.colors import ColorsCorpusReader
from nltk.translate.bleu_score import corpus_bleu
from collections import Counter
import re
import torch.nn as nn
import numpy as np
import os, sys
from sklearn.model_selection import train_test_split
from scipy.fft import fft
import colorsys
from itertools import product
from utils.torch_color_describer import (ContextualColorDescriber, create_example_dataset)
import utils
from utils.utils import UNK_SYMBOL
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
from baseline.model import (
    BaselineTokenizer, BaselineColorEncoder,
    BaselineEmbedding, BaselineDescriber, GloVeEmbedding
)
from experiment.word_embeddings.helper import Embedding, EmbeddingType

from transformers import (
    BertTokenizer, BertModel,
    XLNetTokenizer, XLNetModel,
    RobertaTokenizer, RobertaModel,
    ElectraTokenizer, ElectraModel,    
)

import utils.model_utils as mu

## Dataset

In [13]:
def get_roberta_embedding(dev_texts, if_contextual=False):
    roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    roberta_model = RobertaModel.from_pretrained('roberta-base')    
    
    roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(dev_texts, roberta_model, roberta_tokenizer)
    roberta_contextual_embeddings, roberta_contextual_vocab = mu.extract_positional_embeddings(dev_texts, roberta_model, roberta_tokenizer)
    
    if not if_contextual:
        return (roberta_vocab, roberta_embeddings)
    else:
        return roberta_contextual_vocab, roberta_contextual_embeddings

In [28]:
utils.utils.fix_random_seeds()
COLORS_SRC_FILENAME = os.path.join(
    "data", "colors", "filteredCorpus.csv")

dev_corpus = ColorsCorpusReader(
    COLORS_SRC_FILENAME,
    word_count=None,
    normalize_colors=True)

examples = list(dev_corpus.read())
rawcols, texts = zip(*[[ex.colors, ex.contents] for ex in examples])
rawcols_train, rawcols_test, texts_train, texts_test = \
    train_test_split(rawcols, texts)

In [39]:
len(rawcols), type(rawcols[0]), len(rawcols[0]), len(rawcols[0][0])

(46994, list, 3, 3)

## Color Representation

In [15]:
import torch
import colorsys

def load_model_feature_extractor(model_arch='resnet18'):
    model = torch.hub.load('pytorch/vision:v0.6.0', model_arch, pretrained=True)
    feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])
    return feature_extractor

feature_extractor = load_model_feature_extractor()

Using cache found in /Users/yanjiang/.cache/torch/hub/pytorch_vision_v0.6.0


### Convert Color Rep

In [19]:
def convert_color_to_rgb(color):
    # Convert from HLS to RGB
    rgb = colorsys.hls_to_rgb(color[0],color[1],color[2])
    return rgb

def convert_to_imagenet_input(hsl):
    rgb = convert_color_to_rgb(hsl)
#     print("rgb", len(rgb))
    r = torch.full((224,224),rgb[0]).unsqueeze(2)
    g = torch.full((224,224),rgb[1]).unsqueeze(2)
    b = torch.full((224,224),rgb[2]).unsqueeze(2)
    expanded_rep = torch.cat((r,g,b),2)    
#     print(type(expanded_rep), expanded_rep.shape)
    expanded_rep = expanded_rep.permute(2,1,0).unsqueeze(0)
    return expanded_rep


def convert_color_tuple(colors):
    converted_colors = [[convert_to_imagenet_input(col) for col in cols] for cols in colors ] 
    return converted_colors

def extract_features_from_batch(extractor, examples):
    output = extractor(examples)
    shape = output.shape
    output = output.reshape((shape[0],shape[1]))
#     print("shape", shape, "reshape", output.shape)
    return output

def convert_color_tuple_v2(colors):
    extracted_features  = []
    with torch.no_grad():
        for cols in colors:
            cols_group = []
            for col in cols:
                converted_colors = convert_to_imagenet_input(col)     # [1, 3, 224, 224]
    #             print(converted_colors.shape)
                cols_group.append(converted_colors)
    #         print(len(cols_group), cols_group[0].shape) 
            cols_batch = torch.cat((cols_group[0], cols_group[1], cols_group[2]))
            batch_extraction = extract_features_from_batch(feature_extractor, cols_batch)
#             print("batch_extraction:", batch_extraction.shape)

            # convert to numpy array
            batch_extraction_np = batch_extraction.numpy()
            print("batch_extraction_np", len(batch_extraction_np), type(batch_extraction_np[0]), batch_extraction_np[0].size)

            # append to list
            extracted_features.append(batch_extraction_np)

            # Print some stats 
            length = len(extracted_features)
            if length%100==0:
                total_size = sys.getsizeof(extracted_features)
                print(f"Running batch number: {length}, Size of array: {total_size/(1024**2)} Megabytes")
    return extracted_features        

In [20]:
converted_data = convert_color_tuple_v2(rawcols[:2])

batch_extraction_np 3 <class 'numpy.ndarray'> 512
batch_extraction_np 3 <class 'numpy.ndarray'> 512


In [41]:
import pickle
pickle.dump(converted_data, open( "data/colors/resnet18_color_embeddings_test.pickle", "wb" ) )

In [49]:
def get_color_representation(rawcols):
    converted_data = convert_color_tuple(rawcols)

    extracted_features  = []
    with torch.no_grad():
        for colors in converted_data:
            # Convert to 3x224x224 matrix
            print(type(colors), len(colors), colors[0].shape)
            cols_batch = torch.cat((colors[0],
                                colors[1],
                                colors[2]))
            print("cols_batch:", cols_batch.shape)

            # Run color through the feature extractor
            batch_extraction = extract_features_from_batch(feature_extractor, cols_batch)
#             print("batch_extraction:", batch_extraction.shape)

            # convert to numpy array
            batch_extraction_np = batch_extraction.numpy()
#             print("batch_extraction_np", len(batch_extraction_np), type(batch_extraction_np[0]), batch_extraction_np[0].size)

            # append to list
            extracted_features.append(batch_extraction)

            # Print some stats 
            length = len(extracted_features)
            if length%100==0:
                total_size = sys.getsizeof(extracted_features)
                print(f"Running batch number: {length}, Size of array: {total_size/(1024**2)} Megabytes")

    return extracted_features

In [50]:
x = get_color_representation(rawcols[:2])

<class 'list'> 3 torch.Size([1, 3, 224, 224])
cols_batch: torch.Size([3, 3, 224, 224])
<class 'list'> 3 torch.Size([1, 3, 224, 224])
cols_batch: torch.Size([3, 3, 224, 224])


In [245]:
len(extracted_features), len(extracted_features[0]), len(extracted_features[0][0]), type(extracted_features[0][0])

(2, 3, 512, torch.Tensor)

### Full data

In [248]:
def create_data(tokenizer, include_position=False):    
    
    rawcols, texts = zip(*[[ex.colors, ex.contents] for ex in examples])

    raw_colors_train, raw_colors_test, texts_train, texts_test = \
        train_test_split(rawcols, texts)

    tokens_train = [
        mu.tokenize_colour_description(text, tokenizer, include_position) for text in texts_train
    ]
#     color_encoder = BaselineColorEncoder()
#     colors_train = [
#         color_encoder.encode_color_context(colors) for colors in raw_colors_train
#     ]
    colors_train = [
        get_color_representation(colors) for colors in raw_colors_train
    ]

    return colors_train, tokens_train, raw_colors_test, texts_test

In [249]:
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base') 
colors_train, tokens_train, raw_colors_test, texts_test = create_data(roberta_tokenizer)

424
Running batch number: 100, Size of array: 0.00087738037109375 Megabytes
Running batch number: 200, Size of array: 0.0016021728515625 Megabytes
Running batch number: 300, Size of array: 0.0024261474609375 Megabytes
Running batch number: 400, Size of array: 0.0031585693359375 Megabytes
Running batch number: 100, Size of array: 0.00087738037109375 Megabytes
Running batch number: 200, Size of array: 0.0016021728515625 Megabytes
Running batch number: 300, Size of array: 0.0024261474609375 Megabytes
Running batch number: 400, Size of array: 0.0031585693359375 Megabytes
Running batch number: 100, Size of array: 0.00087738037109375 Megabytes
Running batch number: 200, Size of array: 0.0016021728515625 Megabytes
Running batch number: 300, Size of array: 0.0024261474609375 Megabytes


KeyboardInterrupt: 

In [193]:
len(colors_train), len(colors_train[0]), len(colors_train[0][0]), type(colors_train[0][0])

(35245, 3, 54, list)

In [73]:
%time roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(texts_test, roberta_model, roberta_tokenizer)

CPU times: user 3.21 s, sys: 56.2 ms, total: 3.26 s
Wall time: 3.41 s


In [74]:
baseline_model = BaselineDescriber(
    roberta_vocab,
    embedding=roberta_embeddings,
    early_stopping=True
)

In [78]:
%time _ = baseline_model.fit(colors_train[:5], tokens_train[:5])

Stopping after epoch 12. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 6.363071918487549

CPU times: user 460 ms, sys: 37.7 ms, total: 498 ms
Wall time: 499 ms


In [79]:
def evaluate(trained_model, tokenizer, color_seqs_test, texts_test):
    tok_seqs = [mu.tokenize_colour_description(text, tokenizer) for text in texts_test]
    col_seqs = [color_encoder.encode_color_context(colors) for colors in color_seqs_test]

    return trained_model.evaluate(col_seqs, tok_seqs)

In [80]:
evaluate(baseline_model, roberta_tokenizer, raw_colors_test, texts_test)

{'listener_accuracy': 0.3486254149289301, 'corpus_bleu': 0.054725444702242845}