## Import Library

In [57]:
from colors import ColorsCorpusReader
from nltk.translate.bleu_score import corpus_bleu
from collections import Counter
import re
import torch.nn as nn
import numpy as np
import os
from sklearn.model_selection import train_test_split
from scipy.fft import fft
import colorsys
from itertools import product
from torch_color_describer import (ContextualColorDescriber, create_example_dataset)
import utils
from utils import START_SYMBOL, END_SYMBOL, UNK_SYMBOL
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
from baseline import (
    BaselineEmbedding, BaselineDescriber, GloVeEmbedding
)

from transformers import (
    BertTokenizer, BertModel,
    XLNetTokenizer, XLNetModel,
    RobertaTokenizer, RobertaModel,
    ElectraTokenizer, ElectraModel,    
)

import model_utils as mu

## Dataset

In [59]:
COLORS_SRC_FILENAME = os.path.join(
    "data", "colors", "filteredCorpus.csv"
)

In [60]:
corpus = ColorsCorpusReader(
    COLORS_SRC_FILENAME,
    word_count=None, #2
    normalize_colors=True
)

In [5]:
examples = list(corpus.read())

In [36]:
def get_roberta_embedding(dev_texts, if_contextual=False):
    roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    roberta_model = RobertaModel.from_pretrained('roberta-base')
    
    %time \
    roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(dev_texts, roberta_model, roberta_tokenizer)
    %time \
    roberta_contextual_embeddings, roberta_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, roberta_model, roberta_tokenizer)
    
    if not if_contextual:
        return roberta_vocab, roberta_embeddings
    else:
        return roberta_contextual_vocab, roberta_contextual_embeddings

### DEV data

In [25]:
def create_dev_data():
    dev_color_seqs, dev_word_seqs, dev_vocab = create_example_dataset(
        group_size=50,
        vec_dim=2
    )

    dev_colors_train, dev_colors_test, dev_words_train, dev_words_test = \
        train_test_split(dev_color_seqs, dev_word_seqs)
    
    return dev_vocab, dev_colors_train, dev_words_train, dev_colors_test, dev_words_test

In [27]:
dev_vocab, dev_colors_train, dev_tokens_train, dev_colors_test, dev_texts_test = \
    create_dev_data()

In [72]:
dev_vocab, dev_embedding = get_roberta_embedding(dev_vocab)

CPU times: user 795 ms, sys: 12.3 ms, total: 808 ms
Wall time: 859 ms
CPU times: user 2min 49s, sys: 3.5 s, total: 2min 53s
Wall time: 2min 56s


In [63]:
embedding = BaselineEmbedding()
dev_glove_embedding, dev_glove_vocab = embedding.create_glove_embedding(dev_vocab)

In [70]:
dev_model = BaselineDescriber(
    dev_vocab,
    embedding=dev_embedding,
    early_stopping=True
)

In [73]:
%time _ = dev_model.fit(dev_colors_train, dev_tokens_train)

TypeError: new(): data must be a sequence (got dict)

In [77]:
type(dev_embedding)

dict

In [69]:
dev_model.evaluate(dev_colors_test, dev_texts_test)

{'listener_accuracy': 0.23684210526315788, 'corpus_bleu': 0.05000000000000001}

### Full data

In [None]:
def create_data():    
    rawcols, texts = zip(*[[ex.colors, ex.contents] for ex in examples])

    raw_colors_train, raw_colors_test, texts_train, texts_test = \
        train_test_split(rawcols, texts)

    tokens_train = [tokenizer.encode(text) for text in texts_train]
    colors_train = [
        color_encoder.encode_color_context(colors) for colors in raw_colors_train
    ]   

    return vocab, colors_train, tokens_train, raw_colors_test, texts_test

In [None]:
roberta_vocab, roberta_embedding = get_roberta_embedding(dev_texts) 

In [None]:
vocab, colors_train, tokens_train, raw_colors_test, texts_test = create_data()

In [None]:
%time _ = baseline_model.fit(colors_train, tokens_train)

In [None]:
roberta_model = BaselineDescriber(
    roberta_vocab,
    embedding=roberta_embeddings,
    early_stopping=True
)

In [None]:
def evaluate(trained_model, color_seqs_test, texts_test):
    tok_seqs = [tokenizer.encode(text) for text in texts_test]
    col_seqs = [color_encoder.encode_color_context(colors) for colors in color_seqs_test]

    return trained_model.evaluate(col_seqs, tok_seqs)

In [None]:
evaluate(roberta_model, raw_colors_test, texts_test)