# Convert to Style Transfer
Converts data to style transfer format required by deep-latent-sequence-model.

We want to perform Yelp to Sentiment Treebank style transfer and back. Since both corpora are in English (although from different domains), we will use a shared vocabulary. TODO: is there an option to use separate vocabularies? There should be, since the authors did Serbian-Bosnian translation (but that probably used a shared vocabulary too).

In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from read_utils import read_sentiment_treebank
from style_transfer import get_dataset

In [2]:
# Load the Stanford Sentiment Treebank and preview the first rows.
sst_dir = 'data/stanfordSentimentTreebank'
sst_df = read_sentiment_treebank(sst_dir)
sst_df.head()

Unnamed: 0,sentence_index,sentence,sentiment,splitset_label
0,1,The Rock is destined to be the 21st Century 's...,0.69444,1
1,2,The gorgeously elaborate continuation of `` Th...,0.83333,1
2,3,Effective but too-tepid biopic,0.51389,2
3,4,If you sometimes like to go to the movies to h...,0.73611,2
4,5,"Emerges as something rare , an issue movie tha...",0.86111,2


In [3]:
# Pull the raw text column of each yelp split (train, dev, test — in that order)
# and show how many sentences each split contains.
yelp_train, yelp_dev, yelp_test = (
    get_dataset('yelp', split_name)['text'].values
    for split_name in ('train', 'dev', 'test')
)
tuple(len(part) for part in (yelp_train, yelp_dev, yelp_test))

(444101, 63483, 126670)

Folder structure we want:
* sentiment
   * train_0.txt
   * train_0.attr
   * dev_0.txt
   * dev_0.attr
   * test_0.txt
   * test_0.attr
   * text.vocab

In [4]:
# Output directory for every file written by this notebook.
sentiment_path = 'data/sentiment'
# makedirs(exist_ok=True) is idempotent, has no check-then-create race, and also
# creates missing parent directories (os.mkdir would raise if `data/` were absent).
os.makedirs(sentiment_path, exist_ok=True)

## Splitting data

In [5]:
# Split the SST sentences into train (80%) and a held-out remainder, then cut the
# remainder in half to obtain the dev and test sets.
# No `random_state` is passed to train_test_split, so reproducibility relies on the
# global NumPy seed set on the next line; the seed and both calls must stay in this
# order for the split to be repeatable.
np.random.seed(42)
sst_train, sst_other = train_test_split(sst_df['sentence'].values, train_size=0.8)
sst_dev, sst_test = train_test_split(sst_other, test_size=0.5)

In [6]:
# Number of sentences in the SST train / dev / test splits.
tuple(len(part) for part in (sst_train, sst_dev, sst_test))

(9484, 1185, 1186)

In [7]:
# Share of all training sentences that come from yelp (one minus the SST share).
n_sst_train = len(sst_train)
n_yelp_train = len(yelp_train)
1 - n_sst_train / (n_sst_train + n_yelp_train)

0.9790910193238312

In [8]:
# How many yelp training sentences there are per SST training sentence.
yelp_sentence_count = len(yelp_train)
sst_sentence_count = len(sst_train)
yelp_sentence_count / sst_sentence_count

46.826339097427244

## Creating vocab

In [9]:
# Stack the two training corpora (SST first, then yelp) and build a parallel
# array of domain labels in the same order.
sst_attr = ['sentiment_treebank'] * len(sst_train)
yelp_attr = ['yelp'] * len(yelp_train)
text = np.concatenate([sst_train, yelp_train])
attr = np.array(sst_attr + yelp_attr)

In [10]:
# One row per training sentence with its domain label; show three random rows.
text_df = pd.DataFrame(dict(text=text, attr=attr))
text_df.sample(3)

Unnamed: 0,text,attr
20552,i have never received worse customer service .,yelp
321139,i loved those .,yelp
203933,drinks are good and the cocktail list is long .,yelp


In [11]:
# Count lower-cased, whitespace-separated tokens over the combined training text.
# The loop variable is deliberately not called `text`: at notebook top level that
# would rebind the `text` array that `text_df` is built from, so re-running the
# DataFrame cell afterwards would silently use a single sentence instead.
vocab = Counter()
for sentence in text_df['text'].values:
    vocab.update(sentence.lower().split())

In [12]:
# Number of distinct tokens in the shared (SST + yelp) training vocabulary.
len(vocab.keys())

21814

In [14]:
# Vocabulary size and total token count of the yelp training set alone.
# The loop variable is named `sentence` rather than `text` so that it does not
# rebind the notebook-level `text` array used to build `text_df`.
n_words = 0
yelp_vocab = Counter()
for sentence in yelp_train:
    words = sentence.lower().split()
    yelp_vocab.update(words)
    n_words += len(words)
print(f'vocabulary size: {len(yelp_vocab)}, number of words: {n_words}')

vocabulary size: 9599, number of words: 3967766


In [15]:
# Vocabulary size and total token count of the SST training set alone.
# The loop variable is named `sentence` rather than `text` so that it does not
# rebind the notebook-level `text` array used to build `text_df`.
n_words = 0
sst_vocab = Counter()
for sentence in sst_train:
    words = sentence.lower().split()
    sst_vocab.update(words)
    n_words += len(words)
print(f'vocabulary size: {len(sst_vocab)}, number of words: {n_words}')

vocabulary size: 17517, number of words: 181484


Even though the yelp training set has about 46 times more sentences than the Stanford Sentiment Treebank training set, SST has the richer vocabulary (17,517 vs. 9,599 distinct tokens). This may partly be because SST sentences are longer, but yelp still has about 20 times more words in total.

In [16]:
# Show the first five yelp test sentences that contain a token missing from the
# training vocabulary, together with the first such token in each sentence.
# The loop variable is named `sentence` rather than `text` so that it does not
# rebind the notebook-level `text` array used to build `text_df`.
cnt = 0
for sentence in yelp_test:
    # First out-of-vocabulary token of this sentence, or None if all are known.
    oov_word = next(
        (word for word in sentence.lower().split() if word not in vocab),
        None,
    )
    if oov_word is not None:
        print(oov_word)
        print(sentence)
        print()
        cnt += 1
    if cnt >= 5:
        break

monti
monti 's used to be so good ... now it 's awful .

monti
we had n't been to monti 's in quite awhile .

monti
something 's very wrong at monti 's .

caffe
caffe boa is about as exciting as bulk trash pickup day .

caffe
caffe boa used to have the best happy hour on mill .



Some proper nouns are not included in the vocabulary. This means that, in theory, the model cannot represent every word it sees. How is it able to copy such words, then?

In [17]:
# Keep only tokens that occur at least `min_count` times in the training text,
# then show how many survive the cut.
min_count = 4
vocab_cut = {}
for token, freq in vocab.items():
    if freq >= min_count:
        vocab_cut[token] = freq
len(vocab_cut)

11330

In [19]:
# Vocabulary file layout: the four special tokens first, then the retained
# tokens in sorted order, one entry per line.
special_tokens = ['<pad>\n', '<unk>\n', '<s>\n', '</s>\n']
vocab_words = special_tokens + sorted([word + '\n' for word in vocab_cut])
print(vocab_words[:15])

# An explicit UTF-8 encoding keeps the written files independent of the
# platform's default locale encoding.
with open(os.path.join(sentiment_path, 'text.vocab'), 'w', encoding='utf-8') as f:
    f.writelines(vocab_words)

# Attribute (domain) vocabulary: one label per line.
attr_vocab = ['yelp\n', 'sentiment_treebank\n']
with open(os.path.join(sentiment_path, 'attr.vocab'), 'w', encoding='utf-8') as f:
    f.writelines(attr_vocab)

['<pad>\n', '<unk>\n', '<s>\n', '</s>\n', '!\n', '#\n', '$\n', '%\n', '&\n', "'\n", "''\n", "'60s\n", "'70s\n", "'burgh\n", "'d\n"]


In [31]:
# (sentences, output file stem, domain label) for every split of both corpora.
# Stems ending in `_0` are yelp, stems ending in `_1` are the sentiment treebank.
files = [
    (yelp_train, 'train_0', 'yelp'),
    (yelp_dev, 'dev_0', 'yelp'),
    (yelp_test, 'test_0', 'yelp'),
    (sst_train, 'train_1', 'sentiment_treebank'),
    (sst_dev, 'dev_1', 'sentiment_treebank'),
    (sst_test, 'test_1', 'sentiment_treebank')
]

# For each entry write `<stem>.txt` (one sentence per line) and `<stem>.attr`
# (the domain label repeated once per sentence).
# The label is unpacked as `attr_name`, not `attr`, so the loop does not rebind
# the notebook-level `attr` array that `text_df` is built from.
for texts, name, attr_name in files:
    text_lines = [text + '\n' for text in texts]
    attr_lines = [attr_name + '\n' for _ in texts]
    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open(os.path.join(sentiment_path, name + '.txt'), 'w', encoding='utf-8') as f:
        f.writelines(text_lines)
    with open(os.path.join(sentiment_path, name + '.attr'), 'w', encoding='utf-8') as f:
        f.writelines(attr_lines)

In [32]:
# For every split, merge both domains into `<split>.txt` / `<split>.attr`,
# keeping the order of `files` (yelp entries first, then sentiment treebank).
for split in ('train', 'dev', 'test'):
    # Entries of `files` whose file stem contains this split's name.
    # Comprehension variables stay local, so the notebook-level `attr` array
    # is not rebound by this cell.
    subfiles = [
        (texts, name, label) for texts, name, label in files if split in name
    ]
    text_all = np.concatenate([texts for texts, _, _ in subfiles])
    attr_all = [label for texts, _, label in subfiles for _ in texts]

    text_lines = [text + '\n' for text in text_all]
    attr_lines = [label + '\n' for label in attr_all]
    # Check alignment before touching the disk so a mismatch leaves no partial files.
    assert len(attr_lines) == len(text_lines)

    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open(os.path.join(sentiment_path, split + '.txt'), 'w', encoding='utf-8') as f:
        f.writelines(text_lines)
    with open(os.path.join(sentiment_path, split + '.attr'), 'w', encoding='utf-8') as f:
        f.writelines(attr_lines)

In [36]:
# Build balanced dev/test files (`<split>_li.txt` / `<split>_li.attr`): from each
# domain draw SAMPLE sentences without replacement, where SAMPLE is the size of
# the smaller of the two SST held-out splits.
SAMPLE = min(len(sst_dev), len(sst_test))
print(SAMPLE)
np.random.seed(42)
for split in ('dev', 'test'):
    # Sampling happens in the order of `files`, which fixes the sequence of draws
    # from the seeded global RNG. Comprehension variables stay local, so the
    # notebook-level `attr` array is not rebound by this cell.
    subfiles = [
        (np.random.choice(texts, SAMPLE, replace=False), name, label)
        for texts, name, label in files
        if split in name
    ]
    text_all = np.concatenate([texts for texts, _, _ in subfiles])
    attr_all = [label for texts, _, label in subfiles for _ in texts]

    text_lines = [text + '\n' for text in text_all]
    attr_lines = [label + '\n' for label in attr_all]
    # Check alignment before touching the disk so a mismatch leaves no partial files.
    assert len(attr_lines) == len(text_lines)

    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open(os.path.join(sentiment_path, split + '_li.txt'), 'w', encoding='utf-8') as f:
        f.writelines(text_lines)
    with open(os.path.join(sentiment_path, split + '_li.attr'), 'w', encoding='utf-8') as f:
        f.writelines(attr_lines)

1185
