# Data exploration
Exploration of the datasets used in the paper 'A Probabilistic Formulation of Unsupervised Text Style Transfer' (code and data: https://github.com/cindyxinyiwang/deep-latent-sequence-model).

In [30]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Apply matplotlib's 'fivethirtyeight' style sheet to figures drawn in this notebook.
plt.style.use('fivethirtyeight')
# Show up to 400 characters per cell so long sentences are not cut off in
# DataFrame previews. The fully qualified option name is used because the
# abbreviated form relies on pattern matching and can become ambiguous if
# pandas adds another option with a similar name.
pd.set_option('display.max_colwidth', 400)

## Yelp 

In [31]:
# Root folder of the datasets, given as a relative path.
# NOTE(review): get_dataset carries its own copy of this root path, so changing
# data_path here does not affect what get_dataset loads.
data_path = '../deep-latent-sequence-model/data'
# Sub-folder holding the Yelp files.
yelp_path = os.path.join(data_path, 'yelp')

In [32]:
def read_column(path, encoding='utf-8'):
    """Read a text file and return its lines with surrounding whitespace removed.

    Parameters
    ----------
    path : str or os.PathLike
        File to read; each line becomes one list entry.
    encoding : str, default 'utf-8'
        Encoding used to decode the file. It is passed explicitly so that
        non-ASCII text (for example Cyrillic sentences) is decoded the same
        way on every platform instead of depending on the locale default.

    Returns
    -------
    list of str
        One entry per line, in file order, each passed through ``str.strip()``.
        Blank lines are kept as empty strings.
    """
    with open(path, encoding=encoding) as f:
        # Iterate the handle directly; no need to materialise readlines() first.
        return [line.strip() for line in f]

In [36]:
def get_dataset(dataset, split_type='train',
                root_data_path='../deep-latent-sequence-model/data'):
    """Load one split of a dataset into a two-column DataFrame.

    Reads ``<root_data_path>/<dataset>/<split_type>.txt`` and
    ``<root_data_path>/<dataset>/<split_type>.attr`` with ``read_column`` and
    pairs them line by line.

    Parameters
    ----------
    dataset : str
        Name of the dataset sub-directory (e.g. 'yelp').
    split_type : str, default 'train'
        File stem of the split to load.
    root_data_path : str, default '../deep-latent-sequence-model/data'
        Directory containing the dataset sub-directories. Exposed as a
        parameter so the notebook can be pointed at another checkout without
        editing this function.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``attr``, one row per line of the input files.

    Raises
    ------
    ValueError
        If the ``.txt`` and ``.attr`` files do not have the same number of lines.
    """
    dataset_dir = os.path.join(root_data_path, dataset)
    text_file = os.path.join(dataset_dir, f'{split_type}.txt')
    attr_file = os.path.join(dataset_dir, f'{split_type}.attr')
    text = read_column(text_file)
    attr = read_column(attr_file)
    # pandas would also raise ValueError here, but without saying which files
    # are out of sync; fail early with an actionable message instead.
    if len(text) != len(attr):
        raise ValueError(
            f'{text_file} has {len(text)} lines but {attr_file} has {len(attr)}'
        )
    return pd.DataFrame({'text': text, 'attr': attr})

In [37]:
# Load the 'train' split of the 'yelp' dataset and preview the first five rows.
yelp_train = get_dataset(dataset='yelp', split_type='train')
yelp_train.head(n=5)

Unnamed: 0,text,attr
0,i was sadly mistaken .,negative
1,"so on to the hoagies , the italian is general run of the mill .",negative
2,minimal meat and a ton of shredded lettuce .,negative
3,nothing really special & not worthy of the $ _num_ price tag .,negative
4,"second , the steak hoagie , it is atrocious .",negative


## Shakespeare

In [38]:
# Load the 'train' split of the 'shakespeare' dataset and preview the first ten rows.
shake_train = get_dataset(dataset='shakespeare', split_type='train')
shake_train.head(n=10)

Unnamed: 0,text,attr
0,I have a mind to strike thee ere thou speak'st .,original
1,"Yet if thou say Antony lives , is well , Or friends with Caesar , or not captive to him , I'll set thee in a shower of gold and hail Rich pearls upon thee .",original
2,"Madam , he's well .",original
3,Well said .,original
4,And friends with Caesar .,original
5,Th' art an honest man .,original
6,Caesar and he are greater friends than ever .,original
7,Make thee a fortune from me .,original
8,"But yet , madam , I do not like ""But yet."" It does allay The good precedence .",original
9,"Fie upon ""But yet."" ""But yet"" is as a jailer to bring forth Some monstrous malefactor .",original


## Decipher

In [39]:
# Load the 'train' split of the 'decipher' dataset and preview the first ten rows.
decipher_train = get_dataset(dataset='decipher', split_type='train')
decipher_train.head(n=10)

Unnamed: 0,text,attr
0,the pad thai was bland and seemed to lack ingredients .,original
1,she 's pretty happy with both the quality and the quantity of the food .,original
2,the seats to sit on were either cut open or falling off .,original
3,good pricing .,original
4,fantastic jalapeo chicken .,original
5,such a great deal !,original
6,greg and his staff always provide the best service .,original
7,the outdoor seating area on the roof is the perfect outdoor dining experience !,original
8,excellent food !,original
9,even the kids love it .,original


In [42]:
# Preview the first ten rows whose attribute is anything other than 'original'.
decipher_train.loc[decipher_train['attr'].ne('original')].head(n=10)

Unnamed: 0,text,attr
200000,__great__ __store__ __to__ __shop__ __!__,cipher
200001,__food__ __was__ __so-so__ __and__ __the__ __service__ __was__ __bad__ __.__,cipher
200002,__they__ __have__ __a__ __great__ __selection__ __.__,cipher
200003,__the__ __staff__ __is__ __terrific__ __and__ __helpful__ __.__,cipher
200004,__the__ __park__ __is__ __very__ __clean__ __and__ __friendly__ __.__,cipher
200005,"__hey__ __chipotle__ __,__ __where__ __is__ __the__ __meat__ __?__",cipher
200006,__this__ __is__ __the__ __worst__ __store__ __.__,cipher
200007,__perfect__ __!__,cipher
200008,"__unfortunately__ __,__ __the__ __customer__ __service__ __,__ __myself__ __and__ __my__ __group__ __experienced__ __was__ __below__ __par__ __.__",cipher
200009,__selection__ __is__ __excellent__ __.__,cipher


## Sr_bos
Dataset of Serbian and Bosnian texts (used for unsupervised translation).

In [43]:
# Load the 'train' split of the 'sr_bos' dataset and preview the first ten rows.
sr_train = get_dataset(dataset='sr_bos', split_type='train')
sr_train.head(n=10)

Unnamed: 0,text,attr
0,"Наводно , одсутни радник је био запослен у Школи са пуним радним временом .",sr
1,У Пониквама су живели само Милан Котлаја и жена му Јока .,sr
2,"Ми имамо огромно искуство , али смо почели превише приземно да копирамо Запад .",sr
3,Једном математичарком и пијанисткињом .,sr
4,Тамо није било руских система противваздушне одбране .,sr
5,Међупростори су затварани тзв .,sr
6,"Исто је и са концентрацијама тешких метала – олова , кадмијума и цинка .",sr
7,"На делатност личности гледао је идеалистички , ван везе с друштвеним условима .",sr
8,Чланарина за приправне чланове је 500 динара .,sr
9,Да су те признали као уметника .,sr


In [46]:
# Show every row whose attribute is not 'sr'; the notebook display abbreviates
# the middle of a long frame, so both the first and last rows stay visible.
sr_train.loc[sr_train['attr'].ne('sr')]

Unnamed: 0,text,attr
160000,"Ljudsko srce uopšte je neobična stvar , a žensko srce osobito !",bos
160001,"Zakon kao zakon , postoji da bi se kršio , i to dobro .",bos
160002,Na farmi je smješteno 95 rasnih krava simentalki uvezenih iz Njemačke i Češke .,bos
160003,Već su se javili određeni problemi i otpor prema radu agencije .,bos
160004,Anglo-američki odnosi dosegli su najnižu tačku od kraja tridesetih godina .,bos
...,...,...
295888,Suštinsko obilježje terorizma jeste politički motiv počinilaca .,bos
295889,Putem mobilnih timova pravo da glasa imao je 10.181 birača .,bos
295890,I pored toga je prohodnost putnih pravaca dobra zahvaljujući intervencijama ekipa putne operative .,bos
295891,Na području općine Živinice poplavljeno je oko 400 Ha poljoprivrednog zemljišta .,bos
