In [1]:
import json
import numpy as np
import collections
import pandas as pd
import csv
import random

## Pre-Process Real Reviews

In [2]:
# Build a list of lists of characters from the 5-star reviews
def preprocess_review_series(review_series, min_length=300):
    """Turn raw review strings into lists of lower-cased characters.

    For every review the first two characters and the last character are
    dropped (presumably a ``b'...'`` wrapper around the text -- confirm against
    the CSV), the rest is lower-cased, and every backslash together with the
    character that directly follows it is removed.  Reviews that keep more
    than ``min_length`` characters are wrapped in ``'<SOR>'`` / ``'<EOR>'``
    markers and collected.

    Args:
        review_series: iterable of review strings.  Entries that are not
            strings (e.g. NaN for an empty CSV cell) are skipped.
        min_length: a review is kept only if it has strictly more than this
            many characters after cleaning.  Defaults to 300.

    Returns:
        A list of reviews, each a list of the form
        ``['<SOR>', c1, c2, ..., '<EOR>']``.
    """
    review_list = []
    for new_review in review_series:
        if not isinstance(new_review, str):
            # Non-text entries cannot be sliced/lower-cased; skip them
            # instead of aborting the whole pre-processing run.
            continue
        lowered = new_review[2:-1].lower()
        kept_chars = []
        last_char = ''
        for char in lowered:
            # Drop the backslash itself and the character right after it.
            if char != '\\' and last_char != '\\':
                kept_chars.append(char)
            last_char = char
        if len(kept_chars) > min_length:
            review_list.append(['<SOR>'] + kept_chars + ['<EOR>'])
    return review_list

def get_review_series(review_path = '/home/kalvin_kao/yelp_challenge_dataset/review.csv'):
    """Load the review CSV and return the text of its 5-star reviews.

    Args:
        review_path: path of a CSV file with at least a ``stars`` and a
            ``text`` column.

    Returns:
        The ``text`` column (a pandas Series) restricted to the rows whose
        ``stars`` value equals 5; the original row index is preserved.
    """
    reviews = pd.read_csv(review_path)
    is_five_star = reviews['stars'] == 5
    return reviews.loc[is_five_star, 'text']

def get_business_list(business_path = '/home/kalvin_kao/yelp_challenge_dataset/business.csv'):
    """Read the business CSV at ``business_path`` and return it as a DataFrame."""
    business_df = pd.read_csv(business_path)
    return business_df

def split_train_test(review_list, training_samples, test_samples):
    """Sample disjoint training and test reviews from an already shuffled list.

    The first 80% of ``review_list`` (rounded down) forms the training pool
    and the last 20% (rounded down) forms the test pool.  The requested
    numbers of reviews are then drawn without replacement from each pool, and
    the total number of characters in each sample is printed.

    Args:
        review_list: list of reviews (each a sequence of characters); the
            caller is expected to have randomized its order.
        training_samples: number of reviews to draw from the training pool.
        test_samples: number of reviews to draw from the test pool.

    Returns:
        ``(randomized_training_list, randomized_testing_list)``.

    Raises:
        ValueError: from ``random.sample`` if a pool holds fewer reviews than
            requested.
    """
    total = len(review_list)
    train_len = int(np.floor(0.8 * total))
    test_len = int(np.floor(0.2 * total))
    train_pool = review_list[:train_len]
    # Use an explicit start index: ``review_list[-test_len:]`` would return the
    # WHOLE list when test_len == 0 and leak training reviews into the test pool.
    test_pool = review_list[total - test_len:]
    randomized_training_list = random.sample(train_pool, training_samples)
    randomized_testing_list = random.sample(test_pool, test_samples)
    print("number of training characters", sum(len(review) for review in randomized_training_list))
    print("number of test characters", sum(len(review) for review in randomized_testing_list))
    return randomized_training_list, randomized_testing_list

def make_train_test_data(five_star_review_series, training_samples=25000, test_samples=6250, seed=None):
    """Pre-process reviews and build train/test samples for three disjoint splits.

    The pre-processed reviews are shuffled and cut into three equally sized
    parts (each 33% of the data, rounded down): the first for the attack
    model, the second for the defense model and the last for the GAN.  Each
    part is then passed to ``split_train_test``.

    Args:
        five_star_review_series: iterable of raw review strings, forwarded to
            ``preprocess_review_series``.
        training_samples: reviews to draw for every training set.
        test_samples: reviews to draw for every test set.
        seed: optional integer.  When given, both NumPy's global RNG and the
            ``random`` module are seeded with it so the shuffle and the
            sampling are repeatable.  ``None`` (default) leaves the RNG state
            untouched.

    Returns:
        A 6-tuple: attack train, attack test, defense train, defense test,
        GAN train, GAN test.
    """
    if seed is not None:
        # Both generators are used: NumPy for the shuffle, ``random`` for sampling.
        np.random.seed(seed)
        random.seed(seed)
    review_list = preprocess_review_series(five_star_review_series)
    np.random.shuffle(review_list)
    # Split into 3 for 1) attack, 2) defense, and 3) GAN.
    one_third_len = int(np.floor(0.33 * len(review_list)))
    attack_review_list = review_list[:one_third_len]
    defense_review_list = review_list[one_third_len:2 * one_third_len]
    # Explicit start index: ``[-one_third_len:]`` would return everything when
    # one_third_len == 0 and overlap with the other two splits.
    gan_review_list = review_list[len(review_list) - one_third_len:]
    splits = []
    for subset in (attack_review_list, defense_review_list, gan_review_list):
        splits.extend(split_train_test(subset, training_samples, test_samples))
    return tuple(splits)

def make_vocabulary(dataset_list):
    """Build token<->id lookup tables covering every token in the datasets.

    Tokens are sorted before ids are assigned, so the same data always gives
    the same mapping (plain set iteration order for strings can differ from
    one interpreter run to the next).

    Args:
        dataset_list: iterable of token sequences (e.g. flat lists of
            characters); tokens must be hashable and mutually orderable.

    Returns:
        ``(char_dict, ids_to_words)``: token -> id and id -> token dicts,
        with ids running from 0 to ``len(vocabulary) - 1``.
    """
    unique_characters = sorted(set().union(*dataset_list))
    char_dict = {token: index for index, token in enumerate(unique_characters)}
    ids_to_words = {index: token for token, index in char_dict.items()}
    return char_dict, ids_to_words

def convert_to_ids(char_dict, review_list):
    """Convert a flat sequence of tokens to a flat (1D) integer array of ids.

    Args:
        char_dict: token -> integer id mapping.
        review_list: flat sequence of tokens.

    Returns:
        1D ``np.ndarray`` of integer ids (integer dtype even when
        ``review_list`` is empty).

    Raises:
        KeyError: if a token is missing from ``char_dict``.  Failing here
            avoids silently producing an object array containing ``None``.
    """
    try:
        review_ids = [char_dict[token] for token in review_list]
    except KeyError as err:
        raise KeyError("token {!r} is not in the vocabulary".format(err.args[0])) from None
    return np.array(review_ids, dtype=int)

In [3]:
# Location of the review CSV passed to get_review_series() in the next cell.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
review_path = '/home/kalvin_kao/yelp_challenge_dataset/review.csv'

In [4]:
# Number of reviews sampled for every training / test set.
training_samples, test_samples = 20000, 5000

# Load the 5-star review texts and build the attack / defense / GAN splits.
five_star_reviews = get_review_series(review_path)
(
    attack_training_review_list,
    attack_test_review_list,
    defense_training_review_list,
    defense_test_review_list,
    gan_training_review_list,
    gan_test_review_list,
) = make_train_test_data(five_star_reviews, training_samples, test_samples)

number of training characters 14398657
number of test characters 3616540
number of training characters 14439014
number of test characters 3617730
number of training characters 14426538
number of test characters 3574061


In [20]:
#np.random.shuffle(generated_reviews)
#generated_training_review_list, generated_test_review_list = split_train_test(generated_reviews, training_samples=20, test_samples=5)

number of training characters 2979
number of test characters 733


In [25]:
#generated_training_review_list[0]

"<SOR>a true hard fix!!! the owner, egg, olive and potato was classy, simple and ill take the time to even have found the food is damn again i've never been"

In [21]:
# Save the generated training reviews, one review per CSV row.
# NOTE(review): generated_training_review_list is only assigned in the
# "Extract and Save Generated Reviews" section further down, so that section
# has to be run before this cell.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("artificial_train_data.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(generated_training_review_list)

In [None]:
# Save the generated test reviews, one review per CSV row.
# NOTE(review): generated_test_review_list is only assigned in the
# "Extract and Save Generated Reviews" section further down, so that section
# has to be run before this cell.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("artificial_test_data.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(generated_test_review_list)

In [5]:
# Save the attack training reviews, one review per CSV row.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("split01_train_data_01.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(attack_training_review_list)

In [6]:
# Save the attack test reviews, one review per CSV row.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("split01_test_data_01.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(attack_test_review_list)

In [7]:
# Save the defense training reviews, one review per CSV row.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("split01_train_data_02.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(defense_training_review_list)

In [8]:
# Save the defense test reviews, one review per CSV row.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("split01_test_data_02.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(defense_test_review_list)

In [9]:
# Save the GAN training reviews, one review per CSV row.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("split01_train_data_03.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(gan_training_review_list)

In [10]:
# Save the GAN test reviews, one review per CSV row.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("split01_test_data_03.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(gan_test_review_list)

In [12]:
#"<EOR>" in generated_reviews[95]

True

In [13]:
#generated_reviews[95].find('<EOR>')

43

## Extract and Save Generated Reviews

In [3]:
# JSON log export that the following cells load and scan for generated reviews.
filename = "baseline_attack_single_complex_model_l_05.json"

In [4]:
# Parse the exported log file into Python objects.
with open(filename) as log_file:
    all_logs = json.load(log_file)

In [5]:
# Collect every log message that contains the "SOR" marker; entries without
# a jsonPayload or without a message inside it are ignored.
generated_reviews = []
for entry in all_logs:
    if 'jsonPayload' not in entry:
        continue
    body = entry['jsonPayload']
    if 'message' not in body:
        continue
    text = body['message']
    if "SOR" in text:
        generated_reviews.append(text)

In [6]:
# Drop the first five characters of each message (the length of "<SOR>") and
# keep only the text before the first "<EOR>" marker, if there is one.
clipped_review_list = [
    message[5:].partition('<EOR>')[0]
    for message in generated_reviews
]

In [7]:
# Turn each clipped review into ['<SOR>', c1, c2, ..., '<EOR>'] using the same
# cleaning as the real reviews: lower-case, then drop every backslash and the
# character that directly follows it.  No minimum-length filter is applied.
generated_review_list = []
for review_text in clipped_review_list:
    lowered = review_text.lower()
    # Pair each character with its predecessor ('' for the first one).
    kept_chars = [
        current
        for previous, current in zip([''] + list(lowered), lowered)
        if current != '\\' and previous != '\\'
    ]
    generated_review_list.append(['<SOR>'] + kept_chars + ['<EOR>'])

In [9]:
# Shuffle the generated reviews in place, then draw the train / test samples.
np.random.shuffle(generated_review_list)
generated_training_review_list, generated_test_review_list = split_train_test(
    generated_review_list,
    training_samples=32000,
    test_samples=8000,
)

number of training characters 7757192
number of test characters 1942884


In [10]:
# Save the sampled generated training reviews, one review per CSV row.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("gen01_train_data_01.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(generated_training_review_list)

In [11]:
# Save the sampled generated test reviews, one review per CSV row.
# newline='' lets the csv module control line endings itself, so the
# requested '\n' terminator is written unchanged on every platform.
with open("gen01_test_data_01.csv", "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(generated_test_review_list)

In [8]:
# Spot-check a single clipped review (index chosen arbitrarily).
clipped_review_list[3455]

"from far longer than a carter/girl in a location, but their prices are huge and the service was great yet fairly close. you can't find anyone that knows her downstairs and looks also amazing.  the espresso tastes fresh and crispy and cooked perfectly. one chance it's in my leftover, bly claims the i"

## Read in Processed and Split Datasets

In [25]:
# Read the attack training CSV back and flatten all rows into one long list
# of tokens.
# NOTE(review): no visible cell writes "attack_train_data.csv" -- confirm this
# is the intended file name.
# newline='' is what the csv module expects, so quoted fields that contain
# line breaks are parsed correctly.
with open("attack_train_data.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    new_training_review_list = [item for sublist in reader for item in sublist]

In [26]:
# Peek at 50 consecutive tokens to confirm the data round-tripped as single characters.
new_training_review_list[100:150]

['e',
 'd',
 ']',
 ' ',
 'y',
 'e',
 'a',
 'r',
 '-',
 'o',
 'l',
 'd',
 ' ',
 'm',
 'e',
 ' ',
 'a',
 'n',
 'd',
 ' ',
 'p',
 'u',
 'n',
 'c',
 'h',
 ' ',
 'm',
 'e',
 ' ',
 's',
 'q',
 'u',
 'a',
 'r',
 'e',
 'l',
 'y',
 ' ',
 'i',
 'n',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'n',
 'a',
 'd',
 's',
 ' ']

In [27]:
# Read the generated training CSV back and flatten all rows into one long
# list of tokens.
# newline='' is what the csv module expects, so quoted fields that contain
# line breaks are parsed correctly.
with open("artificial_train_data.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    new_artificial_review_list = [item for sublist in reader for item in sublist]

In [28]:
# Peek at 50 consecutive tokens of the generated data.
new_artificial_review_list[100:150]

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'o',
 'n',
 ' ',
 'i',
 't',
 's',
 ' ',
 'a',
 'c',
 't',
 'y',
 ',',
 ' ',
 'm',
 'a',
 'k',
 'e',
 's',
 ' ',
 'y',
 'o',
 'u',
 ' ',
 'f',
 'e',
 'e',
 'l',
 ' ',
 'v',
 'e',
 'r',
 'y',
 ' ',
 'a',
 't',
 't',
 'e',
 'n',
 't',
 'i',
 'v',
 'e',
 ',',
 ' ']

## Build Character "Vocabulary"

In [30]:
# Build one shared vocabulary over the real and the generated tokens, then
# encode both token lists with it.
char_datasets = [new_training_review_list, new_artificial_review_list]
words_to_ids, ids_to_words = make_vocabulary(char_datasets)
attack_train_ids, artificial_train_ids = (
    convert_to_ids(words_to_ids, tokens) for tokens in char_datasets
)

In [31]:
# Model hyper-parameters; V and softmax_ns are both set to the vocabulary size.
vocab_size = len(words_to_ids)
model_params = {
    'V': vocab_size,
    'H': 1024,
    'softmax_ns': vocab_size,
    'num_layers': 2,
}

In [32]:
# Display the hyper-parameters for a quick sanity check.
model_params

{'H': 1024, 'V': 68, 'num_layers': 2, 'softmax_ns': 68}

In [None]:
#trained_filename = run_training(train_ids, test_ids, tf_savedir = "/tmp/artificial_hotel_reviews/a4_model", model_params=model_params, max_time=150, batch_size=256, learning_rate=0.002, num_epochs=20)

In [None]:
#trained_filename = run_training(train_ids, test_ids, tf_savedir = "/tmp/artificial_hotel_reviews/a4_model", model_params=model_params, max_time=150, batch_size=256, learning_rate=0.002, num_epochs=1)

In [18]:
# Flatten the clipped review strings into one list of single characters.
generated_review_char_list = list(''.join(clipped_review_list))

In [19]:
# Wrap the characters in a Series so pandas can summarize them.
generated_chars = pd.Series(generated_review_char_list)

In [20]:
# Summary of the generated characters: count, number of unique values, most frequent value and its frequency.
generated_chars.describe()

count     13070
unique       56
top            
freq       2376
dtype: object

In [21]:
# List every generated message that occurs more than once (empty list = no duplicates).
review_counts = collections.Counter(generated_reviews)
duplicate_reviews = [text for text, occurrences in review_counts.items() if occurrences > 1]
print(duplicate_reviews)

[]


In [22]:
# (character, occurrences) pairs for every distinct generated character.
char_counts = collections.Counter(generated_review_char_list).items()

In [23]:
# Print one (character, occurrences) tuple per line.
for item in char_counts:
    print(item)

('o', 762)
('f', 286)
(' ', 2376)
('y', 253)
('u', 288)
('r', 602)
('b', 156)
('w', 252)
('s', 646)
('e', 1294)
(',', 86)
('a', 913)
('t', 934)
('l', 430)
('h', 514)
('i', 686)
('n', 624)
('k', 97)
('g', 229)
('c', 283)
("'", 34)
('d', 400)
('.', 147)
('p', 210)
('v', 112)
('m', 235)
('!', 63)
('&', 2)
('z', 21)
('j', 20)
('q', 13)
('-', 15)
('(', 14)
(')', 7)
('"', 2)
('=', 1)
('x', 13)
(';', 1)
(':', 3)
('1', 8)
('8', 3)
('4', 1)
('5', 4)
('2', 3)
('*', 2)
('7', 2)
('+', 1)
('?', 3)
('{', 1)
('$', 3)
('#', 1)
('0', 6)
('6', 1)
('3', 4)
('9', 2)
('/', 1)
