## Parse and translate LSARS data ##

In [10]:
import json
import pandas as pd
from tqdm import tqdm
from itertools import islice
import gc

from google.cloud import translate_v2 as translate

In [15]:
data_path = "../data/chinese_data/"
train_data_path = data_path + "train.json"
test_data_path = data_path + "test.json"

### Processing training data with paid API ###

In [4]:
translate_client = translate.Client()

def join_tokens(token_list):
    return "".join(token_list)

def translate_json(json):
    item_id = json["item_id"]
    
    summary = join_tokens(json["hq_tokens"])
    reviews = json["lq_tokens_list"]
    review_str = ""
    reviews = [join_tokens(review) for review in reviews]

    translated_summary = translate_client.translate(summary, target_language="en")["translatedText"]
    translated_reviews = translate_client.translate(reviews, target_language="en")
    translated_reviews = [review["translatedText"] for review in translated_reviews]

    return item_id, translated_reviews, translated_summary

In [12]:
with open(train_data_path, 'r') as fp:
    json_lines = islice(fp, 2000, 5000)
    train_jsons = [json.loads(line) for line in json_lines]

len(train_jsons)

3000

In [13]:
ids, reviews, summaries = [], [], []

for json in tqdm(train_jsons):

    item_id, translated_reviews, translated_summary = translate_json(json)
    ids.append(item_id)
    reviews.append(translated_reviews)
    summaries.append(translated_summary)

result_df = pd.DataFrame({"id": ids, "reviews": reviews, "summary": summaries})
result_df["review_str"] = result_df["reviews"].apply(lambda x: "|||||".join(x))
result_df = result_df[["id", "summary", "review_str"]]

result_df

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [23:52<00:00,  2.09it/s]


Unnamed: 0,id,summary,review_str
0,561854202615,"It looks very good, it pills a little, but the...","Not bad, not bad. It&#39;s a fine fabric and d..."
1,561859634230,The clothes have been received. The express de...,The clothes look good and fit well. The seller...
2,561867951687,The baby has been received. The upper body eff...,"The quality of clothes is not bad, not bad||||..."
3,561871433925,It&#39;s very warm and the customer service is...,"Overall it&#39;s very good, the warm color is ..."
4,561872533813,"The quality is very good, feel free to buy fro...",I really like the collar of the sweater. The c...
...,...,...,...
2995,567101327374,"Very beautiful, no color difference, the real ...",The original camera was taken casually in the ...
2996,25398872232,"The upper body effect is good, the fabric is c...",The fabric feels quite comfortable. I’ll revie...
2997,39653879471,This classic style is very summery. When paire...,I just tried it and I like it very much. The q...
2998,40484786894,There is no color difference when I receive th...,"I bought two pieces. The quality is the same, ..."


In [14]:
result_df.to_csv("../data/translated_data/train_trans_2000-5000.csv", index=False)

In [11]:
del result_df
gc.collect()

0