## Parse and translate LSARS data ##

In [20]:
import json
import pandas as pd
from tqdm import tqdm
from itertools import islice

from google.cloud import translate_v2 as translate

In [2]:
data_path = "../data/chinese_data/"
train_data_path = data_path + "train.json"
test_data_path = data_path + "test.json"

#deepl_api_key = "f6b7310c-7474-a87e-a191-27bc9763eb6c:fx"

### Processing test dataset with free API ###

In [3]:
test_jsons = []

with open(test_data_path, 'r') as fp:
  for line in fp:
      test_jsons.append(json.loads(line))

In [8]:
def join_tokens(token_list):
    return "".join(token_list)

def process_json(json):
    item_id = json["item_id"]
    
    summary = join_tokens(json["hq_tokens"])
    reviews = json["lq_tokens_list"]
    review_str = ""
    reviews = [join_tokens(review) for review in reviews]

    translated_reviews = GoogleTranslator('zh-CN', 'en').translate_batch(reviews)
    translated_summary = GoogleTranslator(source='chinese (simplified)', target='en').translate(text=summary)

    return item_id, translated_reviews, translated_summary

In [22]:
ids, reviews, summaries = [], [], []

for json in tqdm(test_jsons[0:100]):

    item_id, translated_reviews, translated_summary = process_json(json)
    ids.append(item_id)
    reviews.append(translated_reviews)
    summaries.append(translated_summary)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [49:24<00:00, 29.65s/it]


In [28]:
result_df = pd.DataFrame({"id": ids, "reviews": reviews, "summary": summaries})
result_df["review_str"] = result_df["reviews"].apply(lambda x: "|||||".join(x))
result_df = result_df[["id", "summary", "review_str"]]

result_df.to_csv("../data/translated_data/test_trans_0-100.csv", index=False)

result_df

Unnamed: 0,id,summary,review_str
0,19102166197,The actual baby is the same as the picture. Th...,The baby was received. I really didn’t expect ...
1,529618039778,The pants are of good quality. I bought them a...,They are perfect. It is very difficult for a s...
2,535894028777,"165100 pounds, size M is just right for wearin...",I gave them to a friend's child. They are nice...
3,539394568874,"It's the same as tailor-made, I'm 158, the cus...",My first reaction after receiving it and weari...
4,540381936384,"The quality is very good, the actual product i...","Very beautiful, just like the description, the..."
...,...,...,...
95,560272505619,I like the clothes very much. The quality is v...,I like the upper body very much. I prefer a sl...
96,560275253291,"The price is very worth it, the fabric is very...",These pants are really good-looking. I was wor...
97,560617877003,The color is very beautiful. The more I look a...,"The clothes are very nice, and I look pretty w..."
98,560643173236,Very satisfied. I am a dark person and this co...,This color is called skin!|||||I put the cloth...


### Processing training data with paid API ###

In [28]:
translate_client = translate.Client()

def translate_json(json):
    item_id = json["item_id"]
    
    summary = join_tokens(json["hq_tokens"])
    reviews = json["lq_tokens_list"]
    review_str = ""
    reviews = [join_tokens(review) for review in reviews]

    translated_summary = translate_client.translate(summary, target_language="en")["translatedText"]
    translated_reviews = translate_client.translate(reviews, target_language="en")
    translated_reviews = [review["translatedText"] for review in translated_reviews]

    return item_id, translated_reviews, translated_summary

In [25]:
with open(train_data_path, 'r') as fp:
    json_lines = islice(fp, 500)
    train_jsons = [json.loads(line) for line in json_lines]

In [29]:
ids, reviews, summaries = [], [], []

for json in tqdm(train_jsons):

    item_id, translated_reviews, translated_summary = translate_json(json)
    ids.append(item_id)
    reviews.append(translated_reviews)
    summaries.append(translated_summary)

result_df = pd.DataFrame({"id": ids, "reviews": reviews, "summary": summaries})
result_df["review_str"] = result_df["reviews"].apply(lambda x: "|||||".join(x))
result_df = result_df[["id", "summary", "review_str"]]

result_df

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [04:14<00:00,  1.97it/s]


Unnamed: 0,id,summary,review_str
0,17260910515,The leather pants have a good texture and good...,Not much flexibility. Everything else is good|...
1,18836261659,I really like the clothes from this store. The...,The clothes are very soft and comfortable to w...
2,41417558656,I received the jacket. It looks very elegant w...,The quality is really good. I have always want...
3,41664011260,The baby has been received. After comparing ma...,"The color is a bit dark, but otherwise it’s fi..."
4,42169031576,A great Taobao purchase. It was cleared out. T...,"Beautiful ~ very fine velvet, very good workma..."
...,...,...,...
495,560033134023,The beach skirt I bought for my wife looks gre...,The service is pretty good but it&#39;s too bi...
496,560039740042,"The logistics is super fast, the fabric is sof...","Not bad, looks beautiful on it, a satisfying p..."
497,560041125184,The bottoming skirt is really a must-have for ...,There is a small tear in the seam of the skirt...
498,560044546792,"The clothes are medium thick and warm, not too...",Very warm and super good-looking. The XL I bou...


In [30]:
result_df.to_csv("../data/translated_data/train_trans_0-500.csv", index=False)