In [1]:
import os
# Move the working directory up one level. Later cells rely on this: they use
# paths relative to it, such as 'data/english.csv'.
os.chdir('..')

Because our dataset is in Chinese, we create an English-translated subset with the Google Translate API to make it easier to discuss the data with team members and instructors.

If you want to run the following code yourself, first install the Python client library and set up credentials by following the [Cloud Translation client library quickstart](https://cloud.google.com/translate/docs/quickstart-client-libraries#client-libraries-install-python).

In [2]:
from config import valid_data_path, train_data_path, testa_data_path
from fgclassifier import read_csv
from sklearn.externals import joblib

# Load the raw training set from the path configured in `config`.
df_train_raw = read_csv(train_data_path)

2018-12-05 02:01:38,903 [INFO] Reading /opt/storage/train/sentiment_analysis_trainingset.csv..


In [3]:
from config import data_root 

# Translations are cached on disk (keyed by record index) so the Translate API
# is only called for records that have not been translated yet.
cache_path = f'{data_root}/train/en.pkl'
try:
    translations = joblib.load(cache_path)
    print(len(translations))
except FileNotFoundError:
    # No cache yet (first run): start from scratch. Any other failure, e.g. a
    # corrupted cache file or an interrupt, is allowed to surface instead of
    # silently discarding the translations collected so far.
    translations = {}

10015


In [4]:
import glob
import time
from IPython.display import clear_output

from tqdm import tqdm
from google.cloud import translate
from sklearn.externals.joblib import Parallel, delayed

# All available credentials. glob returns matches in arbitrary order, so sort
# them to keep the client numbering stable between runs.
credentials = sorted(glob.glob('../data/google-cloud/*.json'))

# Use multiple credentials to bypass rate limit
clients = []
for credential in credentials:
    print(credential)
    clients.append(translate.Client.from_service_account_json(credential))

# Only the first 12,000 records are translated. Slice first, then copy, so that
# just those rows are duplicated and `df` is independent of `df_train_raw`.
df = df_train_raw.iloc[0:12000, :].copy()
# The raw text is wrapped in double quotes; strip them before translating.
contents = [x.strip('"') for x in df['content']]
n_client = len(clients)
n_records = df.shape[0]

# client_ok[k] is set to False once client k has raised an error.
client_ok = [True] * n_client

def get_client(i):
    """Pick a usable client for record `i`, round-robin over `clients`.

    Starts at slot `i % n_client` and walks forward (wrapping around) until a
    slot whose `client_ok` flag is still True is found.

    Returns a `(slot, client)` tuple; `client` is None when every slot has
    been flagged as failed.
    """
    slot = i
    for _ in range(n_client):
        if client_ok[slot % n_client]:
            return slot % n_client, clients[slot % n_client]
        slot += 1
    # Every client has been tried and none is usable.
    return slot % n_client, None

import warnings
from collections import deque

# Minimum number of seconds each request is stretched to, to throttle the
# request rate.
MIN_REQUEST_INTERVAL = 0.5

# (record index, client index, exception) for every request that raised.
failed = []

clear_output()
pbar = tqdm(total=n_records)
# deque gives O(1) pops from the front; failed records are re-queued at the back.
queue = deque(range(n_records))
n_failed = 0

# Stop when everything is processed or when every client has failed once
# (a failed client is flagged in `client_ok` and never used again).
while queue and n_failed < n_client:
    i = queue.popleft()
    if i not in translations:  # skip records already present in the cache
        start_time = time.monotonic()
        client_idx, client = get_client(i)
        if not client:
            raise RuntimeError('No Available Client.')
        try:
            translation = client.translate(contents[i],
                target_language='en', source_language='zh')
            translations[i] = translation['translatedText']
        except Exception as e:
            # Remember why the request failed, retire this client and retry
            # the record later with another one.
            failed.append((i, client_idx, e))
            client_ok[client_idx] = False
            queue.append(i)
            n_failed += 1
            continue
        # If the request finished in under MIN_REQUEST_INTERVAL seconds, wait
        # for the remainder.
        elapsed = time.monotonic() - start_time
        if elapsed < MIN_REQUEST_INTERVAL:
            time.sleep(MIN_REQUEST_INTERVAL - elapsed)
    pbar.update(1)
pbar.close()

# The loop also ends when all clients have failed; make that visible instead
# of silently leaving records untranslated.
n_missing = sum(1 for i in range(n_records) if i not in translations)
if n_missing:
    warnings.warn(
        f'{n_missing} of {n_records} records are still untranslated; '
        f'{len(failed)} request(s) failed (inspect `failed` for details).')

100%|██████████| 12000/12000 [39:01<00:00,  1.03s/it]   

In [5]:
# Persist the translations gathered so far so that a later run can load them
# from `cache_path` instead of requesting them again.
joblib.dump(translations, cache_path)

['/opt/storage/train/en.pkl']

In [7]:
import pandas as pd

import html

# Replace content with the translations, looked up by row position (the key
# under which each translation was stored). The translated text contains HTML
# entities such as `&#39;` for an apostrophe, so decode all of them rather
# than only the apostrophe.
df['content'] = [html.unescape(translations[i]) for i in range(len(df))]

Sanity Check

In [9]:
import numpy as np

# Spot check: pick one random record among the first 10,000 and show its
# original text followed by the English translation.
idx = np.random.randint(10000)
print(df_train_raw['content'][idx], df['content'][idx], sep='\n')

"夏天散步时曾经路过，看到里面很是热闹，进去吃过一次，感觉还不错。今天再散步到此，排队、拿托盘、慢走点菜，菜式非常丰富，好几种鱼、鸡肉、红烧肉、小排骨、炒猪肝、各种蔬菜、羹、汤、、、盛米饭的碗不大，说可以添，问老板为何碗小饭少，曰浪费粮食是可耻的，让食客自己添加不要浪费☺一楼坐不下的客人，可以去二楼，上面面积也挺大的。观察一会，估计旁边居民为多，大概都是一家三口为省事，几人团聚免烧菜的格调各种菜肴价格不贵，味道也不错，就是有人抽烟无人管（虽然到处写着不许抽烟）"
I used to pass by during the summer walk. I saw that it was very lively. I went in and ate once and it felt pretty good. Today, I will take a walk here, line up, take the tray, and slowly go to order. The dishes are very rich. There are several kinds of fish, chicken, braised pork, small ribs, fried pork liver, various vegetables, clams, soup, and bowls of rice. Not big, said that you can add, ask the boss why the bowl is small, it is shameful to waste food, let the diners add themselves and don't waste the guests who can't sit on the first floor, you can go to the second floor, the area is quite big. After observing for a while, it is estimated that there are many residents nearby. It is probably a family of three to save trouble. The style of several peo

In [10]:
import csv

# Save the translated subset: the sample whose `content` column now holds the
# English text obtained from Google Translate. The index is not written, and
# QUOTE_NONNUMERIC makes the writer quote every non-numeric field.
df.to_csv('data/english.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

Split into a training set and a hold-out validation set.

In [11]:
import csv
from fgclassifier.utils import read_csv

df = read_csv('data/english.csv')

# The first 10,000 rows are divided 80/20: a seeded random sample becomes the
# training set, and the rows that were not sampled become the validation set.
df_train = df[:10000].sample(frac=0.8, random_state=42)
df_valid = df[:10000].drop(index=df_train.index)

# Write both parts with the same CSV settings used for data/english.csv.
for _part, _path in ((df_train, 'data/english_train.csv'),
                     (df_valid, 'data/english_valid.csv')):
    _part.to_csv(_path, index=False, quoting=csv.QUOTE_NONNUMERIC)

2018-12-05 02:43:03,992 [INFO] Reading data/english.csv..


In [14]:
# Everything after the first 10,000 rows (the part not used for the
# train/validation split) is written out as the English test set.
df[10000:].to_csv('data/english_test.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)