Because our dataset is in Chinese, we create an English-translated subset with the Google Translate API to make it easier to discuss the data with team members and instructors.

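If you want to run the following code yourself, follow the client-library setup instructions [here](https://cloud.google.com/translate/docs/quickstart-client-libraries#client-libraries-install-python), then place one or more service-account JSON key files in `./misc/google-cloud/`, which is where the code below looks for credentials.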

In [None]:
from config import valid_data_path, train_data_path, testa_data_path
from fgclassifier import read_csv
from sklearn.externals import joblib

# Raw training data as loaded from `train_data_path` (the Chinese-language
# source data, per the notebook introduction).
df_train_raw = read_csv(train_data_path)

In [None]:
# Resume from previously cached translations when available, so records that
# are already in the cache are not sent to the API again.
try:
    translations = joblib.load('data/train/en.pkl')
except FileNotFoundError:
    # No cache yet: start from scratch. Any other failure (e.g. an unreadable
    # cache file) is allowed to surface instead of silently restarting from
    # an empty dict and later overwriting the cache.
    translations = {}

In [None]:
import glob
import time
from IPython.display import clear_output

from tqdm import tqdm
from google.cloud import translate
from sklearn.externals.joblib import Parallel, delayed

# Service-account key files; one translation client is created per file.
credentials = glob.glob('./misc/google-cloud/*.json')

# Use multiple credentials to bypass rate limit
clients = []
for credential in credentials:
    print(credential)
    clients.append(translate.Client.from_service_account_json(credential))

# Translate only the first 10,000 training records. The source frame is
# `df_train_raw` (the frame loaded from `train_data_path`); slicing before
# copying avoids duplicating the whole training set.
df = df_train_raw.iloc[0:10000, :].copy()
# Strip leading/trailing double quotes from each text before translating.
contents = [x.strip('"') for x in df['content']]
n_client = len(clients)
n_records = df.shape[0]

# client_ok[k] is set to False once clients[k] has failed a request.
client_ok = [True] * n_client


def get_client(i):
    """Pick a usable translation client, starting the search at slot ``i``.

    Slots are tried in round-robin order beginning at ``i % n_client``; the
    first slot whose ``client_ok`` flag is set wins.

    Returns a ``(slot, client)`` tuple. When every client is flagged as
    failed, ``client`` is ``None`` and ``slot`` is ``i % n_client``.
    """
    for offset in range(n_client):
        slot = (i + offset) % n_client
        if client_ok[slot]:
            return slot, clients[slot]
    # A full lap found no healthy client.
    return i % n_client, None

from collections import deque

# One (record index, client index, exception) entry per failed request, so
# the reason a client was retired can be inspected after the run.
failed = []

clear_output()
# Work queue of record positions; a deque pops from the left in O(1).
queue = deque(range(n_records))
n_failed = 0
# Minimum number of seconds spent on each request (simple rate limiting).
min_interval = 0.5

# The context manager closes the progress bar even if the loop raises.
with tqdm(total=n_records) as pbar:
    # Each failure retires exactly one client, so n_failed == n_client means
    # no usable client is left; stop and keep what has been translated so far.
    while queue and n_failed < n_client:
        i = queue.popleft()
        if i not in translations:
            start_time = time.time()
            client_idx, client = get_client(i)
            if not client:
                raise RuntimeError('No Available Client.')
            try:
                translation = client.translate(
                    contents[i], target_language='en', source_language='zh')
                translations[i] = translation['translatedText']
            except Exception as e:
                # Record the error, retire this client and retry the record
                # later with another client; no progress is counted.
                failed.append((i, client_idx, e))
                client_ok[client_idx] = False
                queue.append(i)
                n_failed += 1
                continue
            # If the request finished in under half a second, wait out the
            # remainder before sending the next one.
            elapsed = time.time() - start_time
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
        # Counts both freshly translated and already-cached records.
        pbar.update(1)

In [None]:
# Persist the translation cache so a later run can pick up where this one
# stopped.
joblib.dump(value=translations, filename='data/train/en.pkl')

In [None]:
import html

# Replace each text with its translation. `translations` is keyed by row
# position, so it is read in positional order; a record that has not been
# translated yet raises KeyError here rather than a length mismatch.
# html.unescape decodes the HTML entities in the returned text
# (e.g. &#39; -> ').
df['content'] = [html.unescape(translations[i]) for i in range(len(df))]

Sanity Check

In [None]:
import numpy as np

# Print one random record: the original text first, then its translation.
# The upper bound follows the actual number of translated rows, and rows are
# looked up by position so the two frames line up regardless of index labels.
idx = np.random.randint(0, len(df))
print(df_train_raw['content'].iloc[idx])
print(df['content'].iloc[idx])

Save

In [None]:
import csv

# Write the Google-translated English sample to disk, quoting every
# non-numeric field and leaving out the index column.
df.to_csv(
    'data/english.csv',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)

Split to a training set and a hold-out validation set.

In [6]:
import csv
from fgclassifier.utils import read_csv

# Reload the translated sample, then split it 80/20 with a fixed seed:
# the sampled rows form the training set, the remaining rows the hold-out set.
df = read_csv('data/english.csv')
df_train = df.sample(frac=0.8, random_state=42)
df_valid = df.drop(index=df_train.index)

# Write both parts with the same CSV settings.
for frame, path in (
    (df_train, 'data/english_train.csv'),
    (df_valid, 'data/english_valid.csv'),
):
    frame.to_csv(path, index=False, quoting=csv.QUOTE_NONNUMERIC)

2018-11-25 15:54:43,093 [INFO] Reading data/english.csv..
