# Data Augmentation Translation

## Imports

In [1]:
import pandas as pd
import os
import uuid
import requests
from tqdm import tqdm
import time
from dotenv import load_dotenv

In [2]:
tqdm.pandas()

## Azure AI Translation

In [3]:
load_dotenv()

True

In [4]:
key = os.getenv("API_AZURE_KEY")
location = os.getenv("API_AZURE_LOCATION")
endpoint = "https://api.cognitive.microsofttranslator.com"
path = "/translate"

In [5]:
constructed_url = endpoint + path

headers = {
    'Ocp-Apim-Subscription-Key': key, 
    'Ocp-Apim-Subscription-Region': location, 
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}

In [6]:
def translate_text(text, from_lang, to_lang, max_retries=5):
    if not text:
        return None

    params = {
        'api-version': '3.0',
        'from': from_lang,
        'to': [to_lang]
    }
    body = [{'text': text}]
    retries = 0

    while retries <= max_retries:
        try:
            response = requests.post(constructed_url, params=params, headers=headers, json=body)
            response.raise_for_status()  
            translated_text = response.json()[0]['translations'][0]['text']
            return translated_text
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429: 
                retries += 1
                wait_time = 2 ** retries 
                time.sleep(wait_time)
            else:
                print(f"HTTP Error: {e}")
                break
        except Exception as e:
            print(f"Error translating text: {e}")
            break
            
    return None

In [18]:
def compare_translations(df, index):
    
    label = df.loc[index, 'rating']
    original_text = df.loc[index, 'reviewText'] 
    german_text = df.loc[index, 'reviewText_german']
    german_french_text = df.loc[index, 'reviewText_german_french']
    final_text = df.loc[index, 'reviewText_german_french_english'] 
    
    print(f"Index: {index}")
    print(f"****")
    print(f"Label: {label}")
    print(f"****")
    print(f"Original Text: {original_text}")
    print(f"****")
    print(f"German Translation: {german_text}")
    print(f"****")
    print(f"German-French Translation: {german_french_text}")
    print(f"****")
    print(f"Back-Translated Text: {final_text}")
    print("")

## Translate data

In [9]:
df = pd.read_csv("./data/combined_reviews.csv")

In [10]:
df.head(2)

Unnamed: 0,rating,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,reviewToken
0,5.0,2017-01-16,ASWLL1VJA7WOG,Great product... just what I wanted. Works gr...,Five Stars,1484524800,All_Beauty,"['great', 'product', 'want', 'works', 'great',..."
1,5.0,2008-12-08,A265K3A7V83112,"After seeing the popularity of this shoe, I de...",What can i say? chucks rock,1228694400,Clothing_Shoes_and_Jewelry,"['see', 'popularity', 'shoe', 'decide', 'test'..."


In [12]:
df['reviewText_german'] = df.progress_apply(
    lambda row: translate_text(row['reviewText'], from_lang='en', to_lang='de'), axis=1)

  1%|          | 69/12000 [00:14<37:15,  5.34it/s] 

Error translating text: Out of range float values are not JSON compliant


  1%|          | 118/12000 [00:23<20:58,  9.44it/s]

Error translating text: Out of range float values are not JSON compliant


  6%|▌         | 734/12000 [02:15<29:57,  6.27it/s]  

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149b60c70>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▌         | 737/12000 [02:16<30:03,  6.24it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149c8f7c0>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▌         | 744/12000 [02:16<24:39,  7.61it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149cb2340>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▌         | 749/12000 [02:17<22:55,  8.18it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149cb2910>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▋         | 754/12000 [02:18<23:58,  7.82it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149cb8520>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▋         | 757/12000 [02:18<26:42,  7.01it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149cb2d60>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▋         | 762/12000 [02:19<26:36,  7.04it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149c8f640>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▋         | 767/12000 [02:21<1:02:10,  3.01it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149c8f6a0>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▋         | 774/12000 [02:21<29:40,  6.30it/s]  

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149aea8e0>: Failed to establish a new connection: [Errno 61] Connection refused'))


  6%|▋         | 779/12000 [02:22<22:04,  8.47it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149cb8a00>: Failed to establish a new connection: [Errno 61] Connection refused'))


  7%|▋         | 784/12000 [02:23<23:15,  8.04it/s]

Error translating text: HTTPSConnectionPool(host='api.cognitive.microsofttranslator.com', port=443): Max retries exceeded with url: /translate?api-version=3.0&from=en&to=de (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x149cb8d60>: Failed to establish a new connection: [Errno 61] Connection refused'))


 15%|█▍        | 1746/12000 [05:07<21:21,  8.00it/s]  

Error translating text: Out of range float values are not JSON compliant


 15%|█▍        | 1767/12000 [05:11<21:07,  8.08it/s]

Error translating text: Out of range float values are not JSON compliant


 19%|█▊        | 2232/12000 [06:32<25:43,  6.33it/s]

Error translating text: Out of range float values are not JSON compliant


 19%|█▊        | 2238/12000 [06:33<30:56,  5.26it/s]

Error translating text: Out of range float values are not JSON compliant


 21%|██▏       | 2552/12000 [07:26<18:14,  8.63it/s]

Error translating text: Out of range float values are not JSON compliant


 24%|██▍       | 2854/12000 [08:13<22:10,  6.87it/s]

Error translating text: Out of range float values are not JSON compliant


 26%|██▋       | 3162/12000 [09:07<18:34,  7.93it/s]  

Error translating text: Out of range float values are not JSON compliant


 28%|██▊       | 3417/12000 [09:46<15:30,  9.22it/s]

Error translating text: Out of range float values are not JSON compliant


 36%|███▌      | 4317/12000 [12:09<15:33,  8.23it/s]

Error translating text: Out of range float values are not JSON compliant


 36%|███▋      | 4365/12000 [12:16<14:20,  8.88it/s]

Error translating text: Out of range float values are not JSON compliant


 37%|███▋      | 4387/12000 [12:18<13:54,  9.12it/s]

Error translating text: Out of range float values are not JSON compliant


 40%|███▉      | 4745/12000 [13:16<14:42,  8.22it/s]

Error translating text: Out of range float values are not JSON compliant


 41%|████      | 4904/12000 [13:40<16:13,  7.29it/s]

Error translating text: Out of range float values are not JSON compliant


 50%|████▉     | 5989/12000 [16:24<14:35,  6.87it/s]

Error translating text: Out of range float values are not JSON compliant


 54%|█████▍    | 6478/12000 [17:37<12:14,  7.52it/s]

Error translating text: Out of range float values are not JSON compliant


 55%|█████▍    | 6564/12000 [17:51<13:02,  6.95it/s]

Error translating text: Out of range float values are not JSON compliant


 56%|█████▌    | 6669/12000 [18:07<10:10,  8.73it/s]

Error translating text: Out of range float values are not JSON compliant


 73%|███████▎  | 8815/12000 [23:45<05:42,  9.31it/s]

Error translating text: Out of range float values are not JSON compliant


 86%|████████▋ | 10350/12000 [27:55<03:27,  7.95it/s]

Error translating text: Out of range float values are not JSON compliant


 90%|████████▉ | 10773/12000 [29:01<02:38,  7.74it/s]

Error translating text: Out of range float values are not JSON compliant


 91%|█████████ | 10883/12000 [29:17<01:45, 10.62it/s]

Error translating text: Out of range float values are not JSON compliant


 96%|█████████▌| 11517/12000 [30:56<00:50,  9.58it/s]

Error translating text: Out of range float values are not JSON compliant


100%|█████████▉| 11964/12000 [32:06<00:03,  9.95it/s]

Error translating text: Out of range float values are not JSON compliant


100%|██████████| 12000/12000 [32:12<00:00,  6.21it/s]


In [13]:
df['reviewText_german_french'] = df.progress_apply(
    lambda row: translate_text(row['reviewText_german'], from_lang='de', to_lang='fr'), axis=1)

100%|██████████| 12000/12000 [42:50<00:00,  4.67it/s] 


In [14]:
df['reviewText_german_french_english'] = df.progress_apply(
    lambda row: translate_text(row['reviewText_german_french'], from_lang='fr', to_lang='en'), axis=1)

100%|██████████| 12000/12000 [31:02<00:00,  6.44it/s] 


## Check translations

In [19]:
compare_translations(df,2)

Index: 2
****
Label: 5.0
****
Original Text: I was nervousness about the scent because IVe never tried this, but I love Paul Mitchell so I decided to try it.  It smells great!!!  This is a medium hold spray, so I'll use it when my hair is down (I use a stronger hold when I put my hair up).  Shipping speed was good to Alaska too.
****
German Translation: Ich war nervös wegen des Duftes, weil ich ihn noch nie probiert habe, aber ich liebe Paul Mitchell, also beschloss ich, ihn auszuprobieren.  Es riecht gut!!  Dies ist ein Spray mit mittlerem Halt, daher verwende ich es, wenn mein Haar offen ist (ich verwende einen stärkeren Halt, wenn ich mein Haar hochstecke).  Die Versandgeschwindigkeit nach Alaska war auch gut.
****
German-French Translation: J’étais nerveuse à propos de l’odeur parce que je ne l’avais jamais essayé auparavant, mais j’adore Paul Mitchell, alors j’ai décidé de l’essayer.  Ça sent bon !!  Il s’agit d’un spray à tenue moyenne, donc je l’utilise quand mes cheveux sont ou

## Save new df

In [42]:
output_path = './data/reviews_da_translation_wide.csv'
df.to_csv(output_path, index=False)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   rating                            12000 non-null  float64
 1   reviewTime                        12000 non-null  object 
 2   reviewerID                        12000 non-null  object 
 3   reviewText                        11975 non-null  object 
 4   summary                           11991 non-null  object 
 5   unixReviewTime                    12000 non-null  int64  
 6   category                          12000 non-null  object 
 7   reviewToken                       12000 non-null  object 
 8   reviewText_german                 11964 non-null  object 
 9   reviewText_german_french          11964 non-null  object 
 10  reviewText_german_french_english  11964 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.0+ MB


In [38]:
long_df = pd.melt(
    df,
    id_vars=['rating'],
    value_vars=['reviewText', 'reviewText_german_french_english'],
    var_name='text_type',
    value_name='reviewText_long' 
)

In [39]:
long_df.rename(columns={'reviewText_long': 'reviewText'}, inplace=True)

In [41]:
output_path = './data/reviews_da_translation_long.csv'
long_df.to_csv(output_path, index=False)