# Data Augmentation Translation

## Imports

In [None]:
import pandas as pd
import os
import uuid
import requests
from tqdm import tqdm
import time
from dotenv import load_dotenv

In [None]:
# Register tqdm's pandas integration; the translation cells further down call
# `progress_apply`, which is only available after this has run.
tqdm.pandas()

## Azure AI Translation

In [None]:
# Load environment variables via python-dotenv (presumably from a local .env
# file) so the os.getenv lookups for the Azure credentials can find them.
load_dotenv()

In [None]:
# Azure Translator credentials, read from the environment.
# Either value is None when its variable is not set.
key = os.environ.get("API_AZURE_KEY")
location = os.environ.get("API_AZURE_LOCATION")

# Base URL of the Translator REST API and the route of the translate operation.
endpoint, path = "https://api.cognitive.microsofttranslator.com", "/translate"

In [None]:
# Full URL of the translate operation (base endpoint followed by the route).
constructed_url = "".join((endpoint, path))

# Headers shared by the translation requests. Built key by key in the order
# they are sent; the trace id is generated once, when this cell runs.
headers = {}
headers['Ocp-Apim-Subscription-Key'] = key
headers['Ocp-Apim-Subscription-Region'] = location
headers['Content-type'] = 'application/json'
headers['X-ClientTraceId'] = str(uuid.uuid4())

In [None]:
def translate_text(text, from_lang, to_lang, max_retries=5):
    """Translate ``text`` through the Azure Translator ``/translate`` endpoint.

    Args:
        text: String to translate. Anything that is not a non-empty ``str``
            (``None``, ``""``, or the float ``NaN`` pandas uses for missing
            cells) is treated as "nothing to translate".
        from_lang: Source language code, sent as the ``from`` query parameter.
        to_lang: Target language code, sent as the ``to`` query parameter.
        max_retries: Number of extra attempts made after an HTTP 429
            response; at most ``max_retries + 1`` requests are sent.

    Returns:
        The translated string, or ``None`` when there is nothing to
        translate, the request fails, or the rate-limit retries run out.
        Failures are reported with ``print`` rather than raised.
    """
    # NaN is truthy, so a plain ``not text`` check would forward missing
    # pandas values to the API; only real, non-empty strings are sent.
    if not isinstance(text, str) or not text:
        return None

    params = {
        'api-version': '3.0',
        'from': from_lang,
        'to': [to_lang]
    }
    body = [{'text': text}]

    for attempt in range(max_retries + 1):
        # The trace id identifies a single request, so use a fresh one per
        # attempt instead of the value generated once for the shared headers.
        request_headers = {**headers, 'X-ClientTraceId': str(uuid.uuid4())}
        try:
            response = requests.post(
                constructed_url,
                params=params,
                headers=request_headers,
                json=body,
                timeout=30,  # never let one stalled connection hang the whole run
            )
            response.raise_for_status()
            return response.json()[0]['translations'][0]['text']
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code if e.response is not None else None
            if status_code != 429:
                print(f"HTTP Error: {e}")
                return None
            if attempt == max_retries:
                # Out of attempts: report it and skip the pointless final sleep.
                print(f"HTTP Error: rate limit retries exhausted: {e}")
                return None
            # Exponential backoff: 2s, 4s, 8s, ... between rate-limited attempts.
            time.sleep(2 ** (attempt + 1))
        except Exception as e:
            print(f"Error translating text: {e}")
            return None

    return None

In [None]:
def compare_translations(df, index):
    """Print one row's label, original text and each translation stage.

    Each translation stage is looked up under the legacy ``text_*`` column
    name first and then under the ``reviewText_*`` name, so the function
    works with frames using either naming scheme. A stage whose column is
    absent is shown as ``[column not found]`` instead of raising ``KeyError``.

    Args:
        df: DataFrame holding the review columns.
        index: Index label of the row to display (used with ``df.loc``).
    """
    # (display title, candidate column names in lookup order)
    fields = (
        ("Label", ("rating",)),
        ("Original Text", ("reviewText",)),
        ("German Translation", ("reviewText_german",)),
        ("German-English Translation",
         ("text_german_english", "reviewText_german_english")),
        ("German-French Translation",
         ("text_german_french", "reviewText_german_french")),
        ("Back-Translated Text",
         ("text_german_french_english", "reviewText_german_french_english")),
    )

    print(f"Index: {index}")
    for title, candidates in fields:
        column = next((name for name in candidates if name in df.columns), None)
        value = df.loc[index, column] if column is not None else "[column not found]"
        print("****")
        print(f"{title}: {value}")
    print("")

In [None]:
def select_and_compare(df, label, num_samples=5):
    """Print translation comparisons for a sample of rows with one rating.

    Rows whose ``rating`` equals ``label`` are sampled with a fixed seed
    (at most ``num_samples`` of them) and each sampled row is passed to
    ``compare_translations``.
    """
    matching_rows = df[df['rating'] == label]
    sample_size = min(len(matching_rows), num_samples)
    chosen = matching_rows.sample(n=sample_size, random_state=42)

    for row_label in chosen.index:
        compare_translations(df, row_label)

## Translate data

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive; the CSV paths
# used below live under that mount point. force_remount=True asks for a
# remount even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Read the combined reviews CSV from the mounted Drive folder into `df`.
folder_path = '/content/drive/MyDrive/'
file_name = 'combined_reviews.csv'
file_path = f"{folder_path}{file_name}"
df = pd.read_csv(file_path)

In [None]:
# Preview the first two rows to check the file loaded as expected.
df.head(2)

In [None]:
# English -> German.
# Apply over the 'reviewText' column itself: DataFrame.progress_apply without
# axis=1 hands the lambda whole columns, so row['reviewText'] would fail
# instead of reading one review per row.
df['reviewText_german'] = df['reviewText'].progress_apply(
    lambda review: translate_text(review, from_lang='en', to_lang='de'))

In [None]:
# German -> French.
# Apply over the German column itself (DataFrame.progress_apply without axis=1
# would pass columns, not rows). Only non-empty strings are translated: NaN is
# truthy, so a bare truthiness check would let missing values through.
df['reviewText_german_french'] = df['reviewText_german'].progress_apply(
    lambda german: translate_text(german, from_lang='de', to_lang='fr')
    if isinstance(german, str) and german else None)

In [None]:
# French -> English (the final back-translation).
# Apply over the French column itself (DataFrame.progress_apply without axis=1
# would pass columns, not rows). Only non-empty strings are translated: NaN is
# truthy, so a bare truthiness check would let missing values through.
df['reviewText_german_french_english'] = df['reviewText_german_french'].progress_apply(
    lambda french: translate_text(french, from_lang='fr', to_lang='en')
    if isinstance(french, str) and french else None)

## Check translations

In [None]:
# Spot-check the translation chain for the row with index label 2.
compare_translations(df,2)

## Save new df

In [None]:
# Write the DataFrame, including the translation columns added above, to
# Drive as CSV; index=False keeps the row index out of the file.
# NOTE(review): this targets the DOPP_Ex2_data subfolder, while the input was
# read from the Drive root; presumably that folder must already exist. Confirm.
output_path = '/content/drive/MyDrive/DOPP_Ex2_data/reviews_da_translation.csv'
df.to_csv(output_path, index=False)