In [1]:
# Cell 1: Import libraries
from dotenv import load_dotenv
import requests
import os
import pandas as pd
import time

# Load the API key from .env file
load_dotenv()
google_key = os.getenv("GOOGLE_KEY")

print("✓ Libraries loaded!")
# Report only whether the key is present: notebook outputs get saved and shared,
# so no part of the secret should ever be echoed into them.
print("✓ API Key loaded" if google_key else "✗ API Key not found!")

✓ Libraries loaded!
✓ API Key loaded: [REDACTED]


In [2]:
# Cell 2: Translation function
def google_translate(source_texts: list, key: str, target: str = "tl", timeout: float = 30):
    """
    Translate a batch of texts with the Google Cloud Translation v2 REST API.

    No source language is sent in the request; only the target language is
    specified. The default target "tl" keeps the original English -> Filipino
    use of this notebook.

    Args:
        source_texts: List of strings to translate in a single request.
        key: Google Cloud API key, sent as the ``key`` query parameter.
        target: Language code to translate into (default "tl").
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        List of translated strings, taken in order from the
        ``data.translations`` array of the response. An empty input returns
        an empty list without calling the API.

    Raises:
        Exception: Whatever error occurs while sending the request, decoding
            the response as JSON, or reading the expected keys. Decoding and
            key errors are re-raised after the response body is printed for
            debugging.
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not source_texts:
        return []

    url = "https://translation.googleapis.com/language/translate/v2"

    payload = {
        "q": source_texts,
        "target": target,
        "format": "text"
    }

    params = {"key": key}

    response = requests.post(url, params=params, json=payload, timeout=timeout)

    # Pre-bind so the except branch can always report something, even when
    # response.json() itself is what failed (e.g. an HTML error page).
    response_json = None
    try:
        response_json = response.json()
        translations = [
            item["translatedText"]
            for item in response_json["data"]["translations"]
        ]
        return translations
    except Exception as e:
        print("Unexpected response structure:", e)
        print(f"HTTP status: {response.status_code}")
        # Fall back to the raw body when the payload was not valid JSON.
        print(response_json if response_json is not None else response.text)
        raise

# Smoke test: send one short sentence through the API using the key loaded earlier
test_result = google_translate(
    source_texts=["Hello, how are you?"],
    key=google_key,
)
print(f"Test translation: {test_result}")

Test translation: ['Kumusta, kumusta ka?']


In [4]:
# Cell 3: Load the dataset
# Read the cleaned PAWS CSV into a DataFrame (path is relative to this notebook's working directory)
df_paws = pd.read_csv("../../datasets/cleaned/cleaned_paws.csv")

# Quick overview: shape, column names and a preview of the first rows
print(
    "Dataset loaded!",
    f"Shape: {df_paws.shape}",
    f"\nColumns: {df_paws.columns.tolist()}",
    "\nFirst few rows:",
    df_paws.head(),
    sep="\n",
)

Dataset loaded!
Shape: (400, 4)

Columns: ['id', 'sentence1', 'sentence2', 'label']

First few rows:
      id                                          sentence1  \
0  29568  The estuary of the Batten Kill is in East Dors...   
1  45829  La tempestad ( International translation : The...   
2  46990  Since 2006 , when Josephine Alhanko placed him...   
3  13893  He married Lady Florence Jane Taylour , daught...   
4  41986  Elati is a village in the Kozani Regional Unit...   

                                           sentence2  label  
0  The mouth of the Batten Kill is in East Dorset...      1  
1  La Tempestad ( International Translation : The...      1  
2  Cerljen was also the first delegate from Swede...      1  
3  He married Lady Florence Jane Taylour , the da...      1  
4  Elati is a village in the Kozani regional unit...      1  


In [5]:
# Cell 4: Prepare the data for translation
# Pull each sentence column out of the DataFrame as a plain Python list
sentence1, sentence2 = (
    df_paws[column].tolist() for column in ('sentence1', 'sentence2')
)

# Report how many items each list holds
print(
    f"sentence1: {len(sentence1)} items",
    f"sentence2: {len(sentence2)} items",
    sep="\n",
)
# Show the first pair as a sanity check
print(
    f"\nExample sentence1: {sentence1[0]}",
    f"Example sentence2: {sentence2[0]}",
    sep="\n",
)

sentence1: 400 items
sentence2: 400 items

Example sentence1: The estuary of the Batten Kill is in East Dorset , Vermont , and the source of the river is in Easton , New York .
Example sentence2: The mouth of the Batten Kill is in East Dorset , Vermont , and the source of the river is in Easton , New York .


In [6]:
# Cell 5: Translate in batches (to avoid API limits)
def translate_in_batches(texts: list, batch_size: int = 100, delay: float = 1.0):
    """
    Translate a list of texts in fixed-size batches via ``google_translate``.

    Each batch is sent with the module-level ``google_key``. A batch that
    raises, or that returns a different number of translations than it was
    given, is recorded as ``None`` for every item in it, so the returned list
    always lines up index-for-index with ``texts``.

    Args:
        texts: Strings to translate.
        batch_size: Maximum number of texts per API request (default 100).
        delay: Seconds to pause between consecutive batches (default 1.0).
            No pause is made after the final batch.

    Returns:
        List with the same length as ``texts`` holding the translated string
        for each item, or ``None`` where the item's batch failed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    all_translations = []
    # Ceiling division: number of requests needed to cover every text.
    total_batches = (len(texts) + batch_size - 1) // batch_size

    for batch_num, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch = texts[start:start + batch_size]

        print(f"Translating batch {batch_num}/{total_batches} ({start} to {start+len(batch)})...")

        try:
            translations = google_translate(batch, google_key)
            # A short or long reply would silently shift every later row,
            # so treat a count mismatch as a failed batch.
            if len(translations) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} translations, got {len(translations)}"
                )
            all_translations.extend(translations)

        except Exception as e:
            print(f"Error in batch {batch_num}: {e}")
            # Add None for failed translations
            all_translations.extend([None] * len(batch))

        # Pause between batches (also after a failure), but not after the last one.
        if delay > 0 and batch_num < total_batches:
            time.sleep(delay)

    # Failed items are stored as None, so do not count them as translated.
    failed = sum(item is None for item in all_translations)
    print(f"✓ Translation complete! {len(all_translations) - failed} items translated")
    if failed:
        print(f"✗ {failed} items could not be translated and were left as None")
    return all_translations

# Run the batch translator over the first sentence of every pair
print("\n=== Translating sentence1 ===")
sentence1_translated = translate_in_batches(texts=sentence1, batch_size=100)

# ...and then over the second sentence of every pair
print("\n=== Translating sentence2 ===")
sentence2_translated = translate_in_batches(texts=sentence2, batch_size=100)


=== Translating sentence1 ===
Translating batch 1/4 (0 to 100)...
Translating batch 2/4 (100 to 200)...
Translating batch 3/4 (200 to 300)...
Translating batch 4/4 (300 to 400)...
✓ Translation complete! 400 items translated

=== Translating sentence2 ===
Translating batch 1/4 (0 to 100)...
Translating batch 2/4 (100 to 200)...
Translating batch 3/4 (200 to 300)...
Translating batch 4/4 (300 to 400)...
✓ Translation complete! 400 items translated


In [7]:
# Cell 6: Add translations to dataframe
# Attach both translated lists as new columns next to the original sentences
df_paws['sentence1_translated'], df_paws['sentence2_translated'] = (
    sentence1_translated,
    sentence2_translated,
)

print(
    "✓ Translations added to dataframe!",
    f"\nNew columns: {df_paws.columns.tolist()}",
    sep="\n",
)

# Print the first three rows as original/translation pairs
print("\n=== Sample Translations ===")
for i in range(3):
    print(f"\nExample {i+1}:")
    print(
        f"  EN sentence1: {df_paws['sentence1'].iat[i]}",
        f"  TL sentence1: {df_paws['sentence1_translated'].iat[i]}",
        f"  EN sentence2: {df_paws['sentence2'].iat[i]}",
        f"  TL sentence2: {df_paws['sentence2_translated'].iat[i]}",
        sep="\n",
    )

✓ Translations added to dataframe!

New columns: ['id', 'sentence1', 'sentence2', 'label', 'sentence1_translated', 'sentence2_translated']

=== Sample Translations ===

Example 1:
  EN sentence1: The estuary of the Batten Kill is in East Dorset , Vermont , and the source of the river is in Easton , New York .
  TL sentence1: Ang bunganga ng Batten Kill ay nasa East Dorset, Vermont, at ang pinagmumulan ng ilog ay nasa Easton, New York.
  EN sentence2: The mouth of the Batten Kill is in East Dorset , Vermont , and the source of the river is in Easton , New York .
  TL sentence2: Ang bunganga ng Batten Kill ay nasa East Dorset, Vermont, at ang pinagmumulan ng ilog ay nasa Easton, New York.

Example 2:
  EN sentence1: La tempestad ( International translation : The Tempest , dubbed The Storm by Univision ) is a 2013 Mexican telenovela produced by Salvador Mejía Alejandre for Televisa .
  TL sentence1: Ang La tempestad (Internasyonal na salin: The Tempest, na binansagang The Storm ng Univisi

In [8]:
# Cell 7: Save the results
output_path = '../../datasets/translated/translated_paws_google_anthea.csv'

# Make sure the target folder exists before writing: a missing directory would
# make the save fail after all the API calls have already been made.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_paws.to_csv(output_path, index=False)

print(f"✓ Saved to: {output_path}")
print(f"\nFinal dataframe shape: {df_paws.shape}")
print(f"Columns: {df_paws.columns.tolist()}")

✓ Saved to: ../../datasets/translated/translated_paws_google_anthea.csv

Final dataframe shape: (400, 6)
Columns: ['id', 'sentence1', 'sentence2', 'label', 'sentence1_translated', 'sentence2_translated']
