In [2]:
import os
from google.cloud import translate_v2 as translate

def translate_text(text, target_language='en', source_language='fr'):
    """Translate ``text`` using the Google Cloud Translation API (v2 client).

    Args:
        text: The string to translate. Falsy values (``None``, ``""``) are
            never sent to the API.
        target_language: Language code to translate into. Defaults to ``'en'``.
        source_language: Language code of ``text``. Defaults to ``'fr'``, so
            existing callers that pass only ``text`` keep translating from
            French.

    Returns:
        The ``'translatedText'`` entry of the client's result, or ``None``
        when ``text`` is falsy.
    """
    # Guard first so empty cells never cost an API round-trip.
    if not text:
        return None
    # The client is constructed with no explicit arguments on every call.
    translate_client = translate.Client()
    result = translate_client.translate(
        text,
        target_language=target_language,
        source_language=source_language,
    )
    return result['translatedText']

def test_translation():
    """Smoke-test the translation helper with one fixed French phrase.

    Prints the translated text followed by a success line; if anything in
    that sequence raises, prints a single failure line containing the
    exception instead. Nothing is returned and no exception propagates.
    """
    try:
        outcome = translate_text("Bonjour le monde!")
        report_lines = (
            f"Translated text: {outcome}",
            "Translation API test successful!",
        )
        for report_line in report_lines:
            print(report_line)
    except Exception as err:
        # Deliberately broad: this is a diagnostic, so every failure is reported, not raised.
        print(f"Translation API test failed: {err}")

def verify_credentials():
    """Print whether GOOGLE_APPLICATION_CREDENTIALS is set and whether its path exists.

    An unset or empty variable is reported as "not set". Otherwise the value
    is echoed, followed by a line saying whether that path exists on disk.
    Nothing is returned.
    """
    key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")

    # Guard clause: nothing more to check when the variable is missing or empty.
    if not key_path:
        print("GOOGLE_APPLICATION_CREDENTIALS is not set.")
        return

    print(f"GOOGLE_APPLICATION_CREDENTIALS is set to: {key_path}")

    if not os.path.exists(key_path):
        print(f"Credentials file does NOT exist at: {key_path}")
        return

    print(f"Credentials file exists at: {key_path}")

if __name__ == "__main__":
    # SET THE ENVIRONMENT VARIABLE WITHIN THE SCRIPT (NOT RECOMMENDED FOR PRODUCTION)
    # `setdefault` only fills the variable in when it is missing, so a
    # credentials path already configured in the environment is no longer
    # overwritten by this relative fallback.
    os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "gcs.json")  # Replace with the actual path
    # Report what the variable resolves to, then exercise the API once.
    verify_credentials()
    test_translation()

GOOGLE_APPLICATION_CREDENTIALS is set to: gcs.json
Credentials file exists at: gcs.json
Translated text: Hello world!
Translation API test successful!


In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import col
from deep_translator import GoogleTranslator

# Service-account key file handed to both Spark and the Hadoop GCS connector.
credentials_location = './gcs.json'

# Local Spark configuration: GCS connector jar on every classpath plus
# service-account authentication.
conf = SparkConf().setMaster('local[*]').setAppName('test')
for spark_key, spark_value in (
    ("spark.jars", "./lib/gcs-connector-hadoop3-2.2.5.jar"),
    ("spark.hadoop.google.cloud.auth.service.account.enable", "true"),
    ("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location),
    ("spark.driver.extraClassPath", "./lib/gcs-connector-hadoop3-2.2.5.jar"),
    ("spark.executor.extraClassPath", "./lib/gcs-connector-hadoop3-2.2.5.jar"),
):
    conf.set(spark_key, spark_value)

sc = SparkContext(conf=conf)

# Register the gs:// filesystem classes and the key file on the Hadoop configuration.
hadoop_conf = sc._jsc.hadoopConfiguration()
for hadoop_key, hadoop_value in (
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.auth.service.account.json.keyfile", credentials_location),
    ("fs.gs.auth.service.account.enable", "true"),
):
    hadoop_conf.set(hadoop_key, hadoop_value)

spark = SparkSession.builder.config(conf=sc.getConf()).getOrCreate()

# Load the raw courses snapshot and preview the first rows.
df_courses = spark.read.parquet(
    'gs://jugnu-france-course-enrollments/courses_data/courses_raw_parquet/1742298455.7402864.23a1a4c6d1.parquet'
)
df_courses.show(5)

# Source column name -> English column name.
columns_to_rename = {
    # Provider and location
    'nom_of': 'provider',
    'siret': 'provider_ID',
    'nom_region': 'region',
    'nom_departement': 'department',
    # Certification and classification labels
    'intitule_certification': 'certification_title',
    'libelle_niveau_sortie_formation': 'training_exit_level',
    'libelle_code_formacode_principal': 'main_formacode_desc',
    'libelle_nsf_1': 'nsf_code_1_desc',
    'libelle_nsf_2': 'nsf_code_2_desc',
    'libelle_nsf_3': 'nsf_code_3_desc',
    # Training identity and description
    'numero_formation': 'training_ID',
    'intitule_formation': 'title',
    'points_forts': 'strengths',
    # Session counts
    'nb_session_active': 'nb_active_session',
    'nb_session_a_distance': 'nb_distant_session',
    # Duration statistics
    'nombre_heures_total_min': 'duration_min',
    'nombre_heures_total_max': 'duration_max',
    'nombre_heures_total_mean': 'duration_mean',
    # Cost statistics
    'frais_ttc_tot_min': 'cost_min',
    'frais_ttc_tot_max': 'cost_max',
    'frais_ttc_tot_mean': 'cost_mean',
}

# Apply the renames one at a time; a source column that is absent is
# reported and skipped rather than treated as an error.
for french_name, english_name in columns_to_rename.items():
    if french_name not in df_courses.columns:
        print(f"Column '{french_name}' not found, skipping rename.")
        continue
    df_courses = df_courses.withColumnRenamed(french_name, english_name)

@F.udf(returnType=T.StringType())
def translate(input):
    """Spark UDF: translate one string value to English with deep_translator.

    Args:
        input: The cell value to translate, or ``None``.

    Returns:
        The translated string, or ``None`` when ``input`` is ``None`` or the
        translation raises.
    """
    if input is None:
        return None  # Or return "" if you prefer an empty string

    # `NotValidPayload` was referenced below without ever being imported, so
    # any translation failure surfaced as a NameError while the `except`
    # clause was being evaluated. Importing it here, inside the function,
    # brings the name into scope wherever the UDF body runs.
    from deep_translator.exceptions import NotValidPayload

    try:
        return GoogleTranslator(source='auto', target='en').translate(input)
    except NotValidPayload:
        # Payload rejected by deep_translator: yield a null cell.
        return None
    except Exception as e:
        # Best effort: log and yield a null cell instead of failing the job.
        print(f"Translation error: {e}")
        return None
    
# Add an English companion column next to each text column by running the
# `translate` UDF over it, in the same order as before.
for english_column, source_column in (
    ('certification_title_en', 'certification_title'),
    ('title_en', 'title'),
    ('main_formacode_desc_en', 'main_formacode_desc'),
):
    df_courses = df_courses.withColumn(english_column, translate(F.col(source_column)))

# Write the result as Parquet, overwriting whatever is already at the path.
translated_output_path = 'gs://jugnu-france-course-enrollments/courses_data/courses_raw_parquet/france_courses_en.parquet'
df_courses.write.parquet(translated_output_path, mode='overwrite')


SyntaxError: invalid syntax (691941664.py, line 61)

In [None]:
def translate_text(text, target_language='en', source_language='fr'):
    """Translate ``text`` using the Google Cloud Translation API (v2 client).

    Args:
        text: The string to translate. Falsy values (``None``, ``""``) are
            never sent to the API.
        target_language: Language code to translate into. Defaults to ``'en'``.
        source_language: Language code of ``text``. Defaults to ``'fr'``, so
            callers that pass only ``text`` keep translating from French.

    Returns:
        The ``'translatedText'`` entry of the client's result, or ``None``
        when ``text`` is falsy.
    """
    if not text:
        return None
    # Imported here under a private alias: this notebook also defines a Spark
    # UDF called `translate`, which rebinds the module-level name `translate`,
    # so `translate.Client()` would no longer reach the Google client module.
    from google.cloud import translate_v2 as _translate_v2

    translate_client = _translate_v2.Client()
    result = translate_client.translate(
        text,
        target_language=target_language,
        source_language=source_language,
    )
    return result['translatedText']

# The bare name `udf` is never imported in this notebook; build the UDF via
# the `F` alias for pyspark.sql.functions and state the string return type
# explicitly using the `T` alias for pyspark.sql.types.
translate_udf = F.udf(translate_text, T.StringType())

# Text columns that get an English `<name>_en` companion column.
columns_to_translate = ['certification_title', 'title', 'main_formacode_desc']

for column_name in columns_to_translate:
    if column_name in df_courses.columns:  # Check if column exists
        df_courses = df_courses.withColumn(
            f"{column_name}_en", translate_udf(col(column_name))
        )
    else:
        # A missing column is reported and skipped rather than raising.
        print(f"Column '{column_name}' not found in DataFrame, skipping translation.")

df_courses.show(5)