This is the code to send the descriptions through the libretranslate API 
sources: 

https://www.dataquest.io/blog/python-api-tutorial/
https://libretranslate.com/
https://libretranslate.com/docs/
https://github.com/LibreTranslate/LibreTranslate

In [1]:
import json
import sys
from urllib import request, parse

################## pre set translation code ########################
class LibreTranslateAPI:
    """Connect to the LibreTranslate API"""

    """Example usage:
    from libretranslatepy import LibreTranslateAPI

    lt = LibreTranslateAPI("https://libretranslate.com/")

    print(lt.translate("LibreTranslate is awesome!", "en", "es"))
    # LibreTranslate es impresionante!

    print(lt.detect("Hello World"))
    # [{"confidence": 0.6, "language": "en"}]
    
    print(lt.languages())
    # [{"code":"en", "name":"English"}]
    """

    DEFAULT_URL = "https://libretranslate.com/translate,"

    def __init__(self, url=None, api_key="):
        """Create a LibreTranslate API connection.

        Args:
            url (str): The url of the LibreTranslate endpoint.
            api_key (str): The API key.
        """
        self.url = LibreTranslateAPI.DEFAULT_URL if url is None else url
        self.api_key = api_key

        # Add trailing slash
        assert len(self.url) > 0
        if self.url[-1] != "/":
            self.url += "/"

    def translate(self, q, source, target):
        """Translate string

        Args:
            q (str): The text to translate
            source (str): The source language code (ISO 639)
            target (str): The target language code (ISO 639)

        Returns:
            str: The translated text
        """
        url = self.url + "translate"
        params = {"q": q, "source": source, "target": target}
        if self.api_key is not None:
            params["api_key"] = self.api_key
        url_params = parse.urlencode(params)
        req = request.Request(url, data=url_params.encode())
        response = request.urlopen(req)
        response_str = response.read().decode()
        return json.loads(response_str)["translatedText"]

    def detect(self, q):
        """Detect the language of a single text.

        Args:
            q (str): Text to detect

        Returns:
            The detected languages ex: [{"confidence": 0.6, "language": "en"}]
        """
        url = self.url + "detect"
        params = {"q": q}
        if self.api_key is not None:
            params["api_key"] = self.api_key
        url_params = parse.urlencode(params)
        req = request.Request(url, data=url_params.encode())
        response = request.urlopen(req)
        response_str = response.read().decode()
        return json.loads(response_str)

    def languages(self):
        """Retrieve list of supported languages.

        Returns:
            A list of available languages ex: [{"code":"en", "name":"English"}]
        """
        url = self.url + "languages"
        params = dict()
        if self.api_key is not None:
            params["api_key"] = self.api_key
        url_params = parse.urlencode(params)
        req = request.Request(url, data=url_params.encode(), method="GET")
        response = request.urlopen(req)
        response_str = response.read().decode()
        return json.loads(response_str)

In [11]:
import pandas as pd 
df = pd.read_csv("set_thesis_with_embedding.csv")



In [4]:
# Calculate length of each description
df['description_length'] = df['descrip'].apply(len)

# Calculate Q1, Q3, and IQR for 'description_length'
Q1 = df['description_length'].quantile(0.25)
Q3 = df['description_length'].quantile(0.75)
IQR = Q3 - Q1

# Define an upper bound beyond which a description is considered long/outlier
upper_bound = Q3 + 1.5 * IQR

# Filter outliers
outliers_df = df[df['description_length'] > upper_bound]

# Save outliers to CSV
outliers_df.to_csv('outliers.csv', index=False)



In [5]:
# Filter the original DataFrame to exclude outliers
df_no_outliers = df[df['description_length'] <= upper_bound]

# Optional: Drop the 'description_length' column if you don't need it anymore
df_no_outliers.drop(columns=['description_length'], inplace=True)

df = df_no_outliers

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outliers.drop(columns=['description_length'], inplace=True)


In [12]:
# Define number of chunks
num_chunks = 10

# Calculate the number of rows in each chunk
chunk_size = len(df) // num_chunks

for i in range(num_chunks):
    # Extract chunk from DataFrame
    chunk = df.iloc[i * chunk_size: (i + 1) * chunk_size]
    
    # Save chunk to CSV
    chunk.to_csv(f'chunk_{i + 1}.csv', index=False)

# If there are any remaining rows, save them to an additional CSV
if len(df) % num_chunks != 0:
    df.iloc[num_chunks * chunk_size:].to_csv(f'chunk_{num_chunks + 1}.csv', index=False)

In [5]:
#### translation via api #############

from libretranslatepy import LibreTranslateAPI

lt = LibreTranslateAPI("https://libretranslate.com/translate")

import pandas as pd
import time



# 1. Load CSV into DataFrame
df = pd.read_csv('chunk_1.csv')
error_count = 0 
# Custom translation function with error handling
def custom_translate(text):
    global error_count
    try:
        # Apply translation
        translated_text = lt.translate(text, "nl", "en")
        return translated_text
    except Exception as e:
        error_count += 1
        return (f"Error:{e}")  # return original text in case of error

# Calculate sleep time based on rate limit
# We have 60 seconds in a minute, divided by 80 gives the time we should sleep between each translation.
sleep_time = 60/80

# Apply translation to the DataFrame and print progress
total = len(df)
for i, row in df.iterrows():
    df.at[i, 'descrip_en'] = custom_translate(row['descrip'])
    #time.sleep(sleep_time)  # Sleep to adhere to rate limits
    if (i+1) % 400 == 0:  # Update progress every 40 rows
        print(f"Progress: {i+1}/{total}")

# Save the DataFrame to a new CSV
df.to_csv('translated_output_1.csv', index=False)
percerage_error = error_count / total
print(f"Translation completed with {error_count} ;with {percerage_error} and saved to translated_output.csv")


Progress: 400/3594
Progress: 800/3594
Progress: 1200/3594
Progress: 1600/3594


KeyboardInterrupt: 

In [2]:
import pandas as pd
import requests
import json
import time

# 1. Load CSV into DataFrame
df = pd.read_csv('chunk_3.csv')
error_count = 0

url = "https://libretranslate.com/translate"
headers = {
    "Content-Type": "application/json"
}
apikey = 

# Custom translation function with error handling
def custom_translate(text):
    global error_count
    data = {
        "q": text,
        "source": "nl",
        "target": "en",
        "api_key": apikey
    }
    try:
        # Send translation request
        response = requests.post(url, headers=headers, data=json.dumps(data))
        translated_text = response.json().get('translatedText', text)  # Default to original text if translation not found
        return translated_text
    except Exception as e:
        error_count += 1
        return (f"Error:{e}")  # return error message in case of error

# Calculate sleep time based on rate limit
# We have 60 seconds in a minute, divided by 80 gives the time we should sleep between each translation.
#sleep_time = 60/80

# Apply translation to the DataFrame and print progress
total = len(df)
for i, row in df.iterrows():
    df.at[i, 'descrip_en'] = custom_translate(row['descrip'])
    #time.sleep(sleep_time)  # Sleep to adhere to rate limits
    if (i+1) % 400 == 0:  # Update progress every 400 rows (change this as per your requirement)
        print(f"Progress: {i+1}/{total}")

# Save the DataFrame to a new CSV
df.to_csv('translated_output_3.csv', index=False)
percentage_error = (error_count / total) * 100
print(f"Translation completed with {error_count} errors (which is {percentage_error:.2f}% error rate) and saved to translated_output.csv")


Progress: 400/3594
Progress: 800/3594
Progress: 1200/3594
Progress: 1600/3594
Progress: 2000/3594
Progress: 2400/3594
Progress: 2800/3594
Progress: 3200/3594
Translation completed with 0 errors (which is 0.00% error rate) and saved to translated_output.csv


In [3]:
import pandas as pd
import requests
import json
import time

# Base setup
url = "https://libretranslate.com/translate"
headers = {
    "Content-Type": "application/json"
}
apikey = ""  # Remember to replace this with your actual API key

# Function to process each chunk
def process_chunk(file_name):
    df = pd.read_csv(file_name)
    error_count = 0

    # Custom translation function with error handling
    def custom_translate(text):
        global error_count
        data = {
            "q": text,
            "source": "nl",
            "target": "en",
            "api_key": apikey
        }
        try:
            # Send translation request
            response = requests.post(url, headers=headers, data=json.dumps(data))
            translated_text = response.json().get('translatedText', text)  # Default to original text if translation not found
            return translated_text
        except Exception as e:
            error_count += 1
            return (f"Error:{e}")  # return error message in case of error

    # Apply translation and save the file, similar to your existing code
    total = len(df)
    for i, row in df.iterrows():
        df.at[i, 'descrip_en'] = custom_translate(row['descrip'])
        #time.sleep(sleep_time)  # Sleep to adhere to rate limits
        if (i+1) % 400 == 0:
            print(f"Progress: {i+1}/{total}")

    output_file_name = 'translated_' + file_name
    df.to_csv(output_file_name, index=False)

    percentage_error = (error_count / total) * 100
    print(f"Translation completed with {error_count} errors (which is {percentage_error:.2f}% error rate) and saved to {output_file_name}")

    return error_count  # Return error count for additional processing if needed

# Main loop to go through each file
start_chunk = 3  # You mentioned you've already done 1 and 2
end_chunk = 21  # If you have chunks up to 21

for chunk_num in range(start_chunk, end_chunk + 1):  # +1 because range is exclusive of end point
    file_name = f'chunk_{chunk_num}.csv'
    print(f"Processing {file_name}...")
    error_count = process_chunk(file_name)
    # Add any summary or additional steps per chunk here, e.g., aggregating error counts, etc.

print("All chunks processed.")


Processing chunk_3.csv...
Progress: 400/3594
Progress: 800/3594
Progress: 1200/3594
Progress: 1600/3594
Progress: 2000/3594
Progress: 2400/3594
Progress: 2800/3594
Progress: 3200/3594
Translation completed with 0 errors (which is 0.00% error rate) and saved to translated_chunk_3.csv
Processing chunk_4.csv...
Progress: 400/3594
Progress: 800/3594
Progress: 1200/3594
Progress: 1600/3594
Progress: 2000/3594
Progress: 2400/3594
Progress: 2800/3594
Progress: 3200/3594
Translation completed with 0 errors (which is 0.00% error rate) and saved to translated_chunk_4.csv
Processing chunk_5.csv...
Progress: 400/3594
Progress: 800/3594
Progress: 1200/3594
Progress: 1600/3594
Progress: 2000/3594
Progress: 2400/3594
Progress: 2800/3594
Progress: 3200/3594
Translation completed with 0 errors (which is 0.00% error rate) and saved to translated_chunk_5.csv
Processing chunk_6.csv...
Progress: 400/3594
Progress: 800/3594
Progress: 1200/3594
Progress: 1600/3594
Progress: 2000/3594
Progress: 2400/3594
Prog

In [9]:
import pandas as pd
import requests
import json

# Load your CSV
df = pd.read_csv('chunk_3.csv')

# Get a subset of 10 rows
subset_df = df.head(10)

def split_text(text, limit=2000):
    """Split the text into chunks at the last period before the limit."""
    chunks = []
    while text:
        if len(text) <= limit:
            chunks.append(text)
            break
        split_pos = text.rfind('.', 0, limit)
        if split_pos == -1:
            split_pos = limit
        chunk = text[:split_pos+1]
        chunks.append(chunk)
        text = text[split_pos+1:]
    return chunks

url = "https://libretranslate.com/translate"
headers = {
    "Content-Type": "application/json"
}
apikey =  # Replace with your API key

# Iterate over the subset DataFrame
for _, row in subset_df.iterrows():
    sample_text = row['descrip']
    text_chunks = split_text(sample_text)

    for chunk in text_chunks:
        data = {
            "q": chunk,
            "source": "nl",
            "target": "en",
            "api_key": apikey
        }

        response = requests.post(url, headers=headers, data=json.dumps(data))
        # Print the entire response
        print(response.json())


{'translatedText': 'On the beautiful Joubertstraat located 3-room apartment on the 3rd floor with beautiful views to the front over green strip and trees from a sunny balcony/terrace and storage room in the substructure. The complex has an elevator system. The apartment is very conveniently located compared to all amenities. With just 10 minutes walk you are in the center of Gouda and the NS station is 6 minutes by bike distance. Supermarkets, sports clubs, primary and secondary schools and play facilities are all within a maximum of 10 minutes walking or cycling distance. By car within 25 minutes to Rotterdam, Utrecht, The Hague, Rotterdam, Utrecht, The Hague, Leiden and Amsterdam can be reached quickly via a direct connection. Built in 1956. Living area approx. 48 m2, approx. 2.4 m2. Content approx. 158 m3. Ground floor layout: Covered entrance with letterboxes and bell tableau with intercom, lift system and stairwell. Access to the large storage room in the basement equipped with el

In [7]:
import pandas as pd

def check_translated_chunk(file_name):
    df = pd.read_csv(file_name)
    same_text_count = 0  # Count for rows where translated text is same as original

    # Check for rows where original and translated texts are the same
    for i, row in df.iterrows():
        if row['descrip'] == row['descrip_en']:
            same_text_count += 1

    return same_text_count

# Main loop to go through each translated file
start_chunk = 3  # You mentioned you've already done 1 and 2
end_chunk = 21  # If you have chunks up to 21

total_same_text_count = 0  # Initialize the cumulative same text counter

for chunk_num in range(start_chunk, end_chunk + 1):  # +1 because range is exclusive of end point
    file_name = f'translated_chunk_{chunk_num}.csv'  # Assuming translated files have this pattern
    print(f"Checking {file_name}...")
    same_text_count = check_translated_chunk(file_name)
    
    total_same_text_count += same_text_count  # Update the cumulative same text count

    # Print summary for the chunk
    print(f"For {file_name}: {same_text_count} rows had the same original and translated text.")

# Final summary
print(f"Across all chunks, {total_same_text_count} rows had the same original and translated text.")


Checking translated_chunk_3.csv...
For translated_chunk_3.csv: 3587 rows had the same original and translated text.
Checking translated_chunk_4.csv...
For translated_chunk_4.csv: 3563 rows had the same original and translated text.
Checking translated_chunk_5.csv...
For translated_chunk_5.csv: 3594 rows had the same original and translated text.
Checking translated_chunk_6.csv...
For translated_chunk_6.csv: 3562 rows had the same original and translated text.
Checking translated_chunk_7.csv...
For translated_chunk_7.csv: 3568 rows had the same original and translated text.
Checking translated_chunk_8.csv...
For translated_chunk_8.csv: 3594 rows had the same original and translated text.
Checking translated_chunk_9.csv...
For translated_chunk_9.csv: 3566 rows had the same original and translated text.
Checking translated_chunk_10.csv...
For translated_chunk_10.csv: 3594 rows had the same original and translated text.
Checking translated_chunk_11.csv...
For translated_chunk_11.csv: 3594 

In [3]:
import pandas as pd
import requests
import json
from tqdm import tqdm

def split_text(text, limit=2000):
    """Split the text into chunks at the last period before the limit."""
    chunks = []
    while text:
        if len(text) <= limit:
            chunks.append(text)
            break
        split_pos = text.rfind('.', 0, limit)
        if split_pos == -1:
            split_pos = limit
        chunk = text[:split_pos+1]
        chunks.append(chunk)
        text = text[split_pos+1:]
    return chunks

url = "https://libretranslate.com/translate"
headers = {
    "Content-Type": "application/json"
}
apikey =   

for chunk_num in range(11, 12):  # Adjust this range based on the chunks you have
    print(f"\nProcessing chunk_{chunk_num}.csv...")
    df = pd.read_csv(f'chunk_{chunk_num}.csv')
    total_rows = len(df)
    
    for idx, row in tqdm(df.iterrows(), total=total_rows, desc=f"Chunk {chunk_num}"):
        sample_text = row['descrip']
        text_chunks = split_text(sample_text)
        translated_chunks = []

        for chunk in text_chunks:
            data = {
                "q": chunk,
                "source": "nl",
                "target": "en",
                "api_key": apikey
            }
            try:
                response = requests.post(url, headers=headers, data=json.dumps(data), timeout=30)
                if response.status_code == 200 and 'translatedText' in response.json():
                    translated_chunks.append(response.json()['translatedText'])
                else:
                    print(f"Error on chunk {chunk_num} row {idx}. Status code: {response.status_code}")
                    translated_chunks = ["ERROR"]
                    break
            except requests.Timeout:
                print(f"Timeout on chunk {chunk_num} row {idx}. Marking this row as ERROR.")
                translated_chunks = ["ERROR"]
                break
            except Exception as e:
                print(f"Unexpected error on chunk {chunk_num} row {idx}. Error: {e}. Marking this row as ERROR.")
                translated_chunks = ["ERROR"]
                break

        # Combine the translated chunks
        translated_text = ' '.join(translated_chunks)
        df.at[idx, 'descrip_en'] = translated_text

    # Save the translated DataFrame to a new CSV
    df.to_csv(f'translated_output_{chunk_num}.csv', index=False)
    print(f"Finished processing chunk_{chunk_num}.csv!")

print("\nAll chunks processed!")




Processing chunk_11.csv...


Chunk 11: 100%|██████████| 5/5 [00:20<00:00,  4.20s/it]

Finished processing chunk_11.csv!

All chunks processed!





In [4]:
import pandas as pd
import openai
from tqdm import tqdm



# Function to get embedding
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']


# Assuming chunk range is the same, adjust if necessary
for chunk_num in range(1, 12):  # Adjust this range based on the chunks you have
    print(f"\nProcessing translated_output_{chunk_num}.csv...")
    df = pd.read_csv(f'translated_output_{chunk_num}.csv')
    total_rows = len(df)

    # Add a new column to store embeddings
    df['ada_embedding_eng'] = None

    for index, row in tqdm(df.iterrows(), total=total_rows, desc=f"Chunk {chunk_num}"):
        descrip_en = row['descrip_en']
        embedding = get_embedding(descrip_en, model='text-embedding-ada-002')
        df.at[index, 'ada_embedding_eng'] = embedding

    # Save the DataFrame with embeddings to the same CSV file (or you can choose a new name if you want)
    df.to_csv(f'translated_output_{chunk_num}_emb.csv', index=False)
    print(f"Finished processing translated_output_{chunk_num}.csv!")

print("\nAll chunks processed!")



Processing translated_output_1.csv...


Chunk 1: 100%|██████████| 7465/7465 [30:35<00:00,  4.07it/s]  


Finished processing translated_output_1.csv!

Processing translated_output_2.csv...


Chunk 2: 100%|██████████| 7465/7465 [30:20<00:00,  4.10it/s] 


Finished processing translated_output_2.csv!

Processing translated_output_3.csv...


Chunk 3: 100%|██████████| 7465/7465 [30:22<00:00,  4.10it/s] 


Finished processing translated_output_3.csv!

Processing translated_output_4.csv...


Chunk 4: 100%|██████████| 7465/7465 [30:14<00:00,  4.11it/s] 


Finished processing translated_output_4.csv!

Processing translated_output_5.csv...


Chunk 5: 100%|██████████| 7465/7465 [30:10<00:00,  4.12it/s]  


Finished processing translated_output_5.csv!

Processing translated_output_6.csv...


Chunk 6: 100%|██████████| 7465/7465 [30:31<00:00,  4.08it/s]  


Finished processing translated_output_6.csv!

Processing translated_output_7.csv...


Chunk 7: 100%|██████████| 7465/7465 [30:35<00:00,  4.07it/s]  


Finished processing translated_output_7.csv!

Processing translated_output_8.csv...


Chunk 8: 100%|██████████| 7465/7465 [30:11<00:00,  4.12it/s]  


Finished processing translated_output_8.csv!

Processing translated_output_9.csv...


Chunk 9: 100%|██████████| 7465/7465 [30:39<00:00,  4.06it/s] 


Finished processing translated_output_9.csv!

Processing translated_output_10.csv...


Chunk 10: 100%|██████████| 7465/7465 [30:28<00:00,  4.08it/s]  


Finished processing translated_output_10.csv!

Processing translated_output_11.csv...


Chunk 11: 100%|██████████| 5/5 [00:01<00:00,  3.91it/s]

Finished processing translated_output_11.csv!

All chunks processed!





In [1]:
import pandas as pd
import requests
import json

# Load your CSV
df = pd.read_csv('chunk_3.csv')

# Get a subset of 1000 rows
subset_df = df.head(1000)

def split_text(text, limit=2000):
    """Split the text into chunks at the last period before the limit."""
    chunks = []
    while text:
        if len(text) <= limit:
            chunks.append(text)
            break
        split_pos = text.rfind('.', 0, limit)
        if split_pos == -1:
            split_pos = limit
        chunk = text[:split_pos+1]
        chunks.append(chunk)
        text = text[split_pos+1:]
    return chunks

url = "https://libretranslate.com/translate"
headers = {
    "Content-Type": "application/json"
}
apikey 

# Iterate over the subset DataFrame
for idx, row in subset_df.iterrows():
    sample_text = row['descrip']
    text_chunks = split_text(sample_text)

    translated_chunks = []

    for chunk in text_chunks:
        data = {
            "q": chunk,
            "source": "nl",
            "target": "en",
            "api_key": apikey
        }

        response = requests.post(url, headers=headers, data=json.dumps(data))
        
        # Print the entire response
        print(f"Row {idx}, Chunk: {chunk[:50]}...")  # Printing the start of the chunk for context
        print(response.json())
        
        if response.status_code == 200 and 'translatedText' in response.json():
            translated_chunks.append(response.json()['translatedText'])

    # Combine the translated chunks
    translated_text = ' '.join(translated_chunks)
    df.at[idx, 'descrip_en'] = translated_text

# Optionally, you can save the subset DataFrame to a new CSV after reviewing responses
subset_df.to_csv('translated_subset_output_1000.csv', index=False)

print("Translation completed!")


Row 0, Chunk: Hier wil je  wonen   Deze  riante en  vrijstaande ...
{'translatedText': 'This is where you want to live This spacious and detached villa has a living area of no less than 387m2 and a spacious subhouse with private entrance, sauna and sports area. The luxurious look and spatial design provide great comfort. In this house you will find 5 spacious bedrooms and a wide attic which can also be furnished as a bedroom. The house is completely insulated and equipped with sprouting walls. The layout is very well suited to decorating an office at home. The property facilitates parking space on private grounds for up to 4 cars in the heated driveway and under the carport, which allows you to use this with snow and ice. The garden is built by Erik van Gelder and has the title ""Wellness garden" , in which is the Jacuzzi room as well as the lounge corner including fireplace. A great place to relax and enjoy the pond and the vegetation present. There is also a beautiful lighting plan t

KeyboardInterrupt: 