### Transliterate

In [1]:
from openai import OpenAI
import json
import re
import pandas as pd
import logging
import unicodedata

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [5]:
def get_api_key(key_file: str = 'openai_key.txt') -> str:
    """
    Read the OpenAI API key from a text file.

    The file is decoded as UTF-8 independently of the platform's locale
    encoding. The 'utf-8-sig' codec is used so that a byte-order mark, which
    some editors write at the start of UTF-8 files, is dropped instead of
    ending up as an invisible first character of the key (``str.strip`` does
    not remove it).

    Args:
        key_file: Path to file containing the API key

    Returns:
        API key as string, with leading/trailing whitespace removed

    Raises:
        FileNotFoundError: if key file doesn't exist (logged, then re-raised)
    """
    try:
        with open(key_file, 'r', encoding='utf-8-sig') as f:
            return f.read().strip()
    except FileNotFoundError:
        logging.error(f"API key file {key_file} not found")
        raise

def chunk_list(lst, chunk_size):
    """
    Lazily split `lst` into consecutive slices of at most `chunk_size` items.

    Slices are produced in order; only the final one can be shorter than
    `chunk_size`. The overall plan and each slice are reported through
    `logging.info`.
    """
    total = len(lst)
    logging.info(f"Chunking list of {total} words into chunks of up to {chunk_size} words each.")
    for start in range(0, total, chunk_size):
        piece = lst[start:start + chunk_size]
        size = len(piece)
        last = start + size - 1
        logging.info(f"Yielding chunk from index {start} to {last} (total {size} words).")
        yield piece

def get_transliterations(word_list, model="gpt-3.5-turbo", key_file='openai_key.txt'):
    """
    Queries the OpenAI API to get English transliterations for a list of words.

    The prompt instructs the model to return a JSON array where each element is an object
    with two keys: 'original_word' and 'transliteration', and to never use null for the
    transliteration. Nulls that come back anyway are tolerated and logged as warnings;
    only a response in which every transliteration is null is rejected.

    Args:
        word_list: List of words to transliterate
        model: OpenAI model to use (default: "gpt-3.5-turbo")
        key_file: Path to file containing OpenAI API key (default: 'openai_key.txt')

    Returns:
        A list of dictionaries (each with keys 'original_word' and 'transliteration'),
        exactly as parsed from the model's reply. The list is not guaranteed to have
        one entry per input word; a mismatch in length is logged as a warning.

    Raises:
        ValueError: if the reply has no content, is not valid JSON, does not match the
            expected format, or contains only null transliterations
        FileNotFoundError: if key file doesn't exist
    """
    logging.info(f"Requesting transliterations for a chunk of {len(word_list)} words.")

    # Get API key and initialize client
    api_key = get_api_key(key_file)
    client = OpenAI(api_key=api_key)

    # Construct the prompt
    prompt = (
        "For the following list of Hindi/Devanagari words, provide their English transliterations. "
        "Each transliteration should represent how the word is pronounced using English letters. "
        "Return ONLY a valid JSON array where each element is an object with two keys: "
        "'original_word' and 'transliteration'. "
        "Never return null for transliteration - always provide a romanized version. "
        "Do not include any explanation or additional text.\n\n"
        "Example format:\n"
        "[{\"original_word\":\"भारत\",\"transliteration\":\"Bharat\"},"
        "{\"original_word\":\"नमस्ते\",\"transliteration\":\"Namaste\"}]\n\n"
        "List of words:\n" +
        "\n".join(word_list)
    )

    # Make API request
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an assistant that provides transliterations."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0,
        max_tokens=4096  # Maximum allowed for GPT-3.5-turbo
    )

    # The message content is optional in the API response; fail with the
    # documented ValueError rather than an AttributeError on `.strip()`.
    content = response.choices[0].message.content
    if content is None:
        logging.error("API response contained no message content.")
        raise ValueError("API response contained no message content.")

    answer = content.strip()
    logging.info("Received response from OpenAI API")
    logging.info(f"Raw response content: {answer}")  # Log the raw response

    # Models sometimes wrap the JSON in a markdown code fence; remove it.
    if answer.startswith('```json'):
        answer = answer.replace('```json', '').replace('```', '').strip()
    elif answer.startswith('```'):
        answer = answer.replace('```', '').strip()

    try:
        data = json.loads(answer)
    except json.JSONDecodeError as e:
        logging.error(f"Failed to parse JSON response: {e}")
        logging.error(f"Response content that failed parsing: {answer}")
        raise ValueError(f"API response was not valid JSON. Error: {str(e)}") from e
    logging.info("Successfully parsed JSON response")

    # Validate response format: a list of dicts that each carry both keys
    if not (isinstance(data, list) and
            all(isinstance(item, dict) and
                'original_word' in item and
                'transliteration' in item
                for item in data)):
        logging.error("Returned JSON does not match the expected format.")
        raise ValueError("Returned JSON does not match the expected format.")

    # The model may skip or invent entries; make that visible to the caller.
    if len(data) != len(word_list):
        logging.warning(
            f"Requested {len(word_list)} words but received {len(data)} transliterations."
        )

    # Check for and warn about nulls
    null_words = [item['original_word'] for item in data if item['transliteration'] is None]
    null_count = len(null_words)
    if null_count > 0:
        logging.warning(f"Found {null_count} null transliterations out of {len(data)} total words.")

        # Log some examples of words that got null transliterations
        null_examples = null_words[:5]
        logging.warning(f"Example words with null transliterations: {null_examples}")

        # A reply consisting only of nulls is useless; treat it as a failure
        if null_count == len(data):
            raise ValueError("All transliterations were null. Check the prompt or try with a smaller batch.")

    logging.info("Chunk processed successfully.")
    return data

def filter_non_english_words(word_list):
    """
    Keep only the words that contain at least one non-ASCII character.

    "Non-English" is used here in that narrow sense: a word qualifies as soon
    as any of its characters lies outside the ASCII range. Entries that are
    not strings are dropped, and a warning is logged for each of them.

    Parameters:
        word_list (list): Candidate words to examine.

    Returns:
        list: The qualifying words, in the order they were given.
    """
    logging.info("Starting filtering of non-English words.")

    non_ascii_words = []
    for candidate in word_list:
        if isinstance(candidate, str):
            if not candidate.isascii():
                non_ascii_words.append(candidate)
        else:
            logging.warning(f"Encountered non-string value: {candidate} (type: {type(candidate)}). Skipping.")

    logging.info("Completed filtering of non-English words.")
    logging.info(f"Total words provided: {len(word_list)}, non-English words found: {len(non_ascii_words)}")

    return non_ascii_words

In [6]:
# Load the audit results and the transliterations saved by earlier runs.
df = pd.read_csv("../data/final_audit_results.csv", low_memory=False)

done = pd.read_csv("../data/transliterate.csv")

# Candidates: distinct, whitespace-trimmed panchayat names with non-ASCII characters.
word_list = filter_non_english_words(df['panchayat'].str.strip().unique())

# Drop the words that already have a saved transliteration. Filtering against
# a set (instead of taking a set difference) keeps the remaining words in
# first-seen order, so the batches sent to the API do not depend on string
# hash randomisation and are the same from run to run.
already_done = set(done['original_word'])
word_list = [word for word in word_list if word not in already_done]
print(len(word_list))

2025-04-23 02:19:55,829 - INFO - Starting filtering of non-English words.
2025-04-23 02:19:55,866 - INFO - Completed filtering of non-English words.
2025-04-23 02:19:55,866 - INFO - Total words provided: 57691, non-English words found: 8358


10


In [7]:
# Upper bound on the number of words sent in a single API request.
max_words_per_chunk = 100

combined_results = []
# Ceiling division: a partially filled final chunk still counts as a chunk.
total_chunks = -(-len(word_list) // max_words_per_chunk)
logging.info(f"Total chunks to process: {total_chunks}")

# Transliterate chunk by chunk and collect every returned record in one list.
chunk_counter = 0
for chunk_counter, chunk in enumerate(chunk_list(word_list, max_words_per_chunk), start=1):
    logging.info(f"Processing chunk {chunk_counter} of {total_chunks}.")
    results = get_transliterations(chunk)
    combined_results.extend(results)

2025-04-23 02:19:59,090 - INFO - Total chunks to process: 1
2025-04-23 02:19:59,092 - INFO - Chunking list of 10 words into chunks of up to 100 words each.
2025-04-23 02:19:59,093 - INFO - Yielding chunk from index 0 to 9 (total 10 words).
2025-04-23 02:19:59,094 - INFO - Processing chunk 1 of 1.
2025-04-23 02:19:59,094 - INFO - Requesting transliterations for a chunk of 10 words.
2025-04-23 02:20:03,390 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-23 02:20:03,414 - INFO - Received response from OpenAI API
2025-04-23 02:20:03,415 - INFO - Raw response content: [
{"original_word":"अल्‍हनापुर","transliteration":"Alhanapur"},
{"original_word":"फिड़ाेद","transliteration":"Phirod"},
{"original_word":"‍िघ्‍ारड़ाेदा मीठा","transliteration":"Gharodamitha"},
{"original_word":"कूकड़ाेद","transliteration":"Kukrod"},
{"original_word":"फरड़ाेद","transliteration":"Farod"},
{"original_word":"हाड़ाेता","transliteration":"Harota"},
{"original_word":"म

In [8]:
# Pin the frame to the two columns that transliterate.csv stores:
# * with no results at all the columns still exist, so the lines below do not
#   fail with a KeyError;
# * any extra keys the model may have added to a record are left out, so the
#   rows keep the two-column layout of the file they are appended to.
res_df = pd.DataFrame(combined_results, columns=['original_word', 'transliteration'])
res_df['original_word'] = res_df['original_word'].str.strip()
# Transliterations are stored trimmed and lower-cased.
res_df['transliteration'] = res_df['transliteration'].str.strip().str.lower()

In [9]:
# Preview the first rows of the cleaned transliterations.
res_df.head()

Unnamed: 0,original_word,transliteration
0,अल्‍हनापुर,alhanapur
1,फिड़ाेद,phirod
2,‍िघ्‍ारड़ाेदा मीठा,gharodamitha
3,कूकड़ाेद,kukrod
4,फरड़ाेद,farod


In [10]:
# Append the new rows to the existing transliteration file. Append mode adds
# them after the current contents, and no header is written because the file
# already has one. Running this cell again appends the same rows again.
res_df.to_csv(
    "../data/transliterate.csv",
    mode='a',
    header=False,
    index=False,
)

### Remove Diacritics

In [11]:
# Load the accumulated transliteration table from disk.
translit = pd.read_csv("../data/transliterate.csv")

In [12]:
def remove_diacritics(text):
    """
    Strip diacritical marks from `text`, leaving the base characters.

    The text is decomposed with NFKD normalization, which splits an accented
    letter into its base letter followed by combining marks (and also expands
    compatibility characters); the combining marks are then dropped, e.g.
    "café" -> "cafe".

    Values that are not strings -- such as the NaN that pandas uses for a
    missing transliteration -- are returned unchanged instead of raising
    TypeError, so the function can be applied to a column that has gaps.
    """
    if not isinstance(text, str):
        return text
    # Decompose combined letters into base letters plus combining marks
    normalized_text = unicodedata.normalize('NFKD', text)
    # unicodedata.combining() is non-zero exactly for combining marks
    return ''.join(c for c in normalized_text if not unicodedata.combining(c))

In [13]:
# Add a column holding each transliteration with its diacritical marks removed.
translit["plain_transliteration"] = translit["transliteration"].apply(remove_diacritics)

In [14]:
# Show (rows, columns) of the transliteration table.
translit.shape

(8723, 3)

### Join the transliterations back onto the audit results

In [15]:
# Attach the transliterations to the audit rows.
# * transliterate.csv grows by appending, so the same word can occur in it
#   more than once. In a left join every duplicate on the right repeats the
#   matching audit row, so only one row per word is kept for the lookup.
# * The words that were transliterated came from whitespace-stripped
#   panchayat names, so the join key is stripped the same way; joining on
#   the raw column would miss names with leading or trailing spaces.
translit_lookup = translit.drop_duplicates(subset='original_word', keep='first')
soc_audit_translit = (
    df.assign(_panchayat_key=df['panchayat'].str.strip())
    .merge(translit_lookup, left_on='_panchayat_key', right_on='original_word', how="left")
    .drop(columns='_panchayat_key')  # helper key only; keep the original columns
)
soc_audit_translit

Unnamed: 0,state,district,block,panchayat,sa_start_date,sa_end_date,gram_sabha_date,public_hearing_date,sa_period_from,sa_period_to,...,gpc_worksite_facilities.is_women_worker_been_appointed_in_places_with_more_than_5_children,gpc_personnel_&_training.have_all_mates_been_selected_through_gram_sabha/state_norms,gpc_personnel_&_training.have_mates_been_trained,gpc_personnel_&_training.is_there_adequate_manpower_to_implement_mgnrega_at_panchayat_level,gpc_personnel_&_training.is_there_a_person_in_charge_of_mgnrega_at_the_panchayat_level_(not_holding_additional_charge),gpc_personnel_&_training.are_the_personnel_in_charge_of_mgnrega_at_the_panchayat_level_been_trained,gpc_personnel_&_training.does_the_gram_panchayat_have_adequate_technical_support_personnel,original_word,transliteration,plain_transliteration
0,RAJASTHAN,NAGAUR,DEGANA,आंतरोली कलां,19/06/2024,23/06/2024,24/06/2024,24/06/2024,01/04/2023,31/03/2024,...,No,Mostly,Mostly,Between 50% and 75%,Yes,Mostly,Yes,आंतरोली कलां,antaroli kalan,antaroli kalan
1,MEGHALAYA,EAST KHASI HILLS,MAWPHLANG,Lawkhla Mawlong,31/07/2023,03/08/2023,04/08/2023,29/02/2024,01/10/2022,31/03/2023,...,No,Yes,Yes,Greater than 75%,Yes,Yes,Yes,,,
2,MEGHALAYA,EAST KHASI HILLS,KHADARSHNONG-LAITKROH,Myiong,26/06/2023,28/06/2023,28/06/2023,26/07/2023,01/10/2022,31/03/2023,...,No,Yes,No,No,Yes,Yes,Yes,,,
3,UTTAR PRADESH,KANNAUJ,TALGRAM,Terajaket,22/07/2019,24/07/2019,24/07/2019,01/08/2019,01/04/2018,31/03/2019,...,No,Yes,Yes,Greater than 75%,Yes,Yes,Yes,,,
4,BIHAR,AURANAGABAD,GOH,MIRPUR,02/11/2022,06/11/2022,07/11/2022,07/11/2022,01/04/2021,31/03/2022,...,No,No,No,No,Yes,Yes,Yes,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297246,ANDHRA PRADESH,ANANTAPUR,Anantapur,RAJIV COLONY,19/02/2023,21/02/2023,21/02/2023,23/02/2023,01/04/2021,31/03/2022,...,No,Some,Some,Between 25% and 50%,No,No,Yes,,,
297247,RAJASTHAN,BANSWARA,BAGEEDAURA,चोखला,06/02/2023,12/02/2023,13/02/2023,16/02/2023,01/04/2020,31/03/2022,...,Yes,Yes,Yes,Greater than 75%,No,Yes,Yes,चोखला,chokhla,chokhla
297248,ASSAM,KAMRUP (METRO),CHANDRAPUR,Panikhati,12/02/2020,18/02/2020,18/02/2020,,01/04/2018,31/03/2019,...,No,No Mates,No,No,Yes,Yes,Yes,,,
297249,UTTAR PRADESH,SULTANPUR,DHANPATGANJ,BISAWAN,13/06/2022,15/06/2022,15/06/2022,30/07/2022,01/04/2021,31/03/2022,...,No,No Mates,No,No,No,No,No,,,


In [16]:
# Discard rows that have no state recorded, then show the remaining (rows, columns).
soc_audit_translit = soc_audit_translit.dropna(subset=['state'])
soc_audit_translit.shape

(295315, 94)

In [17]:
# Save the joined table as CSV, leaving out the DataFrame index.
soc_audit_translit.to_csv(
    "../data/final_audit_results_translit.csv",
    index=False,
)

In [18]:
# Save the same table as a snappy-compressed Parquet file, again without the index.
soc_audit_translit.to_parquet(
    "../data/final_audit_results_translit.parquet",
    compression='snappy',
    index=False,
)