In [24]:
from openai import OpenAI
import json
import re
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [42]:
def get_api_key(key_file: str = 'openai_key.txt') -> str:
    """
    Read OpenAI API key from file.
    
    Args:
        key_file: Path to file containing the API key
        
    Returns:
        API key as string
        
    Raises:
        FileNotFoundError: if key file doesn't exist
    """
    try:
        with open(key_file, 'r') as f:
            return f.read().strip()
    except FileNotFoundError:
        logging.error(f"API key file {key_file} not found")
        raise

def chunk_list(lst, chunk_size):
    """
    Yields successive chunks of size `chunk_size` from the list `lst`.
    """
    logging.info(f"Chunking list of {len(lst)} words into chunks of up to {chunk_size} words each.")
    for i in range(0, len(lst), chunk_size):
        chunk = lst[i:i + chunk_size]
        logging.info(f"Yielding chunk from index {i} to {i + len(chunk) - 1} (total {len(chunk)} words).")
        yield chunk

def get_transliterations(word_list, model="gpt-3.5-turbo", key_file='openai_key.txt'):
    """
    Queries the OpenAI API to get English transliterations for a list of words.
    
    The prompt instructs the model to return a JSON array where each element is an object
    with two keys: 'original_word' and 'transliteration'. If a transliteration is not
    available, null should be used.
    
    Args:
        word_list: List of words to transliterate
        model: OpenAI model to use (default: "gpt-3.5-turbo")
        key_file: Path to file containing OpenAI API key (default: 'openai_key.txt')
        
    Returns:
        A list of dictionaries (each with keys 'original_word' and 'transliteration')
        
    Raises:
        ValueError: if the returned JSON does not match the expected format
        FileNotFoundError: if key file doesn't exist
    """
    logging.info(f"Requesting transliterations for a chunk of {len(word_list)} words.")
    
    # Get API key and initialize client
    api_key = get_api_key(key_file)
    client = OpenAI(api_key=api_key)
    
    # Construct the prompt
    prompt = (
        "For the following list of Hindi/Devanagari words, provide their English transliterations. "
        "Each transliteration should represent how the word is pronounced using English letters. "
        "Return ONLY a valid JSON array where each element is an object with two keys: "
        "'original_word' and 'transliteration'. "
        "Never return null for transliteration - always provide a romanized version. "
        "Do not include any explanation or additional text.\n\n"
        "Example format:\n"
        "[{\"original_word\":\"भारत\",\"transliteration\":\"Bharat\"},"
        "{\"original_word\":\"नमस्ते\",\"transliteration\":\"Namaste\"}]\n\n"
        "List of words:\n" + 
        "\n".join(word_list)
    )
    
    # Make API request
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an assistant that provides transliterations."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0,
        max_tokens=4096  # Maximum allowed for GPT-3.5-turbo
    )
    
    # Extract and parse response
    answer = response.choices[0].message.content.strip()
    logging.info("Received response from OpenAI API")
    logging.info(f"Raw response content: {answer}")  # Log the raw response
    
    try:
        # Try to clean the response if it contains markdown code blocks
        if answer.startswith('```json'):
            answer = answer.replace('```json', '').replace('```', '').strip()
        elif answer.startswith('```'):
            answer = answer.replace('```', '').strip()
            
        data = json.loads(answer)
        logging.info("Successfully parsed JSON response")
    except json.JSONDecodeError as e:
        logging.error(f"Failed to parse JSON response: {e}")
        logging.error(f"Response content that failed parsing: {answer}")
        raise ValueError(f"API response was not valid JSON. Error: {str(e)}")
    
    # Validate response format and check for nulls
    if not (isinstance(data, list) and 
            all(isinstance(item, dict) and 
                'original_word' in item and 
                'transliteration' in item 
                for item in data)):
        logging.error("Returned JSON does not match the expected format.")
        raise ValueError("Returned JSON does not match the expected format.")
    
    # Check for and warn about nulls
    null_count = sum(1 for item in data if item['transliteration'] is None)
    if null_count > 0:
        logging.warning(f"Found {null_count} null transliterations out of {len(data)} total words.")
        
        # Log some examples of words that got null transliterations
        null_examples = [item['original_word'] for item in data if item['transliteration'] is None][:5]
        logging.warning(f"Example words with null transliterations: {null_examples}")
        
        # Optionally, retry these words in a separate request or raise an error
        if null_count == len(data):
            raise ValueError("All transliterations were null. Check the prompt or try with a smaller batch.")
    
    logging.info("Chunk processed successfully.")
    return data

def filter_non_english_words(word_list):
    """
    Filters a list of words to include only those that are considered non-English.
    
    A word is defined as non-English if it contains any character outside the ASCII range.
    Non-string entries are skipped (with a warning).
    
    Parameters:
        word_list (list): The list of words to filter.
    
    Returns:
        list: A list containing only non-English words.
    """
    logging.info("Starting filtering of non-English words.")
    
    filtered = []
    for word in word_list:
        if not isinstance(word, str):
            logging.warning(f"Encountered non-string value: {word} (type: {type(word)}). Skipping.")
            continue
        if re.search(r'[^\x00-\x7F]', word):
            filtered.append(word)
    
    logging.info("Completed filtering of non-English words.")
    logging.info(f"Total words provided: {len(word_list)}, non-English words found: {len(filtered)}")
    
    return filtered

In [55]:
df = pd.read_csv("../final_audit_results.csv")

In [45]:
api_key_file = "openai_key.txt"

# Let's take out already transliterated
done = pd.read_csv("../data/transliterate.csv")

word_list = filter_non_english_words(df['panchayat'].str.strip().unique())
word_list = list(set(word_list) - set(done['original_word'])) # set diff to done
max_words_per_chunk = 100

combined_results = []
total_chunks = (len(word_list) + max_words_per_chunk - 1) // max_words_per_chunk
logging.info(f"Total chunks to process: {total_chunks}")
    
chunk_counter = 0
for chunk in chunk_list(word_list, max_words_per_chunk):
    chunk_counter += 1
    logging.info(f"Processing chunk {chunk_counter} of {total_chunks}.")
    results = get_transliterations(chunk)
    combined_results.extend(results)

2025-02-08 16:12:42,234 - INFO - Starting filtering of non-English words.
2025-02-08 16:12:42,241 - INFO - Completed filtering of non-English words.
2025-02-08 16:12:42,242 - INFO - Total words provided: 3144, non-English words found: 1768
2025-02-08 16:12:42,243 - INFO - Total chunks to process: 18
2025-02-08 16:12:42,244 - INFO - Chunking list of 1768 words into chunks of up to 100 words each.
2025-02-08 16:12:42,245 - INFO - Yielding chunk from index 0 to 99 (total 100 words).
2025-02-08 16:12:42,245 - INFO - Processing chunk 1 of 18.
2025-02-08 16:12:42,246 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:13:24,968 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:13:24,978 - INFO - Received response from OpenAI API
2025-02-08 16:13:24,979 - INFO - Raw response content: [
{"original_word":"खुन्दनी हाला","transliteration":"Khundani Hala"},
{"original_word":"रोडवाल","transliteration":"Rodwal"},
{"origina

2025-02-08 16:13:42,291 - INFO - Successfully parsed JSON response
2025-02-08 16:13:42,292 - INFO - Chunk processed successfully.
2025-02-08 16:13:42,295 - INFO - Yielding chunk from index 200 to 299 (total 100 words).
2025-02-08 16:13:42,296 - INFO - Processing chunk 3 of 18.
2025-02-08 16:13:42,299 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:14:26,948 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:14:26,962 - INFO - Received response from OpenAI API
2025-02-08 16:14:26,963 - INFO - Raw response content: [
{"original_word":"मोठपुर","transliteration":"Mothpur"},
{"original_word":"छीपाबड‍ौद","transliteration":"Chipabaud"},
{"original_word":"काकडदा","transliteration":"Kakadda"},
{"original_word":"शुभधरा","transliteration":"Shubhadhara"},
{"original_word":"कुपड़ां","transliteration":"Kupdaan"},
{"original_word":"सुदाबेरी","transliteration":"Sudaberi"},
{"original_word":"देवगढ़","transliteration":"Devgar

2025-02-08 16:14:44,905 - INFO - Successfully parsed JSON response
2025-02-08 16:14:44,906 - INFO - Chunk processed successfully.
2025-02-08 16:14:44,907 - INFO - Yielding chunk from index 400 to 499 (total 100 words).
2025-02-08 16:14:44,908 - INFO - Processing chunk 5 of 18.
2025-02-08 16:14:44,908 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:15:01,016 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:15:01,036 - INFO - Received response from OpenAI API
2025-02-08 16:15:01,038 - INFO - Raw response content: [
{"original_word":"भूडा","transliteration":"Bhuda"},
{"original_word":"मीठड़ा","transliteration":"Meethra"},
{"original_word":"दूदवा","transliteration":"Dudva"},
{"original_word":"सांजटा","transliteration":"Sanjata"},
{"original_word":"भामोलाव","transliteration":"Bhamolav"},
{"original_word":"नरवर","transliteration":"Narvar"},
{"original_word":"धोरीमन्ना","transliteration":"Dhorimanna"},
{"origina

2025-02-08 16:15:18,860 - INFO - Successfully parsed JSON response
2025-02-08 16:15:18,879 - INFO - Chunk processed successfully.
2025-02-08 16:15:18,880 - INFO - Yielding chunk from index 600 to 699 (total 100 words).
2025-02-08 16:15:18,880 - INFO - Processing chunk 7 of 18.
2025-02-08 16:15:18,880 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:15:38,296 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:15:38,300 - INFO - Received response from OpenAI API
2025-02-08 16:15:38,301 - INFO - Raw response content: [
{"original_word":"कडैयाहाट","transliteration":"Kadaiyahat"},
{"original_word":"कोटडा","transliteration":"Kotda"},
{"original_word":"खुंटा गलिया","transliteration":"Khunta Galiya"},
{"original_word":"बरजड़िया","transliteration":"Barjariya"},
{"original_word":"बड़नावा जागीर","transliteration":"Badnava Jagir"},
{"original_word":"आडेल","transliteration":"Adel"},
{"original_word":"सरेड़ी बडी","translite

2025-02-08 16:15:55,281 - INFO - Successfully parsed JSON response
2025-02-08 16:15:55,284 - INFO - Chunk processed successfully.
2025-02-08 16:15:55,285 - INFO - Yielding chunk from index 800 to 899 (total 100 words).
2025-02-08 16:15:55,286 - INFO - Processing chunk 9 of 18.
2025-02-08 16:15:55,287 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:16:11,763 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:16:11,771 - INFO - Received response from OpenAI API
2025-02-08 16:16:11,773 - INFO - Raw response content: [
{"original_word":"रेडाना","transliteration":"Redana"},
{"original_word":"लोहिड़ी","transliteration":"Lohri"},
{"original_word":"खोखसर पश्चिम","transliteration":"Khokhsar Pashchim"},
{"original_word":"हिंगवाडा","transliteration":"Hingwada"},
{"original_word":"खजुरी","transliteration":"Khajuri"},
{"original_word":"सुरडिया","transliteration":"Surdiya"},
{"original_word":"मोठूका","transliteration":"M

2025-02-08 16:16:30,597 - INFO - Successfully parsed JSON response
2025-02-08 16:16:30,599 - INFO - Chunk processed successfully.
2025-02-08 16:16:30,600 - INFO - Yielding chunk from index 1000 to 1099 (total 100 words).
2025-02-08 16:16:30,601 - INFO - Processing chunk 11 of 18.
2025-02-08 16:16:30,602 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:16:45,188 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:16:45,194 - INFO - Received response from OpenAI API
2025-02-08 16:16:45,195 - INFO - Raw response content: [{"original_word":"मियाडा","transliteration":"Miyada"},{"original_word":"डूंगरा खुर्द","transliteration":"Doongara Khurd"},{"original_word":"बख्तपुरा","transliteration":"Bakhtapura"},{"original_word":"कुण्डला","transliteration":"Kundala"},{"original_word":"झडवासा","transliteration":"Jhadvasa"},{"original_word":"भलरो का बाड़ा","transliteration":"Bhalro ka Bada"},{"original_word":"झड़स","transliter

2025-02-08 16:17:03,262 - INFO - Successfully parsed JSON response
2025-02-08 16:17:03,263 - INFO - Chunk processed successfully.
2025-02-08 16:17:03,263 - INFO - Yielding chunk from index 1200 to 1299 (total 100 words).
2025-02-08 16:17:03,263 - INFO - Processing chunk 13 of 18.
2025-02-08 16:17:03,264 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:17:19,030 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:17:19,044 - INFO - Received response from OpenAI API
2025-02-08 16:17:19,045 - INFO - Raw response content: [
{"original_word":"कुण्डी","transliteration":"Kundi"},
{"original_word":"भावपुरा","transliteration":"Bhavpura"},
{"original_word":"सिलोर","transliteration":"Silor"},
{"original_word":"बडाखेडा","transliteration":"Badakheda"},
{"original_word":"मजल","transliteration":"Majal"},
{"original_word":"लामगरा","transliteration":"Lamgara"},
{"original_word":"देरासर","transliteration":"Derasar"},
{"origin

2025-02-08 16:17:35,082 - INFO - Successfully parsed JSON response
2025-02-08 16:17:35,082 - INFO - Chunk processed successfully.
2025-02-08 16:17:35,082 - INFO - Yielding chunk from index 1400 to 1499 (total 100 words).
2025-02-08 16:17:35,083 - INFO - Processing chunk 15 of 18.
2025-02-08 16:17:35,083 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:17:50,980 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:17:50,987 - INFO - Received response from OpenAI API
2025-02-08 16:17:50,988 - INFO - Raw response content: ```json
[
    {"original_word":"देवरिया","transliteration":"Devariya"},
    {"original_word":"छीन‍ोद","transliteration":"Chinod"},
    {"original_word":"खोखसर","transliteration":"Khokhsar"},
    {"original_word":"बिलासर","transliteration":"Bilasar"},
    {"original_word":"खरखडा रामलोथान","transliteration":"Kharkhada Ramlothan"},
    {"original_word":"कड़दा","transliteration":"Kadda"},
    {"orig

2025-02-08 16:18:08,480 - INFO - Successfully parsed JSON response
2025-02-08 16:18:08,481 - INFO - Chunk processed successfully.
2025-02-08 16:18:08,482 - INFO - Yielding chunk from index 1600 to 1699 (total 100 words).
2025-02-08 16:18:08,484 - INFO - Processing chunk 17 of 18.
2025-02-08 16:18:08,485 - INFO - Requesting transliterations for a chunk of 100 words.
2025-02-08 16:18:26,933 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-08 16:18:26,935 - INFO - Received response from OpenAI API
2025-02-08 16:18:26,935 - INFO - Raw response content: ```json
[
    {"original_word":"भुरटिया","transliteration":"Bhuratiya"},
    {"original_word":"माकोद","transliteration":"Makod"},
    {"original_word":"चिमरावली गोड","transliteration":"Chimaravali God"},
    {"original_word":"स्वांस","transliteration":"Swans"},
    {"original_word":"घाटोल","transliteration":"Ghatol"},
    {"original_word":"अमरथुन","transliteration":"Amarthun"},
    {"original_w

2025-02-08 16:18:38,476 - INFO - Successfully parsed JSON response
2025-02-08 16:18:38,476 - INFO - Chunk processed successfully.


In [52]:
res_df = pd.DataFrame(combined_results)
res_df['original_word'] = df['original_word'].str.strip()
res_df['transliteration'] = df['transliteration'].str.strip().str.lower()

In [53]:
res_df.head()

Unnamed: 0,original_word,transliteration
0,खुन्दनी हाला,khundani hala
1,रोडवाल,rodwal
2,कोटडापार,kotdapar
3,बलाऊ,balau
4,कनाईकला,kanaikala


In [54]:
res_df.to_csv("../data/transliterate.csv", index = False)

### Let's join back the results

In [59]:
soc_audit_translit = df.merge(res_df, left_on='panchayat', right_on='original_word')
soc_audit_translit

Unnamed: 0,state,district,block,panchayat,sa_start_date,sa_end_date,gram_sabha_date,public_hearing_date,sa_period_from,sa_period_to,...,total_expense,job_cards_with_people,job_cards_updated,job_cards_renewed,demand_registration_process,unmet_demand,payment_agency_problems,source_file,original_word,transliteration
0,RAJASTHAN,BANSWARA,SAJJANGARH,खुन्दनी हाला,11/09/2023,16/09/2023,21/09/2023,21/09/2023,01/04/2022,31/03/2023,...,300.0,Greater than 75%,Yes,Yes,Mostly,"Yes, Some Demand",No,27_2728_2728007_2728007279_2022-2023_9_21_2023...,खुन्दनी हाला,khundani hala
1,RAJASTHAN,ALWAR,NEEMRANA,रोडवाल,09/08/2024,14/08/2024,16/08/2024,16/08/2024,01/04/2023,31/03/2024,...,0.0,Between 50% and 75%,Yes,Yes,Some,"Yes, Some Demand",No,27_2706_2706008_2706008272_2023-2024_8_16_2024...,रोडवाल,rodwal
2,RAJASTHAN,BARAN,CHHABARA,कोटडापार,14/09/2024,19/09/2024,20/09/2024,20/09/2024,01/04/2023,31/03/2024,...,3000.0,Between 50% and 75%,Yes,Yes,Mostly,"Yes, Some Demand",No,27_2731_2731006_2731006164_2023-2024_9_20_2024...,कोटडापार,kotdapar
3,RAJASTHAN,BARMER,BARMER,बलाऊ,01/03/2020,05/03/2020,05/03/2020,05/03/2020,01/04/2019,30/09/2019,...,1125.0,Greater than 75%,Yes,Yes,Yes,"No, people get work when they want it",No,27_2707_2717002_2717002120_2019-2020_3_5_2020_...,बलाऊ,balau
4,RAJASTHAN,AJMER,BHINAY,कनाईकला,06/01/2023,12/01/2023,13/01/2023,13/01/2023,01/04/2020,31/03/2022,...,16000.0,Greater than 75%,Yes,Yes,Mostly,"Yes, Some Demand",No,27_2721_2721002_2721002044_2021-2022_1_13_2023...,कनाईकला,kanaikala
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8701,RAJASTHAN,BARMER,Dhanaau,पुंजासर,27/12/2023,28/12/2023,29/12/2023,29/12/2023,01/04/2022,31/03/2023,...,12900.0,Between 50% and 75%,Yes,No,Mostly,"No, people get work when they want it",No,27_2717_2717011_2717004208_2022-2023_12_29_202...,पुंजासर,punjasar
8702,RAJASTHAN,BARMER,Samdadi,रानी देशीपुरा,19/01/2023,24/01/2023,25/01/2023,25/01/2023,01/04/2020,31/03/2021,...,4000.0,Greater than 75%,Yes,Yes,Yes,"Yes, Some Demand",No,27_2717_2717016_2717008369_2020-2021_1_25_2023...,रानी देशीपुरा,rani deshipura
8703,RAJASTHAN,ALWAR,NEEMRANA,कान्हावास,01/12/2024,06/12/2024,07/12/2024,07/12/2024,01/04/2024,30/09/2024,...,450.0,Between 50% and 75%,Yes,Yes,Mostly,"Yes, Some Demand",No,27_2706_2706008_2706008257_2024-2025_12_7_2024...,कान्हावास,kanhaavas
8704,RAJASTHAN,ALWAR,TIJARA,शाहबाद,04/10/2024,08/10/2024,09/10/2024,09/10/2024,01/04/2023,31/03/2024,...,0.0,Greater than 75%,Yes,Yes,Mostly,"No, people get work when they want it",No,27_2706_2706013_2706013440_2023-2024_10_9_2024...,शाहबाद,shahabad


In [60]:
soc_audit_translit.to_csv("../data/final_audit_results_translit.csv", index = False)