In [1]:
import pandas as pd
from functools import reduce
import os
import zipfile

### Load data

In [2]:
# Read the 2005 and 2010 sarpanch CSVs. low_memory=False makes pandas parse
# each file in one pass instead of chunk-by-chunk.
up_2005, up_2010 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "../data/up_gp_sarpanch_2005_fixed.csv",
        "../data/up_gp_sarpanch_2010_fixed.csv",
    )
)

In [3]:
# 2015
# The 2015 data is spread over several CSV files in one folder: read every
# '.csv' file found there and stack them into a single frame.
folder_path = "../data/2015"

# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order of the concatenated frame reproducible between runs.
dataframes = [
    pd.read_csv(os.path.join(folder_path, file_name))
    for file_name in sorted(os.listdir(folder_path))
    if file_name.endswith('.csv')
]

up_2015 = pd.concat(dataframes, ignore_index=True)

# Hindi column headers -> English snake_case names used by the later cells.
column_transliteration_map = {
    'ब्लॉक': 'block_name',
    'ग्राम पंचायत': 'gp_name',
    'पद का आरक्षण': 'gp_reservation_status',
    'उम्मीदवार': 'candidate',
    'पिता/पति': 'father_husband',
    'प्रत्याशी का आरक्षण': 'candidate_reservation',
    'शैक्षिक योग्यता': 'educational_qualification',
    'लिंग': 'sex',
    'मोबाइल नं०': 'mobile_number',
    'प्राप्त वैध मत': 'valid_votes_received',
    'प्राप्त मत %': 'votes_received_percent',
    'मतदान %': 'voting_percent',
    'परिणाम': 'result',
    'जिला पंचायत': 'district_panchayat',
    'क्षेत्र पंचायत वार्ड': 'area_panchayat_ward',
    'जिला पंचायत वार्ड': 'district_panchayat_ward',
    'जिला': 'district_name',
    'क्षेत्र पंचायत': 'area_panchayat'
}

up_2015 = up_2015.rename(columns=column_transliteration_map)

In [4]:
# 2021
# The 2021 data is stored as a zip archive; read the CSV directly from it
# without extracting to disk.
with zipfile.ZipFile("../data/up_gram_panchayat_pradhan_2021.csv.zip", 'r') as archive:
    # Keep '.csv' members, ignoring names that start with '__MACOSX'
    # (presumably macOS archive metadata).
    csv_members = []
    for member_name in archive.namelist():
        if member_name.endswith('.csv') and not member_name.startswith('__MACOSX'):
            csv_members.append(member_name)

    # Only the first matching member is read.
    with archive.open(csv_members[0]) as csv_handle:
        up_2021 = pd.read_csv(csv_handle, encoding='utf-8')

# Source column names -> the names used by the later cells.
column_transliteration_map = {
    'zila': 'district_name',
    'block': 'block_name',
    'candidate_name_2021': 'candidate',
    'father_husband_name_2021': 'father_husband',
    'gram_panchayat': 'gp_name',
    'gender_2021': 'sex',
    'age_2021': 'age',
    'education_2021': 'education',
    'caste_2021': 'candidate_res_status',
}

up_2021 = up_2021.rename(columns=column_transliteration_map)

### Load and merge transliterations

In [5]:
# District and block transliteration tables; the next cell reads their
# 'hindi' and 'eng' columns to build the lookup dictionary.
district_translit, block_translit = map(
    pd.read_csv,
    (
        "../data/transliteration/district_official_hindi_english.csv",
        "../data/transliteration/block_name_transliteration.csv",
    ),
)

In [6]:
# Build a single source-name -> English lookup dictionary from all
# transliteration tables.
def _translit_dict(frame, key_col, value_col):
    """Return a ``{key: value}`` dict built from two columns of ``frame``.

    Rows where either column is missing are dropped, so a missing value can
    never overwrite a valid entry from another table when the dicts are
    merged. Keys and values are both whitespace-stripped: the mapping cells
    strip the text they look up, so unstripped keys would never match.
    """
    pairs = frame[[key_col, value_col]].dropna()
    keys = pairs[key_col].astype(str).str.strip()
    values = pairs[value_col].astype(str).str.strip()
    return dict(zip(keys, values))


# NOTE(review): `up_2005_trans` and `up_2010_trans` are not created by any cell
# visible in this notebook, and the saved output of this cell is a NameError.
# They must be loaded (with 'Name' and 'Transliterated' columns) before this
# cell can run; their source files are not identifiable from here.
mapping_dict_1 = _translit_dict(up_2005_trans, 'Name', 'Transliterated')
mapping_dict_2 = _translit_dict(up_2010_trans, 'Name', 'Transliterated')
mapping_dict_3 = _translit_dict(district_translit, 'hindi', 'eng')
mapping_dict_4 = _translit_dict(block_translit, 'hindi', 'eng')

# On duplicate keys the later table wins (dict union keeps the right operand).
merged_dict = reduce(lambda x, y: x | y, [mapping_dict_1, mapping_dict_2, mapping_dict_3, mapping_dict_4])
len(merged_dict)

NameError: name 'up_2005_trans' is not defined

### Map

In [None]:
# Add 'gp_name_eng' to every year's frame by looking the stripped GP name up
# in merged_dict. 2005/2010 read the name from 'gp_name_fin', 2015/2021 from
# 'gp_name'. Names without a dictionary entry become NaN.
for frame, source_col in (
    (up_2005, 'gp_name_fin'),
    (up_2010, 'gp_name_fin'),
    (up_2015, 'gp_name'),
    (up_2021, 'gp_name'),
):
    frame['gp_name_eng'] = frame[source_col].str.strip().map(merged_dict)

In [None]:
# Per year (2005, 2010, 2015, 2021): number of rows with a non-null
# 'gp_name_eng'.
for frame in (up_2005, up_2010, up_2015, up_2021):
    print(frame['gp_name_eng'].notna().sum())

In [None]:
# Add 'district_name_eng' to every year's frame by looking the stripped
# 'district_name' up in merged_dict; unmatched names become NaN.
for frame in (up_2005, up_2010, up_2015, up_2021):
    frame['district_name_eng'] = frame['district_name'].str.strip().map(merged_dict)

In [None]:
# Per year (2005, 2010, 2015, 2021): number of rows with a non-null
# 'district_name_eng'.
for frame in (up_2005, up_2010, up_2015, up_2021):
    print(frame['district_name_eng'].notna().sum())

In [None]:
# Add 'block_name_eng' to every year's frame by looking the stripped
# 'block_name' up in merged_dict; unmatched names become NaN.
for frame in (up_2005, up_2010, up_2015, up_2021):
    frame['block_name_eng'] = frame['block_name'].str.strip().map(merged_dict)

In [None]:
# Per year (2005, 2010, 2015, 2021): number of rows with a non-null
# 'block_name_eng'.
for frame in (up_2005, up_2010, up_2015, up_2021):
    print(frame['block_name_eng'].notna().sum())

In [None]:
# Add 'elected_sarpanch_name_eng' to every year's frame by looking the
# stripped person name up in merged_dict; unmatched names become NaN.
# 2005 and 2010 read the name from 'elected_sarpanch_name'.
up_2005['elected_sarpanch_name_eng'] = up_2005['elected_sarpanch_name'].str.strip().map(merged_dict).str.strip()
up_2010['elected_sarpanch_name_eng'] = up_2010['elected_sarpanch_name'].str.strip().map(merged_dict).str.strip()
# The 2015 and 2021 frames were renamed on load so that the person's name is
# in 'candidate'; the rename maps never produce an 'elected_sarpanch_name'
# column. The output column name is kept the same across years so the later
# cells can treat all four frames uniformly.
up_2015['elected_sarpanch_name_eng'] = up_2015['candidate'].str.strip().map(merged_dict).str.strip()
up_2021['elected_sarpanch_name_eng'] = up_2021['candidate'].str.strip().map(merged_dict).str.strip()

In [None]:
# Per year (2005, 2010, 2015, 2021): number of rows with a non-null
# 'elected_sarpanch_name_eng'.
for frame in (up_2005, up_2010, up_2015, up_2021):
    print(frame['elected_sarpanch_name_eng'].notna().sum())

In [None]:
# Add 'husband_spouse_name_eng' to the 2010, 2015 and 2021 frames (2005 is
# not touched here). 2010 reads the name from 'husband_spouse_name'; 2015 and
# 2021 read it from 'father_husband'. Unmatched names become NaN.
for frame, source_col in (
    (up_2010, 'husband_spouse_name'),
    (up_2015, 'father_husband'),
    (up_2021, 'father_husband'),
):
    frame['husband_spouse_name_eng'] = frame[source_col].str.strip().map(merged_dict).str.strip()

In [None]:
# Write the 2005 and 2010 frames to parquet in the same folder their source
# CSVs were read from.
up_2005.to_parquet("../data/up_gp_sarpanch_2005_fixed_with_transliteration.parquet", index = False)
up_2010.to_parquet("../data/up_gp_sarpanch_2010_fixed_with_transliteration.parquet", index = False)

# 2015 coercion
# These columns are cast to str before writing (presumably because they hold
# mixed types that parquet cannot serialise -- TODO confirm).
# NOTE(review): astype(str) also turns missing values into the literal string
# 'nan'; downstream readers must treat 'nan' as missing.
columns_to_convert = ['mobile_number', 'valid_votes_received', 'votes_received_percent', 'voting_percent']
up_2015[columns_to_convert] = up_2015[columns_to_convert].astype(str)

# to_parquet does not create missing parent directories, and nothing earlier
# in the notebook creates this output folder, so make sure it exists first.
os.makedirs("../data/fin", exist_ok=True)
up_2015.to_parquet("../data/fin/up_gp_sarpanch_2015_fixed_with_transliteration.parquet", index = False)
up_2021.to_parquet("../data/fin/up_gp_sarpanch_2021_fixed_with_transliteration.parquet", index = False)