## 2015 and 2021 Transliteration Prep

In [1]:
import pandas as pd
import numpy as np
import os
import zipfile
from functools import reduce

### 2021

In [2]:
with zipfile.ZipFile("../data/up_gram_panchayat_pradhan_2021.csv.zip", 'r') as zip_ref:
    csv_file = [f for f in zip_ref.namelist() if f.endswith('.csv') and not f.startswith('__MACOSX')]
    
    with zip_ref.open(csv_file[0]) as a_csv_file:
        up_2021 = pd.read_csv(a_csv_file, encoding='utf-8')

In [3]:
column_transliteration_map = {
    'zila': 'district',
    'candidate_name_2021': 'candidate',
    'father_husband_name_2021': 'father_husband',
}

up_2021 = up_2021.rename(columns=column_transliteration_map)

In [4]:
up_2021.head()

Unnamed: 0,district,block,gram_panchayat,reservation,id,candidate,father_husband,gender_2021,age_2021,education_2021,caste_2021,movable_property,immovable_property,criminal_history_2021,vote_percentage,Unnamed: 15,Unnamed: 16,Unnamed: 17,result
0,अमरोहा,अमरोहा,1-नन्हेड़ा अल्यारपुर,अनुसूचित जाति महिला,356066,मंजू,ओमकार सिंह,महिला,26,प्राईमरी,अनुसूचित जाति,700000,200000,नहीं,46.0,2.56,1.82,जब्त,
1,अमरोहा,अमरोहा,1-नन्हेड़ा अल्यारपुर,अनुसूचित जाति महिला,356517,सुनीता,राजपाल,महिला,37,प्राईमरी,अनुसूचित जाति,210000,250000,नहीं,0.0,0.0,0.0,जब्त,
2,अमरोहा,अमरोहा,1-नन्हेड़ा अल्यारपुर,अनुसूचित जाति महिला,358181,सुमन देवी,छविराम,महिला,36,इंटर,अनुसूचित जाति,250000,3600000,नहीं,12.0,0.67,0.47,जब्त,
3,अमरोहा,अमरोहा,1-नन्हेड़ा अल्यारपुर,अनुसूचित जाति महिला,359116,खजानो,रमेश सिंह,महिला,66,निरक्षर,अनुसूचित जाति,400000,300000,नहीं,33.0,1.84,1.3,जब्त,
4,अमरोहा,अमरोहा,1-नन्हेड़ा अल्यारपुर,अनुसूचित जाति महिला,359870,कमलेश कुमारी,शीशराम,महिला,36,प्राईमरी,अनुसूचित जाति,210000,250000,नहीं,661.0,36.76,26.13,जब्त नहीं,विजेता


In [5]:
selected_up_2021 = up_2021[['district', 'block', 'gram_panchayat', 'candidate', 'father_husband']]

long_up_2021 = pd.melt(selected_up_2021, var_name='Attribute', value_name='Value')
long_up_2021.shape

(1865480, 2)

In [6]:
long_up_2021.replace('', np.nan, inplace=True)
long_up_2021.dropna(subset=['Value'], inplace=True)
long_up_2021.drop_duplicates(subset=['Attribute', 'Value'], inplace=True)

In [7]:
long_up_2021.shape

(216821, 2)

In [8]:
long_up_2021.head()

Unnamed: 0,Attribute,Value
0,district,अमरोहा
4966,district,अमेठी
10215,district,अम्बेडकर नगर
16656,district,अलीगढ़
19807,district,आगरा


In [9]:
long_up_2021.to_csv("../data/transliteration/up_gp_sarpanch_2021_transliterate_prep.csv", index = False)

### 2015

In [10]:
dataframes = []
folder_path = "../data/2015"

for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)
        dataframes.append(df)

up_2015 = pd.concat(dataframes, ignore_index=True)

In [11]:
column_transliteration_map = {
    'ब्लॉक': 'block',
    'ग्राम पंचायत': 'gram_panchayat',
    'पद का आरक्षण': 'reservation_of_position',
    'उम्मीदवार': 'candidate',
    'पिता/पति': 'father_husband',
    'प्रत्याशी का आरक्षण': 'candidate_reservation',
    'शैक्षिक योग्यता': 'educational_qualification',
    'लिंग': 'gender',
    'मोबाइल नं०': 'mobile_number',
    'प्राप्त वैध मत': 'valid_votes_received',
    'प्राप्त मत %': 'votes_received_percent',
    'मतदान %': 'voting_percent',
    'परिणाम': 'result',
    'जिला पंचायत': 'district_panchayat',
    'क्षेत्र पंचायत वार्ड': 'area_panchayat_ward',
    'जिला पंचायत वार्ड': 'district_panchayat_ward',
    'जिला': 'district',
    'क्षेत्र पंचायत': 'area_panchayat'
}

up_2015 = up_2015.rename(columns=column_transliteration_map)

In [12]:
selected_up_2015 = up_2015[['district', 'block', 'gram_panchayat', 'candidate', 'father_husband']]

long_up_2015 = pd.melt(selected_up_2015, var_name='Attribute', value_name='Value')
long_up_2015.shape

(1033165, 2)

In [13]:
long_up_2015.replace('', np.nan, inplace=True)
long_up_2015.dropna(subset=['Value'], inplace=True)
long_up_2015.drop_duplicates(subset=['Attribute', 'Value'], inplace=True)

In [14]:
long_up_2015.shape

(136833, 2)

In [15]:
long_up_2021.head()

Unnamed: 0,Attribute,Value
0,district,अमरोहा
4966,district,अमेठी
10215,district,अम्बेडकर नगर
16656,district,अलीगढ़
19807,district,आगरा


In [16]:
long_up_2015.to_csv("../data/transliteration/up_gp_sarpanch_2015_transliterate_prep.csv", index = False)

In [17]:
up_2015_2021 = pd.concat([long_up_2015, long_up_2021])
up_2015_2021.drop_duplicates(subset=['Attribute', 'Value'], inplace=True)
up_2015_2021.shape

(316060, 2)

In [18]:
# Load existing transliterations
up_2005_trans = pd.read_csv("../data/transliteration/up_gp_sarpanch_2005_transliterate_out.csv", low_memory = False)
up_2010_trans = pd.read_csv("../data/transliteration/up_gp_sarpanch_2010_transliterate_out.csv", low_memory = False)

In [19]:
district_translit = pd.read_csv("../data/transliteration/district_official_hindi_english.csv")
block_translit = pd.read_csv("../data/transliteration/block_name_transliteration.csv")

In [20]:
# Let's convert to dict
mapping_dict_1 = up_2005_trans.set_index('Name')['Transliterated'].str.strip().to_dict()
mapping_dict_2 = up_2010_trans.set_index('Name')['Transliterated'].str.strip().to_dict()
mapping_dict_3 = district_translit.set_index('hindi')['eng'].str.strip().to_dict()
mapping_dict_4 = block_translit.set_index('hindi')['eng'].str.strip().to_dict()

merged_dict = reduce(lambda x, y: x | y, [mapping_dict_1, mapping_dict_2, mapping_dict_3, mapping_dict_4])
len(merged_dict)

79628

In [21]:
df_filtered = up_2015_2021[~up_2015_2021['Value'].isin(merged_dict.keys())]
df_filtered.shape

(282070, 2)

In [22]:
df_filtered.to_csv("../data/transliteration/up_2015_2021_remaining_transliteration_prep.csv", index = False)