## 1. Imports and Constants

In [17]:
import pandas as pd
import os
import json

In [18]:
# Root folder of the election data, taken from the KEA_BASE_DIR environment variable.
# NOTE(review): this is None when the variable is unset, in which case every
# f-string path built from it starts with the literal text "None/".
DIR = os.environ.get('KEA_BASE_DIR')

In [19]:
# Column name -> pandas dtype. The same mapping is passed as `dtype=` to every
# reader in this notebook. The capitalised names (Int32 / Int64 / Float64) are
# pandas' nullable extension dtypes.
# Each key appears exactly once: 'total_electors' used to be listed twice, and a
# repeated key in a dict literal silently overrides the earlier entry.
df_config = {
    # Candidate identity and background
    'constituency': 'object',
    'name': 'object',
    'party': 'category',
    'age': 'Int32',
    'gender': 'category',
    'cases': 'Int32',
    'assets': 'Int64',
    'liabilities': 'Int64',
    'education_category': 'category',
    'education': 'object',
    'district': 'object',
    'self_profession': 'object',
    'spouse_profession': 'object',
    'self_profession_category': 'object',
    'spouse_profession_category': 'object',
    # Vote counts and shares
    'total_electors': 'Int64',
    'total_const_votes': 'Int64',
    'votes': 'Int64',
    'candidate_voteshare_percent': 'Float64',
    'constituency_category': 'category',
    'percent_votes': 'Float64',
    'voter_turnout': 'Float64',
    'is_re_elected': 'Int32',
    # Eligible-voter counts
    'eligible_male_voters': 'Int64',
    'eligible_female_voters': 'Int64',
    'eligible_other_voters': 'Int64',
    'eligible_total_voters': 'Int64',
    # Constituency / page numbering
    'const_num': 'Int32',
    'page_num': 'Int32',
    # Voter counts
    'male_voters': 'Int64',
    'female_voters': 'Int64',
    'other_voters': 'Int64',
    'total_voters': 'Int64',
}

## 2. Constituencies and Winners Data

### 2.1 Reading the data

In [20]:
# Constituency-number lookup. The raw JSON keeps its own name so that `const_num`
# is only ever bound to the DataFrame, and the frame can be rebuilt without
# re-reading the file.
with open(f'{DIR}/2013 Elections/const_num.json', 'r', encoding='utf-8') as f:
    const_num_raw = json.load(f)

# The labels of the 'Constituency Name' entry become the `const_num` column and
# its values the `constituency` column.
const_num = (
    pd.Series(const_num_raw['Constituency Name'])
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'const_num', 0: 'constituency'})
)

# Winner background data (MyNeta), one file per election year.
const_myneta23 = pd.read_csv(f'{DIR}/2023 Elections/winners2023MyNeta.csv', index_col=0, dtype=df_config)
const_myneta18 = pd.read_csv(f'{DIR}/2018 Elections/winners2018MyNeta.csv', index_col=0, dtype=df_config)
const_myneta13 = pd.read_csv(f'{DIR}/2013 Elections/winners2013MyNeta.csv', index_col=0, dtype=df_config)

In [21]:
# Constituency-level vote data, one source per election, followed by the 2023
# eligible-voter file. Every reader shares the dtype mapping in `df_config`.
const_opencity13 = pd.read_csv(
    f'{DIR}/2013 Elections/constituency2013OpenCity.csv',
    index_col=0,
    dtype=df_config,
)
const_eci18 = pd.read_csv(
    f'{DIR}/2018 Elections/constituency2018ECI.csv',
    index_col=0,
    dtype=df_config,
)
const_ndtv23 = pd.read_csv(
    f'{DIR}/2023 Elections/constituency2023NDTV.csv',
    index_col=0,
    dtype=df_config,
)
eligible_voters = pd.read_csv(
    f'{DIR}/2023 Elections/eligible_voters_NEW.csv',
    index_col=0,
    dtype=df_config,
)

### 2.2 Merging winner background data with vote data, year by year

In [22]:
 # Merging vote data (from ECI) and winner background data (from MyNeta) for 2018 election

const18 = (
    const_myneta18
    .merge(const_eci18, on='constituency')
    # Columns present in both inputs come back suffixed: keep the MyNeta (_x) copy.
    .drop(columns=['name_y', 'age_y', 'party_y'])
    .rename(columns={'name_x': 'name', 'party_x': 'party', 'age_x': 'age'})
    .merge(const_num, on='constituency')  # Adding constituency number
    .astype({'const_num': 'Int32'})
)

In [23]:
# 2013 election: winner background data (MyNeta) joined with vote data (OpenCity).
const13 = (
    const_myneta13
    .merge(const_opencity13, on='constituency')
    # Columns present in both inputs come back suffixed: keep the MyNeta (_x) copy.
    .drop(columns=['name_y', 'age_y', 'party_y', 'district_y'])
    .rename(columns={
        'name_x': 'name',
        'party_x': 'party',
        'age_x': 'age',
        'district_x': 'district',
        'const_category': 'constituency_category',
    })
    .merge(const_num, on='constituency')  # attach the constituency number
    .astype({'const_num': 'Int32'})
)

In [24]:
# 2023 election: winner background data (MyNeta) joined with vote data (NDTV).
const23 = (
    const_myneta23
    .merge(const_ndtv23, on='constituency')
    # Columns present in both inputs come back suffixed: keep the MyNeta (_x) copy.
    .drop(columns=['name_y', 'age_y', 'party_y'])
    .rename(columns={'name_x': 'name', 'party_x': 'party', 'age_x': 'age'})
    .merge(const_num, on='constituency')  # attach the constituency number
    .astype({'const_num': 'Int32'})
)

### 2.3 Adding and modifying 2018 results for bye-elections between 2018 and 2023

In [25]:
# 2018 constituency results as they stand after the bye-elections.
consteci_18post = pd.read_excel(
    f'{DIR}/2018 Elections/constituency2018ECIpost-bye-elections.xlsx',
    index_col=0,
    dtype=df_config,
)
# Where column names collide, the MyNeta side gets the _x suffix and the
# post-bye-election side gets _y.
const18_post = const_myneta18.merge(consteci_18post, on='constituency')

In [26]:
# Rows where the two sources name a different winner: blank the listed
# background columns so they are not attributed to the other person.
winner_changed = const18_post.name_x != const18_post.name_y
stale_columns = [
    'cases',
    'assets',
    'liabilities',
    'education_category',
    'education',
    'self_profession',
    'spouse_profession',
    'self_profession_category',
    'spouse_profession_category',
    'constituency_category',
]
const18_post.loc[winner_changed, stale_columns] = pd.NA

# From here on the post-bye-election (_y) name, age and party are the ones kept.
const18_post = (
    const18_post
    .drop(columns=['name_x', 'age_x', 'party_x'])
    .rename(columns={'name_y': 'name', 'party_y': 'party', 'age_y': 'age'})
    .merge(const_num, on='constituency')  # attach the constituency number
    .astype({'const_num': 'Int32'})
)

### 2.4 Saving the data for constituencies and winners

In [27]:
####################################### CHANGE BEFORE PRODUCTION ##############################################
# The output location is a hard-coded relative path. `to_csv` does not create
# missing folders, so make sure the folder exists before writing anything.
os.makedirs('../Final Data', exist_ok=True)

const18_post.to_csv('../Final Data/const18_post.csv')
const13.to_csv('../Final Data/const13.csv')
const18.to_csv('../Final Data/const18.csv')
const23.to_csv('../Final Data/const23.csv')
eligible_voters.to_csv('../Final Data/eligible_voters.csv')
const_num.to_csv('../Final Data/const_num.csv')

## 3. Candidates Data

### 3.1 Reading in the data

In [29]:
# Candidate background data (MyNeta), one file per election year.
candid_myneta23 = pd.read_csv(
    f'{DIR}/2023 Elections/candidates2023MyNeta.csv', index_col=0, dtype=df_config
)
candid_myneta18 = pd.read_csv(
    f'{DIR}/2018 Elections/candidates2018MyNeta.csv', index_col=0, dtype=df_config
)
candid_myneta13 = pd.read_csv(
    f'{DIR}/2013 Elections/candidates2013MyNeta.csv', index_col=0, dtype=df_config
)

# Candidate vote data, one source per election year.
candid_opencity13 = pd.read_csv(
    f'{DIR}/2013 Elections/candidates2013OpenCity.csv', index_col=0, dtype=df_config
)
candid_eci18 = pd.read_csv(
    f'{DIR}/2018 Elections/candidates2018ECI.csv', index_col=0, dtype=df_config
)
candid_ndtv23 = pd.read_csv(
    f'{DIR}/2023 Elections/candidates2023NDTV.csv', index_col=0, dtype=df_config
)

# Name mapping, nested as year -> constituency -> {new name: name to replace}.
with open(f'{DIR}/OpenAI/name_merge.json', 'r', encoding='utf-8') as file:
    name_merge = json.load(file)

### 3.2 Changing the names of candidates from votes dataset to match the names in MyNeta dataset

In [30]:
# Votes frame to patch for each year key of `name_merge`.
# NOTE(review): any key other than '2013' or '2018' falls through to the 2023
# (NDTV) frame -- confirm the JSON only ever holds those three years.
votes_by_year = {'2013': candid_opencity13, '2018': candid_eci18}

for year, merges_by_constituency in name_merge.items():
    votes_df = votes_by_year.get(year, candid_ndtv23)
    for constituency, renames in merges_by_constituency.items():
        # Each entry maps the name to write (key) to the name it replaces (value).
        for key, to_change in renames.items():
            is_target = (votes_df.constituency == constituency) & (votes_df.name == to_change)
            votes_df.loc[is_target, 'name'] = key

### 3.3 Merging candidates data from Votes dataset and candidate background dataset (MyNeta)

In [31]:
# Inner join: a candidate is kept only when both sources list the same name in
# the same constituency.
candid13 = (
    candid_myneta13
    .merge(candid_opencity13, on=['name', 'constituency'], how='inner')
    # For party, age and district keep the votes-dataset (_y) copy.
    .drop(columns=['party_x', 'age_x', 'district_x'])
    .rename(columns={'party_y': 'party', 'age_y': 'age', 'district_y': 'district'})
)
candid13.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2807 entries, 0 to 2806
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   constituency                 2807 non-null   object  
 1   name                         2807 non-null   object  
 2   cases                        2791 non-null   Int32   
 3   assets                       2741 non-null   Int64   
 4   liabilities                  1558 non-null   Int64   
 5   education_category           2807 non-null   category
 6   education                    2807 non-null   object  
 7   self_profession              2807 non-null   object  
 8   spouse_profession            2807 non-null   object  
 9   self_profession_category     2807 non-null   object  
 10  spouse_profession_category   2807 non-null   object  
 11  district                     2807 non-null   object  
 12  gender                       2807 non-null   category
 13  age

In [32]:
# Inner join: a candidate is kept only when both sources list the same name in
# the same constituency.
candid18 = (
    candid_myneta18
    .merge(candid_eci18, on=['name', 'constituency'], how='inner')
    # For party and age keep the votes-dataset (_y) copy.
    .drop(columns=['party_x', 'age_x'])
    .rename(columns={'party_y': 'party', 'age_y': 'age'})
)
candid18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2577 entries, 0 to 2576
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   constituency                2577 non-null   object  
 1   name                        2577 non-null   object  
 2   cases                       2576 non-null   Int32   
 3   assets                      2559 non-null   Int64   
 4   liabilities                 1479 non-null   Int64   
 5   education_category          2577 non-null   category
 6   education                   2577 non-null   object  
 7   district                    2577 non-null   object  
 8   self_profession             2577 non-null   object  
 9   spouse_profession           2577 non-null   object  
 10  self_profession_category    2577 non-null   object  
 11  spouse_profession_category  2577 non-null   object  
 12  age                         2552 non-null   Int32   
 13  gender            

In [33]:
# Inner join: a candidate is kept only when both sources list the same name in
# the same constituency.
candid23 = (
    candid_myneta23
    .merge(candid_ndtv23, on=['name', 'constituency'], how='inner')
    # For party keep the votes-dataset (_y) copy.
    .drop(columns=['party_x'])
    .rename(columns={'party_y': 'party'})
)
candid23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2595 entries, 0 to 2594
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   constituency                2595 non-null   object  
 1   name                        2595 non-null   object  
 2   age                         2595 non-null   Int32   
 3   cases                       2595 non-null   Int32   
 4   assets                      2581 non-null   Int64   
 5   liabilities                 1639 non-null   Int64   
 6   education_category          2595 non-null   category
 7   education                   2595 non-null   object  
 8   constituency_category       2595 non-null   category
 9   district                    2595 non-null   object  
 10  self_profession             2595 non-null   object  
 11  spouse_profession           2595 non-null   object  
 12  self_profession_category    2595 non-null   object  
 13  spouse_profession_

### 3.4 Adding and modifying 2018 results for bye-elections

In [34]:
# 2018 candidate vote data as it stands after the bye-elections.
candideci_18post = pd.read_excel(
    f"{DIR}/2018 Elections/candidates2018ECIpost-bye-elections.xlsx",
    index_col=0,
    dtype=df_config,
)

In [35]:
# Rename candidates in the post-bye-election votes data so they match MyNeta,
# reusing the 2018 name mapping.
year = '2018'
df = candideci_18post

# Constituencies whose 2018 mapping is not applied.
# NOTE(review): presumably the seats that held bye-elections -- confirm.
skipped_constituencies = {
    'Athani', 'Chikkaballapur', 'Gokak', 'Hirekerur', 'Hosakote',
    'Hunsur', 'Kagwad', 'K.R.Pura', 'Krishnarajpet', 'Mahalakshmi Layout',
    'Ranibennur', 'Shivajinagar', 'Vijayanagara', 'Yellapur', 'Yeshvanthapura',
}

for constituency, renames in name_merge[year].items():
    if constituency in skipped_constituencies:
        continue
    # Each entry maps the name to write (key) to the name it replaces (value).
    for key, to_change in renames.items():
        is_target = (df.constituency == constituency) & (df.name == to_change)
        df.loc[is_target, 'name'] = key

In [36]:
# Right join: every row of the post-bye-election votes data is kept. The MyNeta
# columns stay empty where no (name, constituency) match exists.
candid18_post = (
    candid_myneta18
    .merge(candideci_18post, on=['name', 'constituency'], how='right')
    # For party and age keep the votes-dataset (_y) copy.
    .drop(columns=['party_x', 'age_x'])
    .rename(columns={'party_y': 'party', 'age_y': 'age'})
)
candid18_post.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2739 entries, 0 to 2738
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   constituency                2739 non-null   object  
 1   name                        2739 non-null   object  
 2   cases                       2430 non-null   Int32   
 3   assets                      2413 non-null   Int64   
 4   liabilities                 1395 non-null   Int64   
 5   education_category          2431 non-null   category
 6   education                   2431 non-null   object  
 7   district                    2431 non-null   object  
 8   self_profession             2431 non-null   object  
 9   spouse_profession           2431 non-null   object  
 10  self_profession_category    2431 non-null   object  
 11  spouse_profession_category  2431 non-null   object  
 12  age                         2498 non-null   Int32   
 13  gender            

### 3.5 Saving the Data

In [37]:
# `to_csv` does not create missing folders, so make sure the output folder
# exists before writing the merged candidate tables.
os.makedirs('../Final Data', exist_ok=True)

candid18_post.to_csv('../Final Data/candid18_post.csv')
candid13.to_csv('../Final Data/candid13.csv')
candid18.to_csv('../Final Data/candid18.csv')
candid23.to_csv('../Final Data/candid23.csv')

In [4]:
import pandas as pd

# Reload the four saved candidate tables, using the first CSV column as the index.
# NOTE: no dtype mapping is passed here, so pandas infers the dtypes again. The
# summary printed below shows float64/object where the frames had Int32/Int64
# and category before they were saved.
candid13, candid18, candid23, candid18_post = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../Final Data/candid13.csv',
        '../Final Data/candid18.csv',
        '../Final Data/candid23.csv',
        '../Final Data/candid18_post.csv',
    )
)

In [10]:
# Column, non-null count and dtype summary of the reloaded 2013 candidates table.
candid13.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2807 entries, 0 to 2806
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   constituency                 2807 non-null   object 
 1   name                         2807 non-null   object 
 2   cases                        2791 non-null   float64
 3   assets                       2741 non-null   float64
 4   liabilities                  1558 non-null   float64
 5   education_category           2807 non-null   object 
 6   education                    2807 non-null   object 
 7   self_profession              2807 non-null   object 
 8   spouse_profession            2807 non-null   object 
 9   self_profession_category     2807 non-null   object 
 10  spouse_profession_category   2807 non-null   object 
 11  district                     2807 non-null   object 
 12  gender                       2807 non-null   object 
 13  age                    

In [14]:
# Columns that the 2018 candidates table has and the 2013 one lacks.
set(candid18.columns) - set(candid13.columns)

{'constituency_category', 'percent_votes'}