### Retrieving extra info about 2023 candidates from myneta.info

In [5]:
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import requests
import re
import time
import asyncio
from math import ceil
import numpy as np
import os
import sys
import json

# Base directory of the project, read from the KEA_BASE_DIR environment variable.
# It is appended to sys.path (presumably so the local `cleaning` module imported
# below can be found) and is later used to build data-file paths.
# NOTE(review): os.getenv returns None when the variable is unset — confirm it is always exported.
DIR = os.getenv('KEA_BASE_DIR')
sys.path.append(DIR)

from cleaning import myneta18_const_corrector, myneta23_df_cleaner, rmv_dspace

In [2]:
# Retrieving the HTML code from the main 2023 candidates webpage.
# Browser-like headers; the same dict is reused by http_get_sync for every detail page.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51',
          'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"}

# Use the same (connect, read) timeout as http_get_sync so a stalled connection
# cannot hang the notebook, and fail loudly on an HTTP error instead of parsing
# an error page into an empty candidate list.
response = requests.get('https://myneta.info/Karnataka2023/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary',headers=headers,timeout=(10.0,15.0))
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

In [3]:
# Accumulator for the scraped data: one (initially empty) list per output column.
# winner_details() appends one value to every list for each candidate page.
temp_dic = {
    column: []
    for column in (
        'constituency',
        'name',
        'party',
        'age',
        'profession',
        'cases',
        'assets',
        'liabilities',
        'education_category',
        'education',
    )
}


# Function to retrieve candidate IDs from main webpage's HTML code
def has_candidate(href):
    """Return True when *href* is a link to a candidate detail page.

    Intended as a filter for ``soup.find_all('a', href=has_candidate)``.
    A missing or empty href yields False.

    Args:
        href: The tag's ``href`` attribute value, or None if the tag has none.

    Returns:
        bool: True if *href* starts with ``candidate.php?candidate_id=``.
    """
    # Raw string so `\.` and `\?` reach the regex engine as written; re.match
    # anchors at the start of the string, like the `^` in a search pattern.
    return bool(href) and re.match(r'candidate\.php\?candidate_id=', href) is not None

# Collect every <a> tag whose href is accepted by has_candidate
candids = soup.find_all(name='a', href=has_candidate)

# Blocking download of a page's raw body
def http_get_sync(url: str):
    """Fetch *url* with the shared request headers and return the response body as bytes.

    Uses a 10 s connect timeout and a 15 s read timeout.
    """
    connect_timeout, read_timeout = 10.0, 15.0
    reply = requests.get(url, headers=headers, timeout=(connect_timeout, read_timeout))
    return reply.content

# Non-blocking wrapper around http_get_sync
async def http_get(url: str):
    """Run http_get_sync(url) in a worker thread and return its result."""
    page_bytes = await asyncio.to_thread(http_get_sync, url)
    return page_bytes


# Return the stripped text of the element produced by `locate`, or `default` when it is absent
def _text_or_default(locate, default):
    try:
        return locate().text.strip(' \n')
    except (AttributeError, IndexError):
        # AttributeError: a find()/child lookup returned None;
        # IndexError: the page had fewer <div>s than expected.
        return default


# Fetching the HTML and parsing it to retrieve necessary info
async def winner_details(index):
    """Scrape one candidate detail page and append its fields to ``temp_dic``.

    Despite the name, this is used for every entry of ``candids`` (all
    candidates as well as winners).

    Args:
        index: Position in the module-level ``candids`` list of the <a> tag
            whose href identifies the candidate page.

    Raises:
        AttributeError, IndexError: If a mandatory element (name, constituency,
            party, age or profession) is missing from the page.

    Side effects:
        Appends exactly one value to every list in the module-level
        ``temp_dic``. Optional fields that are missing fall back to ``'0'``
        (cases) or ``np.nan`` (assets, liabilities, education fields).
    """
    url = 'https://myneta.info/karnataka2023/' + candids[index].attrs['href']
    html = await http_get(url)
    soup = BeautifulSoup(html, 'lxml')

    # Locate the shared containers once instead of re-searching the tree per field.
    header = soup.find('div', class_='grid_9 alpha omega').div
    personal = soup.find('div', {'class':'grid_3 alpha','style':'background:khaki;'})
    education_box = soup.find('div',class_='grid_3 alpha omega left-border-div left-blue-border')
    education_divs = education_box.find_all('div') if education_box is not None else []

    # Build the complete record before touching temp_dic: if a mandatory field
    # is missing the exception is raised here, so the column lists can never
    # end up with different lengths.
    record = {
        'name': header.h2.text.strip(' \n'),
        'constituency': header.h5.text.strip(' \n'),
        'party': header.div.text.strip(' \n'),
        'age': personal.find_all('div',class_='grid_2 alpha')[2].text.strip(' \n'),
        'profession': personal.p.text.strip(' \n'),
        # The red "cases" box is absent on some pages; that is recorded as '0'.
        'cases': _text_or_default(
            lambda: soup.find('div',{'class':'grid_3 alpha left-border-div left-green-border','style':'background-color:red;'}).div.span,
            '0',
        ),
        'assets': _text_or_default(
            lambda: soup.find('div', class_='bottom-border-div red fullWidth').b,
            np.nan,
        ),
        'liabilities': _text_or_default(
            lambda: soup.find('div', class_='bottom-border-div blue fullWidth').b,
            np.nan,
        ),
        'education_category': _text_or_default(lambda: education_divs[0], np.nan),
        'education': _text_or_default(lambda: education_divs[1], np.nan),
    }

    # No await between these appends, so each candidate's row is added as a unit.
    for column, value in record.items():
        temp_dic[column].append(value)
        
        
async def main(start,end):
    """Scrape the candidates at positions start..end-1 of ``candids`` concurrently."""
    pending = []
    for position in range(start, end):
        pending.append(winner_details(position))
    await asyncio.gather(*pending)
    
for i in range(ceil(len(candids)/20)):
    start = i*20
    end = start+20
    if end>len(candids):
        end=len(candids)
    await main(start,end)
    time.sleep(3.5)

In [4]:
# Turn the scraped per-column lists into a DataFrame and print its column summary
df = pd.DataFrame(temp_dic)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2588 entries, 0 to 2587
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   constituency        2588 non-null   object
 1   name                2588 non-null   object
 2   party               2588 non-null   object
 3   age                 2588 non-null   object
 4   profession          2588 non-null   object
 5   cases               2588 non-null   object
 6   assets              2588 non-null   object
 7   liabilities         2588 non-null   object
 8   education_category  2588 non-null   object
 9   education           2588 non-null   object
dtypes: object(10)
memory usage: 202.3+ KB


In [5]:
# Preview the first five raw (uncleaned) rows
df.head(5)

Unnamed: 0,constituency,name,party,age,profession,cases,assets,liabilities,education_category,education
0,KAMPLI (ST) (BELLARY),A DEVADAS,Party:SUCI(C),Age: 55,Self Profession:A Whole time worker of SUCI {C...,0,"Rs 17,68,878",Nil,8th Pass,"SSLC Discontinued, Sri Kampilaraya Girijana Hi..."
1,PULAKESHINAGAR (SC) (B.B.M.P(NORTH)),A CHANDRA KUMAR,Party:IND,Age: 40,Self Profession:Flooring Tile Labour\nSpouse P...,0,"Rs 1,10,000",Nil,5th Pass,7th Standard from S.L.B. Higher Primary School...
2,MUDDEBIHAL (BIJAPUR),K. B. DODAMANI VAKEELARU,Party:BSP,Age: 40,Self Profession:Private Advocate \nSpouse Prof...,0,"Rs 5,00,000","Rs 40,000",Post Graduate,MA in Kuvempu university distance education-2013
3,PAVAGADA (SC) (TUMKUR),B T RAMASUBBAIAH,Party:BhartiyaBahujanKranti Dal,Age: 61,Self Profession:Social Worker\nSpouse Professi...,1,"Rs 1,20,000","Rs 1,24,000",Graduate,BA
4,KRISHNARAJA (MYSORE),T.S. SRIVATHSA,Party:BJP,Age: 56,Self Profession:Contractor\nSpouse Profession:NA,0,"Rs 48,89,436","Rs 1,35,000",Graduate,B.Com. from Banumaya College New Saiyaji Rao R...


In [6]:
def party_cleaner(text:str)->str:
    """Abbreviate the full party names that have a known short form.

    Each pattern is anchored at both ends, so only a value that consists
    solely of the party name is replaced; any other text is returned unchanged.
    """
    abbreviations = (
        ("^Kalyana Rajya Pragathi Paksha$", "KRPP"),
        ("^Sarvodaya Karnataka Paksha$", 'SKP'),
    )
    for pattern, short_name in abbreviations:
        text = re.sub(pattern, short_name, text)
    return text

In [7]:
# Run the project cleaner over the scraped candidates, then shorten the
# long party names with party_cleaner and print the resulting column summary
df = myneta23_df_cleaner(df)
df['party'] = df['party'].apply(party_cleaner)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2588 entries, 0 to 2587
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   constituency           2588 non-null   object  
 1   name                   2588 non-null   object  
 2   party                  2588 non-null   category
 3   age                    2588 non-null   Int32   
 4   profession             2588 non-null   object  
 5   cases                  2588 non-null   int32   
 6   assets                 2574 non-null   Int64   
 7   liabilities            1633 non-null   Float64 
 8   education_category     2588 non-null   object  
 9   education              2588 non-null   object  
 10  constituency_category  2588 non-null   category
 11  district               2588 non-null   object  
 12  self_profession        2588 non-null   object  
 13  spouse_profession      2588 non-null   object  
dtypes: Float64(1), Int32(1), Int64(1), categ

In [8]:
# Preview the first five cleaned rows
df.head(5)

Unnamed: 0,constituency,name,party,age,profession,cases,assets,liabilities,education_category,education,constituency_category,district,self_profession,spouse_profession
0,Kampli,A Devadas,SUCI(C),55,Self Profession:A Whole time worker of SUCI {C...,0,1768878,,8th Pass,"SSLC Discontinued, Sri Kampilaraya Girijana Hi...",ST,Bellary,A Whole time worker of SUCI {C} Agriculturist,A Whole time worker of SUCI {C} retired teache...
1,Pulakeshinagar,A Chandra Kumar,IND,40,Self Profession:Flooring Tile Labour\nSpouse P...,0,110000,,5th Pass,7th Standard from S.L.B. Higher Primary School...,SC,B.B.M.P(North),Flooring Tile Labour,Not Given
2,Muddebihal,K. B. Dodamani Vakeelaru,BSP,40,Self Profession:Private Advocate \nSpouse Prof...,0,500000,40000.0,Post Graduate,MA in Kuvempu university distance education-2013,GEN,Bijapur,Private Advocate,House wife
3,Pavagada,B T Ramasubbaiah,BhartiyaBahujanKranti Dal,61,Self Profession:Social Worker\nSpouse Professi...,1,120000,124000.0,Graduate,BA,SC,Tumkur,Social Worker,Not Given
4,Krishnaraja,T.S. Srivathsa,BJP,56,Self Profession:Contractor\nSpouse Profession:NA,0,4889436,135000.0,Graduate,B.Com. from Banumaya College New Saiyaji Rao R...,GEN,Mysore,Contractor,Not Given


In [9]:
# Remove the raw 'profession' column, then save the candidates table
df.drop(columns=['profession'], inplace=True)
df.to_csv('candidates2023MyNeta.csv')

In [10]:
# Repeat the same procedure above, but for the winners 

html = requests.get('https://myneta.info/Karnataka2023/index.php?action=summary&subAction=winner_analyzed&sort=candidate#summary',headers=headers).text
soup = BeautifulSoup(html, 'lxml')

temp_dic = {
    'constituency':[],
    'name':[],
    'party':[],
    'age':[],
    'profession':[],
    'cases':[],
    'assets':[],
    'liabilities':[],
    'education_category':[],
    'education':[]
}


candids = soup.find_all('a',href=has_candidate)

for i in range(ceil(len(candids)/20)):
    start = i*20
    end = start+20
    if end>len(candids):
        end=len(candids)
    await main(start,end)
    time.sleep(3.5)

In [11]:
# Turn the scraped winners' per-column lists into a DataFrame and print its column summary
winners_df = pd.DataFrame(temp_dic)
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   constituency        223 non-null    object
 1   name                223 non-null    object
 2   party               223 non-null    object
 3   age                 223 non-null    object
 4   profession          223 non-null    object
 5   cases               223 non-null    object
 6   assets              223 non-null    object
 7   liabilities         223 non-null    object
 8   education_category  223 non-null    object
 9   education           223 non-null    object
dtypes: object(10)
memory usage: 17.6+ KB


In [12]:
# Same cleaning steps as for the candidates table: project cleaner first,
# then party-name abbreviation, then a column summary
winners_df = myneta23_df_cleaner(winners_df)
winners_df['party'] = winners_df['party'].apply(party_cleaner)
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   constituency           223 non-null    object  
 1   name                   223 non-null    object  
 2   party                  223 non-null    category
 3   age                    223 non-null    Int32   
 4   profession             223 non-null    object  
 5   cases                  223 non-null    int32   
 6   assets                 223 non-null    Int64   
 7   liabilities            196 non-null    Float64 
 8   education_category     223 non-null    object  
 9   education              223 non-null    object  
 10  constituency_category  223 non-null    category
 11  district               223 non-null    object  
 12  self_profession        223 non-null    object  
 13  spouse_profession      223 non-null    object  
dtypes: Float64(1), Int32(1), Int64(1), categor

In [13]:
# Constituencies that appear among the candidates but not among the winners
np.setdiff1d(df.constituency.unique(),winners_df.constituency.unique())

array(['Sarvagnanagar'], dtype=object)

In [14]:
# Creating a one-row temporary dataframe for the constituency that has no
# winner row (Sarvagnanagar) and appending it to winners_df. Fields that
# are not known are set to pd.NA or 'Not Available'.
# NOTE(review): no 'constituency_category' key is given here although winners_df
# has that column, so the appended row will have a missing value there — confirm intended.
temp_dic = {
    'constituency':['Sarvagnanagar'],
    'name':['K.J. George'],
    'party':['INC'],
    'age':[pd.NA],
    'profession':[pd.NA],
    'cases':[pd.NA],
    'assets':[pd.NA],
    'liabilities':[pd.NA],
    'education_category':['Not Available'],
    'education':['Not Available'],
    # str.title gives the same capitalisation as the cleaned district values ('B.B.M.P(North)')
    'district':[str.title('B.B.M.P(NORTH)')],
    'self_profession':['Not Available'],
    'spouse_profession':['Not Available'],
}
temp_df = pd.DataFrame(temp_dic)
# ignore_index=True renumbers the rows so the appended row continues the RangeIndex
winners_df = pd.concat([winners_df,temp_df],ignore_index=True)

In [15]:
# Remove the raw 'profession' column, then save the winners table
winners_df.drop(columns=['profession'], inplace=True)
winners_df.to_csv('winners2023MyNeta.csv')

In [16]:
# Load the three constituency-level tables used for reconciling winner names:
# the MyNeta 2023 winners saved above, the 2018 ECI table under DIR, and the
# 2023 NDTV table from the working directory. The first CSV column is the index.
const_myneta23 = pd.read_csv("winners2023MyNeta.csv",index_col=0)
const_eci18 = pd.read_csv(f'{DIR}/2018 Elections/constituency2018ECI.csv',index_col=0)
const_ndtv23 = pd.read_csv("constituency2023NDTV.csv",index_col=0)

In [17]:
# Renaming the winning candidates' names in this (MyNeta) dataset from the ECI dataset if the winners were re-elected

# Constituencies flagged as re-elected in the NDTV table, computed once
# instead of re-running the query for every ECI row.
re_elected = set(const_ndtv23.query("is_re_elected==1").constituency.values)

# constituency -> winner name as spelled in the 2018 ECI table, restricted to
# the re-elected constituencies (a later duplicate constituency overrides an earlier one).
eci_re_elected = const_eci18[const_eci18['constituency'].isin(re_elected)]
temp_dic = dict(zip(eci_re_elected['constituency'], eci_re_elected['name']))

for const, eci_name in temp_dic.items():
    # First MyNeta row for this constituency; raises IndexError if there is none.
    index = const_myneta23.loc[const_myneta23.constituency==const].index[0]
    const_myneta23.loc[index,'name'] = eci_name

In [18]:
# Overwrite the winners CSV with the reconciled names
const_myneta23.to_csv('winners2023MyNeta.csv')

In [6]:
# Reload both saved tables (first CSV column is the index) and the two JSON
# lookup tables, education.json and profession.json, stored under DIR
df = pd.read_csv('candidates2023MyNeta.csv', index_col=0)
winners_df = pd.read_csv('winners2023MyNeta.csv', index_col=0)

with open(f'{DIR}/education.json', mode='r', encoding='utf-8') as file:
    education = dict(json.load(file))

with open(f'{DIR}/profession.json', mode='r', encoding='utf-8') as file:
    profession = dict(json.load(file))

In [7]:
# Replace missing education / profession text with the literal 'Unknown' in both tables
text_columns = ['education','self_profession','spouse_profession']
for frame in (df, winners_df):
    frame[text_columns] = frame[text_columns].fillna('Unknown')

In [8]:
# Overwrite education_category in both tables using the education lookup:
# every distinct education string is looked up once and written to all rows that carry it
for frame in (df, winners_df):
    for edu in frame['education'].unique():
        matching_rows = frame['education'] == edu
        frame.loc[matching_rows, 'education_category'] = education[edu]

In [9]:
# Create the two profession-category columns in both tables, pre-filled with 'Unknown'
for frame in (df, winners_df):
    frame[['self_profession_category','spouse_profession_category']] = 'Unknown'

In [10]:
# Fill the profession-category columns: each distinct profession string is
# looked up in the profession table and its entries are joined with ', '.
# Order matches the original cells: candidates (self, spouse), then winners (self, spouse).
column_pairs = (
    ('self_profession', 'self_profession_category'),
    ('spouse_profession', 'spouse_profession_category'),
)
for frame in (df, winners_df):
    for source_column, category_column in column_pairs:
        for prof in frame[source_column].unique():
            matching_rows = frame[source_column] == prof
            frame.loc[matching_rows, category_column] = ', '.join(profession[prof])

In [13]:
# Write both tables, now including the category columns, back to their CSV files
df.to_csv('candidates2023MyNeta.csv')
winners_df.to_csv('winners2023MyNeta.csv')