In [2]:
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import numpy as np
import requests
import re
import time
import asyncio
from math import ceil
import os
import sys
import json

# Project base directory, taken from the KEA_BASE_DIR environment variable.
# It is put on sys.path right before the `cleaning` import below and is also
# used later to locate education.json / profession.json.
DIR = os.getenv('KEA_BASE_DIR')
if DIR is None:
    # Appending None to sys.path would only add a junk entry and hide the real
    # problem until a later import or open() fails with a confusing message.
    print("KEA_BASE_DIR is not set: the project directory was not added to sys.path "
          "and paths built from DIR will be invalid.", file=sys.stderr)
else:
    sys.path.append(DIR)

from cleaning import myneta18_const_corrector, myneta18_df_cleaner, rmv_dspace

In [2]:
# Retrieving the HTML code from the main 2013 candidates webpage
# Browser-like request headers, reused by every request in this notebook.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51',
          'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"}

# A (connect, read) timeout keeps the cell from hanging forever on a stalled
# connection (same values as http_get_sync uses), and raise_for_status() stops
# here on an HTTP error instead of parsing an error page as if it were the listing.
response = requests.get('https://myneta.info/Karnataka2013/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary',headers=headers,timeout=(15.0,30.0))
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

In [3]:

# Column-oriented accumulator: one independent, initially empty list per
# scraped field. winner_details appends one value to each list per candidate.
temp_dic = {
    column: []
    for column in (
        'constituency',
        'name',
        'party',
        'age',
        'profession',
        'cases',
        'assets',
        'liabilities',
        'education_category',
        'education',
    )
}


# Filter candidate IDs from main webpage's HTML code
# Compiled once; the raw string keeps `\.` and `\?` as regex escapes without
# triggering Python's invalid-escape-sequence warning.
_CANDIDATE_HREF = re.compile(r'^candidate\.php\?candidate_id=')


def has_candidate(href):
    """Tell whether an href points at a candidate detail page.

    Args:
        href: The value of a tag's ``href`` attribute; may be ``None`` or
            empty when the tag has no usable link.

    Returns:
        ``True`` if ``href`` starts with ``candidate.php?candidate_id=``,
        otherwise ``False`` (including for ``None`` and the empty string).
    """
    return bool(href) and _CANDIDATE_HREF.search(href) is not None

# Finding all 'a' tags in the parsed summary page whose href is accepted by
# has_candidate. winner_details later indexes into this list by position.
candids = soup.find_all('a',href=has_candidate)

# Get HTML of the page synchronously
def http_get_sync(url: str):
    """Download ``url`` with a blocking GET and return the body as text.

    The request is sent with the notebook-level ``headers`` and a
    (connect, read) timeout of (15.0, 30.0) seconds.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never handed to the parser as if it were a profile.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url,headers=headers,timeout=(15.0,30.0))
    response.raise_for_status()
    return response.text

# Get HTML of the page Asynchronously
async def http_get(url: str):
    """Fetch ``url`` without blocking the event loop.

    Delegates to the blocking ``http_get_sync`` via ``asyncio.to_thread`` and
    returns whatever that call returns.
    """
    page_text = await asyncio.to_thread(http_get_sync, url)
    return page_text


# Fetching the HTML and parsing it to retrieve necessary info
async def winner_details(index):
    """Scrape one candidate page and append its fields to ``temp_dic``.

    The page URL is built from ``candids[index]``. Name, constituency, party,
    age and profession are mandatory: if their markup is missing the error
    propagates. Cases, assets, liabilities and the two education fields are
    optional and fall back to ``'0'`` (cases) or ``np.nan``.

    All values are extracted first and appended together at the end, with no
    ``await`` in between, so a failure on a mandatory field can no longer
    leave the lists in ``temp_dic`` with different lengths.

    Args:
        index: Position of the candidate link inside ``candids``.

    Returns:
        None. The result is the side effect on ``temp_dic``.
    """
    url = 'https://myneta.info/karnataka2013/' + candids[index].attrs['href']
    html = await http_get(url)
    soup = BeautifulSoup(html, 'lxml')

    def optional_text(locate, default):
        """Return the stripped text from ``locate()``, or ``default`` if the markup is absent."""
        try:
            return locate().strip(' \n')
        except (AttributeError, IndexError):
            # AttributeError: a lookup returned None; IndexError: fewer <div>s than expected.
            return default

    # Look up each container once instead of once per field.
    header = soup.find('div', class_='grid_9 alpha omega').div
    personal = soup.find('div', {'class':'grid_3 alpha','style':'background:khaki;'})
    edu_box = soup.find('div',class_='grid_3 alpha omega left-border-div left-blue-border')

    record = {
        'name': header.h2.text.strip(' \n'),
        'constituency': header.h5.text.strip(' \n'),
        'party': header.div.text.strip(' \n'),
        'age': personal.find_all('div',class_='grid_2 alpha')[2].text.strip(' \n'),
        'profession': personal.p.text.strip(' \n'),
        'cases': optional_text(
            lambda: soup.find('div',{'class':'grid_3 alpha left-border-div left-green-border','style':'background-color:red;'}).div.span.text,
            '0'),
        'assets': optional_text(
            lambda: soup.find('div', class_='bottom-border-div red fullWidth').b.text,
            np.nan),
        'liabilities': optional_text(
            lambda: soup.find('div', class_='bottom-border-div blue fullWidth').b.text,
            np.nan),
        'education_category': optional_text(
            lambda: edu_box.find_all('div')[0].string,
            np.nan),
        'education': optional_text(
            lambda: edu_box.find_all('div')[1].text,
            np.nan),
    }

    # Single synchronous append pass keeps every column the same length.
    for column, value in record.items():
        temp_dic[column].append(value)
        
        
async def main(start,end):
    """Scrape the candidates at positions ``start`` .. ``end - 1`` concurrently.

    Creates one ``winner_details`` coroutine per position and waits for all of
    them with ``asyncio.gather``.
    """
    pending = [winner_details(position) for position in range(start, end)]
    await asyncio.gather(*pending)
    
for i in range(ceil(len(candids)/20)):
    start = i*20
    end = start+20
    if end>len(candids):
        end=len(candids)
    await main(start,end)
    time.sleep(3.5)
    

In [4]:
# Build the raw candidates table from the scraped column lists and inspect
# its shape and dtypes before any cleaning.
df = pd.DataFrame(temp_dic)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2788 entries, 0 to 2787
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   constituency        2788 non-null   object
 1   name                2788 non-null   object
 2   party               2788 non-null   object
 3   age                 2788 non-null   object
 4   profession          2788 non-null   object
 5   cases               2788 non-null   object
 6   assets              2788 non-null   object
 7   liabilities         2788 non-null   object
 8   education_category  2788 non-null   object
 9   education           2788 non-null   object
dtypes: object(10)
memory usage: 217.9+ KB


In [5]:
# Cleaning the DF, extracting features and resolving name conflicts.
# myneta18_df_cleaner comes from the project's `cleaning` module; per the
# .info() output below it returns typed columns and adds `district`,
# `self_profession` and `spouse_profession`.
df = myneta18_df_cleaner(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2788 entries, 0 to 2787
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   constituency        2788 non-null   object  
 1   name                2788 non-null   object  
 2   party               2788 non-null   category
 3   age                 2768 non-null   Int32   
 4   profession          2788 non-null   object  
 5   cases               2788 non-null   int32   
 6   assets              2738 non-null   Int64   
 7   liabilities         1556 non-null   Float64 
 8   education_category  2788 non-null   category
 9   education           2788 non-null   object  
 10  district            2788 non-null   object  
 11  self_profession     2788 non-null   object  
 12  spouse_profession   2788 non-null   object  
dtypes: Float64(1), Int32(1), Int64(1), category(2), int32(1), object(7)
memory usage: 234.7+ KB


In [6]:
# Repeat the same procedure above, but for the winners 

html = requests.get('https://myneta.info/Karnataka2013/index.php?action=summary&subAction=winner_analyzed&sort=candidate#summary',headers=headers).text
soup = BeautifulSoup(html, 'lxml')

temp_dic = {
    'constituency':[],
    'name':[],
    'party':[],
    'age':[],
    'profession':[],
    'cases':[],
    'assets':[],
    'liabilities':[],
    'education_category':[],
    'education':[]
}

candids = soup.find_all('a',href=has_candidate)

for i in range(ceil(len(candids)/25)):
    start = i*25
    end = start+25
    if end>len(candids):
        end=len(candids)
    await main(start,end)
    time.sleep(3)

In [7]:
# Build the raw winners table from the scraped column lists and inspect it.
winners_df = pd.DataFrame(temp_dic)
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   constituency        208 non-null    object
 1   name                208 non-null    object
 2   party               208 non-null    object
 3   age                 208 non-null    object
 4   profession          208 non-null    object
 5   cases               208 non-null    object
 6   assets              208 non-null    object
 7   liabilities         208 non-null    object
 8   education_category  208 non-null    object
 9   education           208 non-null    object
dtypes: object(10)
memory usage: 16.4+ KB


In [8]:
# Apply the same cleaning helper used for the candidates table, then check
# the resulting dtypes and null counts.
winners_df = myneta18_df_cleaner(winners_df)
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   constituency        208 non-null    object  
 1   name                208 non-null    object  
 2   party               208 non-null    category
 3   age                 208 non-null    Int32   
 4   profession          208 non-null    object  
 5   cases               208 non-null    int32   
 6   assets              208 non-null    Int64   
 7   liabilities         177 non-null    Float64 
 8   education_category  208 non-null    category
 9   education           208 non-null    object  
 10  district            208 non-null    object  
 11  self_profession     208 non-null    object  
 12  spouse_profession   208 non-null    object  
dtypes: Float64(1), Int32(1), Int64(1), category(2), int32(1), object(7)
memory usage: 18.1+ KB


In [9]:
# Using this dataset to resolve name conflicts and fill missing values.
# Reads the OpenCity 2013 candidates CSV (indexed by `_id`) and, for each
# constituency, keeps only the row with the most votes:
#   - sort_values('votes', ascending=False).head(1) picks the top vote-getter,
#   - drop('constituency') removes the column that is already the group key,
#   - squeeze() turns the one-row frame into a Series so apply() yields one row per group,
#   - reset_index() brings `constituency` back as a regular column.
const_opencity13 = pd.read_csv('candidates2013OpenCity.csv',index_col='_id').groupby('constituency').apply(lambda x:x.sort_values('votes',ascending=False).head(1).drop('constituency',axis='columns').squeeze()).reset_index()
const_opencity13.head(3)

Unnamed: 0,constituency,district,name,gender,age,party,const_category,total_electors,total_const_votes,votes,candidate_voteshare_percent
0,Afzalpur,Gulbarga,Malikayya Venkayya Guttedar,M,57,INC,GEN,190336,128606,38093,29.62
1,Aland,Gulbarga,B.R. Patil,M,63,KJP,GEN,192986,132385,67085,50.67
2,Anekal,Bangalore Urban,Shivanna B.,M,46,INC,SC,270767,186461,105464,56.56


In [10]:
# Checking discrepancies: constituencies present in the scraped candidates
# table (df) but absent from the OpenCity table (const_opencity13).
np.setdiff1d(df.constituency.unique(),const_opencity13.constituency.unique())

array([], dtype=object)

In [11]:
# Checking discrepancies: constituencies present in the scraped candidates
# table (df) but absent from the scraped winners table (winners_df).
np.setdiff1d(df.constituency.unique(),winners_df.constituency.unique())

array(['Badami', 'Dasarahalli', 'Gurmitkal', 'Heggadadevankote',
       'Jagalur', 'Kanakagiri', 'Kundgol', 'Lingsugur', 'Maski',
       'Melukote', 'Mudigere', 'Nagthan', 'Pavagada', 'Raichur',
       'Shorapur'], dtype=object)

In [12]:
# Checking discrepancies: constituencies present in the OpenCity table
# (const_opencity13) but absent from the scraped winners table (winners_df).
np.setdiff1d(const_opencity13.constituency.unique(),winners_df.constituency.unique())

array(['Badami', 'Dasarahalli', 'Gurmitkal', 'Heggadadevankote',
       'Jagalur', 'Kanakagiri', 'Kundgol', 'Lingsugur', 'Maski',
       'Melukote', 'Mudigere', 'Nagthan', 'Pavagada', 'Raichur',
       'Shorapur', 'Shrirangapattana'], dtype=object)

In [13]:
# Creating a temporary dataframe with the missing values in the winners dataset from votes dataset.
# `missing` holds the constituencies found in const_opencity13 but not in winners_df.
missing = list(np.setdiff1d(const_opencity13.constituency.unique(),winners_df.constituency.unique()))

# Collect one placeholder record per missing constituency and build the frame
# once; growing a DataFrame with pd.concat inside the loop is quadratic and
# concatenating onto an empty frame is deprecated in recent pandas.
placeholder_rows = []
for const in missing:
    # Top vote-getter row for this constituency as a Series.
    srs = const_opencity13.loc[const_opencity13.constituency==const].squeeze()
    placeholder_rows.append({
        'constituency':const,
        'name':srs['name'],
        'party':srs['party'],
        'age':srs['age'],
        # Fields that only the MyNeta scrape provides stay missing.
        'profession':pd.NA,
        'cases':pd.NA,
        'assets':pd.NA,
        'liabilities':pd.NA,
        'education_category':'Not Available',
        'education':'Not Available',
        'district':srs['district'],
        'self_profession':'Not Available',
        'spouse_profession':'Not Available',
    })

# columns=df.columns keeps the column order of the cleaned candidates table;
# dtype=object keeps every column generic, matching the object dtypes shown
# after the later concatenation.
temp_df = pd.DataFrame(placeholder_rows, columns=df.columns, dtype=object)

In [14]:
# Display the placeholder rows built for the constituencies missing from winners_df.
temp_df

Unnamed: 0,constituency,name,party,age,profession,cases,assets,liabilities,education_category,education,district,self_profession,spouse_profession
0,Badami,Chimmanakatti Balappa Bhimappa,INC,62,,,,,Not Available,Not Available,Bagalkot,Not Available,Not Available
1,Dasarahalli,S Muniraju,BJP,55,,,,,Not Available,Not Available,Bangalore Urban,Not Available,Not Available
2,Gurmitkal,Baburao Chinchanasoor,INC,62,,,,,Not Available,Not Available,Yadgir,Not Available,Not Available
3,Heggadadevankote,Chikkamadu S,JD(S),62,,,,,Not Available,Not Available,Mysore,Not Available,Not Available
4,Jagalur,H.P.Rajesh,INC,47,,,,,Not Available,Not Available,Davangere,Not Available,Not Available
5,Kanakagiri,Shivaraj Sangappa Tangadagi,INC,42,,,,,Not Available,Not Available,Koppal,Not Available,Not Available
6,Kundgol,Channabasappa Satyappa Shivalli,INC,51,,,,,Not Available,Not Available,Dharwad,Not Available,Not Available
7,Lingsugur,Manappa D.Vajjal,JD(S),52,,,,,Not Available,Not Available,Raichur,Not Available,Not Available
8,Maski,Pratapgowda Patil,INC,58,,,,,Not Available,Not Available,Raichur,Not Available,Not Available
9,Melukote,K.S.Puttannaiah,SKP,63,,,,,Not Available,Not Available,Mandya,Not Available,Not Available


In [15]:
# Concatenating the temporary and main dataframes (placeholder rows go first).
# NOTE(review): this cell is not idempotent - re-running it prepends the
# placeholder rows again.
# NOTE(review): the setdiff checks above show that most of these constituencies
# already have rows in `df`, so a placeholder winner may duplicate a candidate
# that was scraped under a different spelling - confirm this is intended.
df = pd.concat([temp_df,df],ignore_index=True)
winners_df = pd.concat([temp_df,winners_df],ignore_index=True)

In [16]:
# Check row count, null counts and dtypes of the candidates table after adding the placeholder rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2804 entries, 0 to 2803
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   constituency        2804 non-null   object 
 1   name                2804 non-null   object 
 2   party               2804 non-null   object 
 3   age                 2784 non-null   object 
 4   profession          2788 non-null   object 
 5   cases               2788 non-null   object 
 6   assets              2738 non-null   Int64  
 7   liabilities         1556 non-null   Float64
 8   education_category  2804 non-null   object 
 9   education           2804 non-null   object 
 10  district            2804 non-null   object 
 11  self_profession     2804 non-null   object 
 12  spouse_profession   2804 non-null   object 
dtypes: Float64(1), Int64(1), object(11)
memory usage: 290.4+ KB


In [17]:
# Remove the raw `profession` column and write the candidates table to CSV
# (the frame's index is written as the first column).
df = df.drop('profession', axis='columns')
df.to_csv('candidates2013MyNeta.csv')

In [18]:
# Check row count, null counts and dtypes of the winners table after adding the placeholder rows.
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   constituency        224 non-null    object 
 1   name                224 non-null    object 
 2   party               224 non-null    object 
 3   age                 224 non-null    object 
 4   profession          208 non-null    object 
 5   cases               208 non-null    object 
 6   assets              208 non-null    Int64  
 7   liabilities         177 non-null    Float64
 8   education_category  224 non-null    object 
 9   education           224 non-null    object 
 10  district            224 non-null    object 
 11  self_profession     224 non-null    object 
 12  spouse_profession   224 non-null    object 
dtypes: Float64(1), Int64(1), object(11)
memory usage: 23.3+ KB


In [19]:
# Remove the raw `profession` column and write the winners table to CSV
# (the frame's index is written as the first column).
winners_df = winners_df.drop('profession', axis='columns')
winners_df.to_csv('winners2013MyNeta.csv')

In [104]:
# Reload both tables from the CSV files written earlier; the first CSV column
# is the saved index.
df = pd.read_csv('candidates2013MyNeta.csv', index_col=0)
winners_df = pd.read_csv('winners2013MyNeta.csv', index_col=0)

# Lookup tables stored as JSON under the project base directory:
# `education` is indexed by the education text, `profession` by the profession text.
with open(f'{DIR}/education.json', 'r', encoding='utf-8') as file:
    education = dict(json.load(file))

with open(f'{DIR}/profession.json', 'r', encoding='utf-8') as file:
    profession = dict(json.load(file))

In [105]:
# Replace missing free-text values with the literal 'Unknown' in both tables.
text_columns = ['education','self_profession','spouse_profession']
for frame in (df, winners_df):
    frame[text_columns] = frame[text_columns].fillna('Unknown')

In [106]:
# Derive `education_category` from the `education` text with the JSON lookup.
# One vectorised map per frame replaces a full-column boolean-mask scan per
# unique value. Indexing the dict inside the lambda still raises KeyError for
# a label that is missing from the lookup.
for frame in (df, winners_df):
    frame['education_category'] = frame['education'].map(lambda label: education[label])

In [107]:
# Create both profession-category columns in each table, pre-filled with 'Unknown'.
for frame in (df, winners_df):
    frame[['self_profession_category','spouse_profession_category']] = 'Unknown'

In [108]:
# Fill `<column>_category` for the self and spouse professions in both tables.
# Each lookup value is a sequence of category names, joined with ', '.
# One loop over (frame, column) replaces four copy-pasted loops, and a
# vectorised map replaces the per-unique-value mask scans. Indexing the dict
# inside the lambda still raises KeyError for a label missing from the lookup.
for frame in (df, winners_df):
    for source in ('self_profession', 'spouse_profession'):
        frame[f'{source}_category'] = frame[source].map(
            lambda label: ', '.join(profession[label])
        )

In [112]:
# Write the enriched tables back to the same CSV files they were reloaded
# from, overwriting the earlier versions.
df.to_csv('candidates2013MyNeta.csv')
winners_df.to_csv('winners2013MyNeta.csv')

### A table for resolving name conflicts of constituencies from all datasets.


| myneta13 | myneta18  | myneta23  | opencity13  | eci18  | ndtv23  | conflict  | final  |
|----|---|---|---|---|---|---|---|
|  ARAKALGUD  |   |   | ARKALGUD  | Arkalgud  | arakalgud  | arak vs. ark  | Arkalgud  |
|  B.T.M LAYOUT  |   |   |  B.T.M LAYOUT |  B.T.M.LAYOUT | b-t-m-layout  |  . and space after M | B.T.M.Layout  |
|  BAILAHONGAL  |   |   | BAILHONGAL  | Bailhongal  | bailahongal  |  bail vs. baila | Bailhongal  |
|  BAINDUR  | BYNDOOR  | BAINDUR  |  BYNDOOR | Byndoor  | baindur  |   | Byndoor  |
|  BANTWAL  |   |   |  BANTVAL | Bantval  | bantval  |  w vs. v |  Bantval |
|  BHADRAVATHI  |   |   | BHADRAVATI  | Bhadravati  | bhadravathi  | ti vs. thi  | Bhadravati  |
|  C.V. RAMANNNAGAR  |   |   |  C.V.RAMAN NAGAR |  C.V. Raman Nagar | c-v-ramannnagar  |  nnn and space before nagar | C.V. Raman Nagar  |
|  CHICKAMAGALUR  |   |   | CHIKMAGALUR  | Chikmagalur  | chickamagalur  |  chick vs chik and ama vs ma |  Chikmagalur |
|  CHAMRAJAPET  |   |   | CHAMRAJPET  | Chamrajpet  | chamrajapet  | raj vs. raja  | Chamrajpet  |
|  CHIKKNAYAKANHALLI  |   |   | CHIKNAYAKANHALLI  | Chiknayakanhalli  | chikknayakanhalli  |  chikk vs chik | Chiknayakanhalli  |
|    |   |   |   |   | chikkodi-sadalga  |  same spelling in every dataset; edge case because every variant contains a hyphen |   |
|  DEVARA HIPPARGI  |   |   |  DEVAR HIPPARGI | Devar Hippargi  | devara-hippargi  |  devar vs devara |  Devar Hippargi |
|  GANDHINAGAR  |   |   | GANDHI NAGAR  |  Gandhi Nagar | gandhinagar  | space between words  |  Gandhi Nagar |
|  GANGAVATHI  |   |   | GANGAWATI  |  Gangawati | gangavathi  | w vs v and ti vs thi  | Gangawati  |
|  GOVINDARAJANAGAR  |   |   | GOVINDRAJ NAGAR  | Govindraj Nagar  | govindrajnagar  | govinda vs govind and raj vs raja and space between  | Govindraj Nagar  |
|  HADAGALI  |   |   | HADAGALLI  | Hadagalli  | hadagalli  | gali vs galli  | Hadagalli  |
|  HUMNABAD  |   |   | HOMNABAD  | Humnabad  | homnabad  | hom vs hum  | Humnabad  |
|  HUBLI-DHARWAD-CENTRAL  |   |   |  HUBLI-DHARWAD-CENTRAL | Hubli-Dharwad Central  | hubli-dharwad-central  | space and -  |  Hubli-Dharwad Central |
|  HUBLI-DHARWAD-EAST  |   |   |  HUBLI-DHARWAD-EAST | Hubli-dharwad-East  | hubli-dharwad-east  | space and -  | Hubli-Dharwad East  |
|  HUBLI-DHARWAD-WEST  |   |   | HUBLI-DHARWAD- WEST  | Hubli-dharwad- West  | hubli-dharwad-west  |  space and - | Hubli-Dharwad West  |
|  HUNSUR  |   |   | HUNSUR  |  Hunasuru | hunsur  |   | Hunsur  |
|  K.R. PURA  |   |   |  K.R.PURA |  K.R.Pura | k-r-pura  |  space after period | K.R.Pura  |
|  KALAGHATGI  |   |   | KALGHATGI  |  Kalghatgi | kalaghatgi  |  kal vs kala | Kalghatgi  |
|  KARKALA  | KARKAL  |  KARKALA | KARKAL  | Karkal  |  karkala | al vs ala  | Karkal  |
|  KAUP  |   |   | KAPU  | Kapu  |  kaup |   | Kapu  |
|  KRISHNARAJPET  |   |   |  KRISHNARAJPET |  Krishnarajapete | krishnarajpet  | raj vs raja and pet vs pete  |  Krishnarajpet |
|  KUNDAPUR  |   |   | KUNDAPURA  |  Kundapura | kundapur  | pur vs pura  | Kundapura  |
|  PADMANABANAGAR  |   |   |  PADMANABA NAGAR | Padmanaba Nagar  |  padmanabanagar |  space between | Padmanaba Nagar  |
| PIRIYAPATNA   |   |   | PERIYAPATNA  | Periyapatna  |  piriyapatna | peri vs piri  | Periyapatna  |
|  RAJAJINAGAR  |   |   | RAJAJI NAGAR  | Rajaji Nagar  |  rajajinagar | space  | Rajaji Nagar  |
|  RANEBENNUR  |   |   |  RANIBENNUR | Ranibennur  | ranebennur  | rani vs rane  |  Ranibennur |
|  SAKALESHPUR  |   |   | SAKLESHPUR  | Sakleshpur  | sakaleshpur  | sakal vs sakl  | Sakaleshpur  |
|  SRINIVASAPUR  |   |   | SRINISVASPUR  | Srinivaspur  | srinivasapur  | vas vs vasa  | Srinivaspur  |
|  SHANTINAGAR  |   |   |  SHANTI NAGAR | Shanti Nagar  |  shantinagar |  space |  Shanti Nagar |
|  Shrirangapattana  |   |   |   |   |   | missing from myneta13  |   |
|  SIRAGUPPA  |   |   | SIRUGUPPA  | Siruguppa  | siruguppa  |  sira vs siru |  Siruguppa |
|  T. NARASIPUR  |   |   | T.NARASIPUR  |  T.Narasipur | t-narasipur  |  space after period | T.Narasipur  |
|  VIJAYANAGAR  |   |   |  VIJAY NAGAR |  Vijay Nagar |  vijayanagar | jay vs jaya and space between  |  Vijay Nagar |
|  YEMKANAMARDI  |   |   |  YEMKANMERDI | Yemkanmardi  | yemkanamerdi  |  kan vs kana | Yemkanmardi  |
|  YESHWANTHAPURA  |   |   | YESHVANTHAPURA  | Yeshvanthapura  | yeshwanthapura  |  v vs w |  Yeshvanthapura |
