In [3]:
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import requests
import re
import time
import asyncio
from math import ceil
import numpy as np
import os
import sys
import json

# Project base directory, read from the KEA_BASE_DIR environment variable.
# os.getenv returns None when the variable is unset; DIR is also used later
# to build the paths of education.json and profession.json.
DIR = os.getenv('KEA_BASE_DIR')
# Add DIR to the import search path (presumably where `cleaning` lives — confirm).
sys.path.append(DIR)

from cleaning import myneta18_const_corrector, myneta18_df_cleaner, rmv_dspace

In [2]:
# Retrieving the HTML code from the main 2018 candidates webpage
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51',
          'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"}

# The timeout (connect, read) keeps the cell from hanging forever on a stalled
# connection; raise_for_status stops an HTTP error page from being parsed as if
# it were the candidate listing.
response = requests.get('https://myneta.info/Karnataka2018/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary',
                        headers=headers, timeout=(5.0, 60.0))
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

# Column-wise accumulator for the scraped candidate fields: one independent,
# initially empty list per output column.
temp_dic = {
    column: []
    for column in (
        'constituency',
        'name',
        'party',
        'age',
        'profession',
        'cases',
        'assets',
        'liabilities',
        'education_category',
        'education',
    )
}


# Filter candidate IDs from main webpage's HTML code
# Raw string: '\.' and '\?' are regex escapes, not valid Python string escapes.
# Compiled once instead of on every call.
_CANDIDATE_HREF = re.compile(r'^candidate\.php\?candidate_id=')

def has_candidate(href):
    """Return True when ``href`` starts with ``candidate.php?candidate_id=``.

    ``href`` may be None or empty (anchor tags without an href); those yield False.
    """
    return bool(href) and _CANDIDATE_HREF.search(href) is not None

# Finding all 'a' tags with candidate IDs: has_candidate is passed as the href
# filter, so only anchors whose href it accepts are kept.
candids = soup.find_all('a',href=has_candidate)

# Get HTML of the page synchronously
def http_get_sync(url: str):
    """Fetch ``url`` with the shared ``headers`` and return the raw response body.

    Uses a 5 s connect / 15 s read timeout.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, so that an error page is
            reported as such instead of being handed to the HTML parser.
    """
    response = requests.get(url,headers=headers,timeout=(5.0,15.0))
    response.raise_for_status()
    return response.content

# Get HTML of the page Asynchronously
async def http_get(url: str):
    """Run the blocking ``http_get_sync(url)`` in a worker thread and return its result."""
    body = await asyncio.to_thread(http_get_sync, url)
    return body


# Fetching the HTML and parsing it to retrieve necessary info
def _text_or(extract, default):
    """Return ``extract()`` stripped of spaces/newlines, or ``default`` if the markup is absent."""
    try:
        return extract().strip(' \n')
    except (AttributeError, IndexError):
        # AttributeError: a lookup returned None; IndexError: fewer <div>s than expected.
        return default


async def winner_details(index):
    """Scrape the page of ``candids[index]`` and append one row to ``temp_dic``.

    Name, constituency, party, age and profession are mandatory: if their markup
    is missing, the error propagates. Cases default to '0'; assets, liabilities
    and both education fields default to NaN.

    The row is built completely before anything is appended, so a failure never
    leaves the columns of ``temp_dic`` with unequal lengths.
    """
    url = 'https://myneta.info/karnataka2018/' + candids[index].attrs['href']
    html = await http_get(url)
    soup = BeautifulSoup(html, 'lxml')

    header = soup.find('div', class_='grid_9 alpha omega').div
    personal = soup.find('div', {'class':'grid_3 alpha','style':'background:khaki;'})

    def education_field(position):
        # Both education fields sit in consecutive <div>s of the same panel.
        panel = soup.find('div',class_='grid_3 alpha omega left-border-div left-blue-border')
        return panel.find_all('div')[position].string

    record = {
        'name': header.h2.text.strip(' \n'),
        'constituency': header.h5.text.strip(' \n'),
        'party': header.div.text.strip(' \n'),
        'age': personal.find_all('div',class_='grid_2 alpha')[2].text.strip(' \n'),
        'profession': personal.p.text.strip(' \n'),
        'cases': _text_or(
            lambda: soup.find('div',{'class':'grid_3 alpha left-border-div left-green-border','style':'background-color:red;'}).div.span.text,
            '0'),
        'assets': _text_or(
            lambda: soup.find('div', class_='bottom-border-div red fullWidth').b.text,
            np.nan),
        'liabilities': _text_or(
            lambda: soup.find('div', class_='bottom-border-div blue fullWidth').b.text,
            np.nan),
        'education_category': _text_or(lambda: education_field(0), np.nan),
        'education': _text_or(lambda: education_field(1), np.nan),
    }

    # No await happens between these appends, so the row lands in every column together.
    for column, value in record.items():
        temp_dic[column].append(value)
        
        
async def main(start,end):
    """Concurrently scrape the candidates at positions ``start`` (inclusive) to ``end`` (exclusive)."""
    pending = (winner_details(position) for position in range(start, end))
    await asyncio.gather(*pending)
    
for i in range(ceil(len(candids)/20)):
    start = i*20
    end = start+20
    if end>len(candids):
        end=len(candids)
    await main(start,end)
    time.sleep(3.5)

In [3]:
# Assemble the scraped columns into a DataFrame and inspect its schema
df = pd.DataFrame(temp_dic)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   constituency        2576 non-null   object
 1   name                2576 non-null   object
 2   party               2576 non-null   object
 3   age                 2576 non-null   object
 4   profession          2576 non-null   object
 5   cases               2576 non-null   object
 6   assets              2576 non-null   object
 7   liabilities         2576 non-null   object
 8   education_category  2576 non-null   object
 9   education           2576 non-null   object
dtypes: object(10)
memory usage: 201.4+ KB


In [4]:
# Cleaning the DF, extracting features and resolving name conflicts
# NOTE: df is overwritten with the cleaner's output, so re-running this cell
# feeds already-cleaned data back into myneta18_df_cleaner — rebuild df first.
df = myneta18_df_cleaner(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   constituency        2576 non-null   object  
 1   name                2576 non-null   object  
 2   party               2576 non-null   category
 3   age                 2569 non-null   Int32   
 4   profession          2576 non-null   object  
 5   cases               2576 non-null   int32   
 6   assets              2559 non-null   Int64   
 7   liabilities         1478 non-null   Float64 
 8   education_category  2576 non-null   category
 9   education           2576 non-null   object  
 10  district            2576 non-null   object  
 11  self_profession     2576 non-null   object  
 12  spouse_profession   2576 non-null   object  
dtypes: Float64(1), Int32(1), Int64(1), category(2), int32(1), object(7)
memory usage: 217.3+ KB


In [5]:
# Repeat the same procedure above, but for the winners

html = requests.get('https://myneta.info/Karnataka2018/index.php?action=summary&subAction=winner_analyzed&sort=candidate#summary',headers=headers).text
soup = BeautifulSoup(html, 'lxml')

temp_dic = {
    'constituency':[],
    'name':[],
    'party':[],
    'age':[],
    'profession':[],
    'cases':[],
    'assets':[],
    'liabilities':[],
    'education_category':[],
    'education':[]
}

candids = soup.find_all('a',href=has_candidate)

for i in range(ceil(len(candids)/20)):
    start = i*20
    end = start+20
    if end>len(candids):
        end=len(candids)
    await main(start,end)
    time.sleep(3)

In [6]:
# Assemble the scraped winner columns into a DataFrame and inspect its schema
winners_df = pd.DataFrame(temp_dic)
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   constituency        223 non-null    object
 1   name                223 non-null    object
 2   party               223 non-null    object
 3   age                 223 non-null    object
 4   profession          223 non-null    object
 5   cases               223 non-null    object
 6   assets              223 non-null    object
 7   liabilities         223 non-null    object
 8   education_category  223 non-null    object
 9   education           223 non-null    object
dtypes: object(10)
memory usage: 17.6+ KB


In [7]:
# Apply the same cleaning routine to the winners frame.
# NOTE: winners_df is overwritten, so re-running this cell passes the
# already-cleaned frame to myneta18_df_cleaner again.
winners_df = myneta18_df_cleaner(winners_df)
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   constituency        223 non-null    object  
 1   name                223 non-null    object  
 2   party               223 non-null    category
 3   age                 223 non-null    Int32   
 4   profession          223 non-null    object  
 5   cases               223 non-null    int32   
 6   assets              223 non-null    Int64   
 7   liabilities         192 non-null    Float64 
 8   education_category  223 non-null    category
 9   education           223 non-null    object  
 10  district            223 non-null    object  
 11  self_profession     223 non-null    object  
 12  spouse_profession   223 non-null    object  
dtypes: Float64(1), Int32(1), Int64(1), category(2), int32(1), object(7)
memory usage: 19.2+ KB


In [8]:
# Number of distinct constituencies among all candidates
print(len(df.constituency.unique()))

# Checking discrepancies for constituencies between both datasets:
# values present in df.constituency but absent from winners_df.constituency.
np.setdiff1d(df.constituency.unique(),winners_df.constituency.unique())

224


array(['Athani'], dtype=object)

In [9]:
# Creating a one-row temporary dataframe for 'Athani', the constituency that is
# missing from the winners dataset. The values are entered by hand
# (presumably taken from the votes dataset — confirm the source).
# NOTE: temp_dic is rebound here; it no longer holds the scraped columns.

temp_dic = {
    'constituency':['Athani'],
    'name':['Mahesh Iranagouda Kumathalli'],
    'party':['INC'],
    'age':[56],
    'profession':[pd.NA],
    'cases':[pd.NA],
    'assets':[pd.NA],
    'liabilities':[pd.NA],
    'education_category':['Not Available'],
    'education':['Not Available'],
    'district':['Belgaum'],
    'self_profession':['Not Available'],
    'spouse_profession':['Not Available'],
}
temp_df = pd.DataFrame(temp_dic)

# Concatenating the temporary and main dataframes.
# The same row is appended to BOTH frames; ignore_index renumbers the rows.
# Re-running this cell appends the row again.
winners_df = pd.concat([winners_df,temp_df],ignore_index=True)
df = pd.concat([df,temp_df],ignore_index=True)

In [10]:
# Correcting party names
def _abbreviate_party(party_name):
    """Map the spelled-out KPJP party name to its abbreviation; leave other values as they are."""
    if party_name == 'Karnataka Pragnyavantha Janatha Party':
        return 'KPJP'
    return party_name

df.party = df.party.apply(_abbreviate_party)
winners_df.party = winners_df.party.apply(_abbreviate_party)

In [11]:
# Inspect the candidates frame's schema before exporting it
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2577 entries, 0 to 2576
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   constituency        2577 non-null   object
 1   name                2577 non-null   object
 2   party               2577 non-null   object
 3   age                 2570 non-null   Int64 
 4   profession          2576 non-null   object
 5   cases               2576 non-null   object
 6   assets              2559 non-null   object
 7   liabilities         1478 non-null   object
 8   education_category  2577 non-null   object
 9   education           2577 non-null   object
 10  district            2577 non-null   object
 11  self_profession     2577 non-null   object
 12  spouse_profession   2577 non-null   object
dtypes: Int64(1), object(12)
memory usage: 264.4+ KB


In [12]:
# Removing unnecessary attributes and converting dataframe to a csv file.
# Rebinding instead of inplace=True, with errors='ignore', keeps the cell
# re-runnable: a second run no longer raises KeyError for the dropped column.
df = df.drop(columns='profession', errors='ignore')
df.to_csv('candidates2018MyNeta.csv')

In [13]:
# Inspect the winners frame's schema before exporting it
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   constituency        224 non-null    object
 1   name                224 non-null    object
 2   party               224 non-null    object
 3   age                 224 non-null    Int64 
 4   profession          223 non-null    object
 5   cases               223 non-null    object
 6   assets              223 non-null    object
 7   liabilities         192 non-null    object
 8   education_category  224 non-null    object
 9   education           224 non-null    object
 10  district            224 non-null    object
 11  self_profession     224 non-null    object
 12  spouse_profession   224 non-null    object
dtypes: Int64(1), object(12)
memory usage: 23.1+ KB


In [14]:
# Removing unnecessary attributes and converting dataframe to a csv file.
# Rebinding instead of inplace=True, with errors='ignore', keeps the cell
# re-runnable: a second run no longer raises KeyError for the dropped column.
winners_df = winners_df.drop(columns='profession', errors='ignore')
winners_df.to_csv('winners2018MyNeta.csv')

In [15]:
# Load the ECI constituency file and the MyNeta winners file; the first CSV
# column is used as the index in both.
const_eci18 = pd.read_csv('constituency2018ECI.csv',index_col=0)
const_myneta18 = pd.read_csv('winners2018MyNeta.csv',index_col=0)

In [16]:
# Lookup table built from the ECI dataset: constituency -> candidate (winner) name.
# If a constituency occurs more than once, the last row wins.
temp_dic = dict(zip(const_eci18['constituency'], const_eci18['name']))

In [17]:
# Replacing the winners' names in the MyNeta winners dataset with the names
# recorded for the same constituency in the ECI dataset (via temp_dic).
# Check first, so that a missing constituency fails before any row is changed
# instead of leaving the frame partly renamed.
unmatched = [const for const in const_myneta18['constituency'].unique() if const not in temp_dic]
if unmatched:
    raise KeyError(f'Constituencies missing from the ECI dataset: {unmatched}')

# Vectorised lookup instead of a row-by-row .loc loop.
const_myneta18['name'] = const_myneta18['constituency'].map(temp_dic)

In [18]:
# Converting the dataframe to a csv file. This writes to the same
# 'winners2018MyNeta.csv' path that const_myneta18 was read from, replacing it.
const_myneta18.to_csv('winners2018MyNeta.csv')

In [4]:
# Reload both exported tables (first CSV column is the index)
df = pd.read_csv('candidates2018MyNeta.csv',index_col=0)
winners_df = pd.read_csv('winners2018MyNeta.csv',index_col=0)

# Load the two JSON lookup tables stored under DIR
with open(f'{DIR}/education.json','r',encoding='utf-8') as education_file:
    education = dict(json.load(education_file))
with open(f'{DIR}/profession.json','r',encoding='utf-8') as profession_file:
    profession = dict(json.load(profession_file))

In [5]:
# Missing education / profession text becomes the literal 'Unknown' in both frames
for frame in (df, winners_df):
    text_columns = ['education','self_profession','spouse_profession']
    frame[text_columns] = frame[text_columns].fillna('Unknown')

In [6]:
# Overwrite education_category with the value the education lookup gives for
# each distinct education string, first in df and then in winners_df.
for frame in (df, winners_df):
    for edu in frame.education.unique():
        matches = frame.education == edu
        frame.loc[matches, 'education_category'] = education[edu]

In [7]:
# Create both profession-category columns with the placeholder 'Unknown'
for frame in (df, winners_df):
    frame[['self_profession_category','spouse_profession_category']] = 'Unknown'

In [8]:
# For every distinct profession string, look up its categories in `profession`
# and store them, joined with ', ', in the matching *_category column.
# Order: df (self, spouse), then winners_df (self, spouse).
column_pairs = (
    ('self_profession', 'self_profession_category'),
    ('spouse_profession', 'spouse_profession_category'),
)
for frame in (df, winners_df):
    for source_column, category_column in column_pairs:
        for prof in frame[source_column].unique():
            matches = frame[source_column] == prof
            frame.loc[matches, category_column] = ', '.join(profession[prof])

In [10]:
# Write both frames back to the CSV paths they were loaded from, replacing those files
df.to_csv('candidates2018MyNeta.csv')
winners_df.to_csv('winners2018MyNeta.csv')