In [1]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import numpy as np
import requests
import re
import os
import sys

DIR = os.getenv('KEA_BASE_DIR')
sys.path.append(DIR)

import asyncio
from cleaning import ndtv_df_cleaner, const_url_extractor, ndtv23_const_corrector, opencity13_const_corrector

In [4]:
# Project base directory, taken from the KEA_BASE_DIR environment variable (None when it is unset)
DIR = os.environ.get('KEA_BASE_DIR')

In [2]:
# Work on a private copy of Selenium's stock Chrome capabilities: DesiredCapabilities.CHROME is a
# class-level dict shared by the whole process, so editing it directly would leak this setting
# into every other place that reads it.
caps = DesiredCapabilities.CHROME.copy()
caps["pageLoadStrategy"] = "eager"  # For faster page loading in selenium

options = webdriver.ChromeOptions()
# Chrome preference that restricts image loading so that pages load faster
prefs = {"profile.managed_default_content_settings.images": 2}

options.add_experimental_option('detach', True)  # Detaches the webdriver instance
options.add_experimental_option("prefs", prefs)
options.add_argument('--headless')  # Run browser instance without showing it graphically

# chromedriver binary, located relative to the project base directory
s = Service(f"{DIR}/chromedriver/chromedriver.exe")

In [3]:
# Retrieving the HTML code of the NDTV winning-candidates page

driver = webdriver.Chrome(options=options, service=s, desired_capabilities=caps)
try:
    driver.set_page_load_timeout(15)
    driver.get("https://www.ndtv.com/elections/karnataka/winning-candidates")

    try:
        # A notification menu pops up and hides the HTML of the page underneath until it is dismissed
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="__cricketsubscribe"]/div[2]/div[2]/a[1]'))
        ).click()
    except TimeoutException:
        # The pop-up did not become clickable within 20 s: stop the page and use it as it is
        driver.execute_script("window.stop();")

    time.sleep(3)  # Fixed pause before the snapshot (presumably to let the page finish rendering)
    html = driver.page_source  # Fetching the HTML of the page
finally:
    # quit() ends the whole session (browser and chromedriver process), also when a step above
    # raises; close() would only close the current window.
    driver.quit()

In [4]:
# Parse the downloaded page with lxml and collect every winning-candidate card (<div class="kmp_crd-skn">)
lead_c = BeautifulSoup(html, 'lxml')
divs = lead_c.find_all('div', attrs={'class': 'kmp_crd-skn'})

In [5]:
# Parsing the candidate cards to extract the necessary info into a dictionary of column lists

temp_dic = {
    'name':[],
    'constituency':[],
    'party':[],
    'votes':[],
    'age':[],
    'gender':[],
    'sitting_mla':[]
}

for div in divs:
    temp_dic['name'].append(div.find('span', class_='crd_wrp-ttl').find('a').string)

    # A card without a party badge gets a missing value. Checking for None explicitly (instead of a
    # bare `except:`) keeps every unrelated error visible.
    party_tag = div.find('span', class_='kmp_pty-wrp')
    temp_dic['party'].append(party_tag.text if party_tag is not None else np.nan)

    # The summary list is read by position: 0 = seat, 1 = votes, 2 = age, 3 = gender, 9 = sitting MLA
    tags = div.find('div', class_='card_wrp-sum').find_all('li')
    temp_dic['constituency'].append(tags[0].text)
    temp_dic['votes'].append(tags[1].text)
    temp_dic['age'].append(tags[2].text)
    temp_dic['gender'].append(tags[3].text)
    temp_dic['sitting_mla'].append(tags[9].text)

In [6]:
# Turn the collected column lists into the winners table and print its structure
df1 = pd.DataFrame(data=temp_dic)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          224 non-null    object
 1   constituency  224 non-null    object
 2   party         224 non-null    object
 3   votes         224 non-null    object
 4   age           224 non-null    object
 5   gender        224 non-null    object
 6   sitting_mla   224 non-null    object
dtypes: object(7)
memory usage: 12.4+ KB


In [7]:
# Show five random rows of the raw scraped table (values still carry their 'Seat :', 'Age :' ... labels)
df1.sample(5)

Unnamed: 0,name,constituency,party,votes,age,gender,sitting_mla
142,Nara Bharath Reddy,Seat :Bellary City,Cong,Votes (% Share) :86440 (48.74%),Age :33 Years,Gender :M,Sitting MLA :No
15,Prabhu Chavan,Seat :Aurad,BJP,Votes (% Share) :81382 (51.46%),Age :53 Years,Gender :M,Sitting MLA :
212,Karemma,Seat :Devadurga,JDS,Votes (% Share) :99544 (57.65%),Age :50 Years,Gender :F,Sitting MLA :No
216,HP Swaroop,Seat :Hassan,JDS,Votes (% Share) :85176 (50.06%),Age :40 Years,Gender :M,Sitting MLA :No
28,BP Harish,Seat :Harihar,BJP,Votes (% Share) :63924 (38.07%),Age :62 Years,Gender :M,Sitting MLA :No


In [8]:
# Each constituency page URL contains the constituency name in a specific format, so convert the raw
# 'constituency' values to that format (and sort them) before the column is cleaned below.
list_of_constituencies = sorted(df1['constituency'].apply(const_url_extractor))

# Clean the dataframe; df1 is rebound to the cleaned result.
df1 = ndtv_df_cleaner(df1)

In [18]:
# Correct party names
def party_cleaner(text:str)->str:
    """Map NDTV's party labels to the canonical party names.

    Args:
        text: Party label as scraped from NDTV.

    Returns:
        'INC' for 'Cong', 'SKP' for 'SKP-Cong', 'JD(S)' for 'JDS'; any other string is returned
        unchanged. Non-string values (e.g. NaN / pd.NA for a missing party) are passed through
        untouched instead of raising a TypeError inside re.sub.
    """
    if not isinstance(text, str):
        return text
    # (pattern, replacement) pairs, applied in order; each pattern matches the whole label only
    renames = (
        ("^Cong$", "INC"),
        ("^SKP-Cong$", "SKP"),
        ("^JDS$", r"JD(S)"),
    )
    for pattern, replacement in renames:
        text = re.sub(pattern, replacement, text)
    return text

In [13]:
# Replace NDTV's party labels in the winners table with the canonical names
df1['party'] = df1['party'].apply(party_cleaner)

In [14]:
# df1.to_csv('constituency2023NDTV.csv') # Converting constituency dataframe to a csv file.
# NOTE(review): the export above is commented out, yet a later cell reads 'constituency2023NDTV.csv'
# back from disk, so on a fresh run that file must already exist — confirm this is intended.
df1.sample(5)

Unnamed: 0,name,constituency,party,votes,age,gender,is_re_elected
102,AC Srinivasa,Pulakeshinagar,INC,87316,56,M,0
147,KC Veerendra Puppy,Chitradurga,INC,122021,47,M,0
43,Munirathna,Rajarajeshwarinagar,BJP,127980,59,M,1
73,C Puttarangashetty,Chamarajanagar,INC,83858,68,M,0
3,Balachandra L Jarkiholi,Arabhavi,BJP,115402,56,M,1


In [9]:
# Using a webdriver instance to visit the site and dismiss the notification pop-up, which generates cookies.
# These cookies ensure that the subsequent visits to this site don't get the same notification pop-up

driver = webdriver.Chrome(options=options, service=s, desired_capabilities=caps)
try:
    driver.get('https://www.ndtv.com/elections/karnataka-vidhan-sabha-election-results-2023/afzalpur')
    WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="__cricketsubscribe"]/div[2]/div[2]/a[1]'))
    ).click()
    time.sleep(5)
    driver.get('https://www.ndtv.com/elections/karnataka-vidhan-sabha-election-results-2023/aland')
    time.sleep(5)
    cookies = driver.get_cookies()  # Storing the generated cookies: a list with one dict per cookie
finally:
    # Always end the browser session, also when the pop-up never appears and the wait above raises;
    # quit() shuts down the browser and the chromedriver process, close() only the current window.
    driver.quit()

In [10]:
# No browser is started here: the constituency pages are downloaded with plain HTTP requests, so a
# webdriver instance would only be left running unused.

# Create the output folder; makedirs also creates a missing parent folder and, with exist_ok=True,
# does nothing when the folder is already there.
os.makedirs(f"{DIR}/2023 Elections/Key candidates NDTV", exist_ok=True)

# Restrict parsing of each constituency page to its <div class="Key_candidates"> section
key_candidates_div = SoupStrainer('div', {'class':'Key_candidates'})

In [11]:
# Browser-like request headers sent with every constituency-page download
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51',
          'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"}

# Adding the stored cookies to a CookieJar object to pass in the GET request.
# Every Selenium cookie is a dict that describes ONE cookie ('name', 'value', 'domain', 'path', ...),
# so it has to be registered under its own name and value. Updating the jar with the dict itself
# would instead create bogus cookies called 'name', 'domain', 'path', etc.
jar = requests.cookies.RequestsCookieJar()
for cookie_dict in cookies:
    jar.set(
        str(cookie_dict['name']),
        str(cookie_dict['value']),
        domain=cookie_dict.get('domain', ''),
        path=cookie_dict.get('path', '/'),
    )

# Get HTML of webpage synchronously
def get_sync_html(url: str):
    """Download ``url`` with the shared headers and cookie jar and return the response body as text.

    The request is made with a (15.0, 30.0) timeout, i.e. requests' (connect, read) timeouts in seconds.
    """
    request_options = {
        'headers': headers,
        'timeout': (15.0, 30.0),
        'cookies': jar,
    }
    return requests.get(url, **request_options).text

# Get HTML of webpage asynchronously
async def get_html(url: str):
    """Run the blocking ``get_sync_html`` download in a worker thread and return the page text."""
    page_text = await asyncio.to_thread(get_sync_html, url)
    return page_text

In [12]:
# Fetch a constituency page and save its key-candidates section to disk
async def const_details(constituency):
    """Download the results page of ``constituency`` and store its key-candidates markup.

    ``constituency`` is the name as it appears in the page URL. Only the part of the page selected
    by ``key_candidates_div`` is parsed, and it is written to '<constituency>.html' in the
    'Key candidates NDTV' folder. Nothing is returned.
    """
    page_url = f'https://www.ndtv.com/elections/karnataka-vidhan-sabha-election-results-2023/{constituency}'
    raw_html = await get_html(page_url)
    key_section = BeautifulSoup(raw_html, 'lxml', parse_only=key_candidates_div)
    out_path = f"{DIR}/2023 Elections/Key candidates NDTV/{constituency}.html"
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.write(str(key_section))

In [13]:
# Fetch the HTML of multiple pages at once in multiple batches.
for i in range(15):
    start = i*15
    end = start+15
    await asyncio.gather(*[const_details(constituency) for constituency in list_of_constituencies[start:end]])
    time.sleep(3.5)

In [14]:
# Save the HTML on local machine.
# for i in range(len(key_candidates)):
#     with open(f"{DIR}/2023 Elections/Key candidates NDTV/{list_of_constituencies[i]}.html",'w',encoding='utf-8') as f:
#         f.write(str(key_candidates[i]))

In [15]:
# One independent list per output column; candid_details appends one value to each list per candidate
temp_dic = {column: [] for column in ('constituency', 'name', 'party', 'votes', 'gender')}

# Browser-like request headers sent with every candidate-page download
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'),
    ('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"),
])

def http_get_sync(url: str):
    """Blocking GET of ``url`` with the shared headers; returns the raw response body (bytes).

    Uses a (15.0, 30.0) timeout, i.e. requests' (connect, read) timeouts in seconds.
    """
    reply = requests.get(url, timeout=(15.0, 30.0), headers=headers)
    body = reply.content
    return body
    
async def http_get(url: str):
    """Run the blocking ``http_get_sync`` request in a worker thread and return the response body."""
    body = await asyncio.to_thread(http_get_sync, url)
    return body
    
# Parse the HTML to extract necessary info
async def candid_details(candidate,filename):
    """Extract one candidate's details and add them to ``temp_dic`` as one complete row.

    Args:
        candidate: Parsed HTML element describing one candidate (as found by candids_from_const).
        filename: Name of the constituency file the candidate came from; the '.html' part is
            removed to obtain the constituency name.

    The name, party and votes are read from ``candidate``; the gender is read from the candidate's
    own page, which is downloaded from the link around the candidate's name.
    """
    constituency = re.sub(r'\.html','',filename)
    name_link = candidate.find('span', class_='candidate_nm').a
    name = name_link.string.strip()
    party = candidate.find('span', class_='party-icon bi').parent.text.strip()
    votes = candidate.find_all('li')[-2].text.strip()

    html = await http_get(name_link.attrs['href'])
    soup = BeautifulSoup(html,'html.parser')
    gender = soup.find('td',class_='lok_lst-tdl',string='Gender').next_sibling.string.strip()

    # All five columns are appended together, after the download and with no await in between.
    # Many of these coroutines run concurrently, so appending some columns before the await and
    # 'gender' after it would store the genders in download-completion order, i.e. against the
    # wrong candidates, and a failed download would leave the columns with different lengths.
    temp_dic['constituency'].append(constituency)
    temp_dic['name'].append(name)
    temp_dic['party'].append(party)
    temp_dic['votes'].append(votes)
    temp_dic['gender'].append(gender)

# Read the saved HTML file of one constituency and hand each candidate in it to 'candid_details'
async def candids_from_const(constituency):
    """Process every candidate stored in one constituency file.

    ``constituency`` is the file name inside the 'Key candidates NDTV' folder. Each
    <ul id="key_cand_list"> element found in the file is passed, together with that file name, to
    ``candid_details``; all of those calls run concurrently.
    """
    file_path = f"{DIR}/2023 Elections/Key candidates NDTV/{constituency}"
    with open(file_path, 'r', encoding='utf-8') as html_file:
        markup = html_file.read()
    const_html = BeautifulSoup(markup, 'html.parser')
    candidate_entries = const_html.find_all('ul', attrs={'id': 'key_cand_list'})
    pending = [candid_details(entry, constituency) for entry in candidate_entries]
    await asyncio.gather(*pending)

# List of all HTML files for all constituencies.
# os.listdir returns the entries in arbitrary order and includes anything else stored in the folder,
# so keep only the '.html' files and sort them to make the processing order reproducible.
const_list = sorted(
    entry
    for entry in os.listdir(f"{DIR}/2023 Elections/Key candidates NDTV/")
    if entry.endswith('.html')
)

async def main():
    for i in range(15):
        start = i*15
        end = start+15
        await asyncio.gather(*[candids_from_const(constituency) for constituency in const_list[start:end]])
        time.sleep(2.5)

await main()

In [19]:
# Turn the collected candidate columns into a table and print its structure
df2 = pd.DataFrame(data=temp_dic)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2621 entries, 0 to 2620
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   constituency  2621 non-null   object
 1   name          2621 non-null   object
 2   party         2621 non-null   object
 3   votes         2621 non-null   object
 4   gender        2621 non-null   object
dtypes: object(5)
memory usage: 102.5+ KB


In [20]:
# Cleaning the Dataframe

df2.constituency = df2.constituency.apply(ndtv23_const_corrector) # Correcting the constituency names
df2.party = df2.party.str.strip().astype('category')
df2.votes = df2.votes.str.replace(',','',regex=False) # Removing the commas from votes (literal match, not a regex)
# Assign a null value to all votes that have anything other than digits.
# The pattern is a raw string so that \D reaches the regex engine instead of being an invalid string
# escape; na=False makes already-missing votes count as "no match" so the mask stays purely boolean.
df2.loc[df2.votes.str.contains(r'\D',na=False),'votes'] = pd.NA
df2.votes = df2.votes.astype('Int64') # Converting the votes from str to nullable int
df2.gender = df2.gender.apply(lambda x: 'M' if x=='Male' else 'F' if x=='Female' else pd.NA) # Transforming the gender values
df2.party = df2.party.apply(party_cleaner) # Correcting the party names

In [21]:
# Print the structure again to check the dtypes and non-null counts after cleaning
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2621 entries, 0 to 2620
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   constituency  2621 non-null   object
 1   name          2621 non-null   object
 2   party         2621 non-null   object
 3   votes         2603 non-null   Int64 
 4   gender        2583 non-null   object
dtypes: Int64(1), object(4)
memory usage: 105.1+ KB


In [22]:
# Show five random rows of the cleaned candidates table
df2.sample(5)

Unnamed: 0,constituency,name,party,votes,gender
852,Gulbarga Dakshin,Krishna Reddy,JD(S),1409,M
1437,Koppal,Adavi Hanumappa Godachalli,KRS,651,M
1845,Nanjangud,HK Swamy Haradanahalli,IND,746,M
1491,Kudachi,P Rajeev,BJP,60078,M
2234,Shirahatti,Hanumantappa Peerappa Koravar,IND,445,M


In [23]:
# The file is written relative to the current working directory, index included
df2.to_csv('candidates2023NDTV.csv') # Converting the candidates dataframe to a csv file.

In [40]:
# Load the NDTV winners table and the MyNeta winners table; in both files the first column is the index
const_ndtv23, const_myneta23 = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("constituency2023NDTV.csv", "winners2023MyNeta.csv")
)

In [42]:
# Replace the winning candidates' names in this (NDTV) dataset with the names used in the MyNeta dataset

# constituency -> winner name, as spelled in the MyNeta dataset
name_by_constituency = dict(zip(const_myneta23['constituency'], const_myneta23['name']))

# Fail with the full list of unmatched constituencies instead of a bare KeyError on the first one
unmatched = sorted(set(const_ndtv23['constituency']) - set(name_by_constituency), key=str)
if unmatched:
    raise KeyError(f"Constituencies missing from the MyNeta dataset: {unmatched}")

# Vectorised lookup: one pass over the column instead of a row-by-row .loc assignment
const_ndtv23['name'] = const_ndtv23['constituency'].map(name_by_constituency)

In [44]:
# Write the renamed table back, overwriting the CSV file it was read from
const_ndtv23.to_csv('constituency2023NDTV.csv')

In [45]:
# The eligible_voters_NEW dataset identifies constituencies by number instead of by name.
# To merge it with the constituency dataframe, the constituency numbers are first converted to names.

# Lookup table that contains both the constituency numbers and the constituency names
df = pd.read_csv(f"{DIR}/2013 Elections/2013DetailedResults.csv", index_col='_id')
df = df.assign(**{'Constituency Name': df['Constituency Name'].apply(opencity13_const_corrector)})  # Corrected names

# The main constituency dataset
const_ndtv23 = pd.read_csv('constituency2023NDTV.csv', index_col=0)

# Eligible voters: total the voter counts per constituency number ...
voter_columns = ['male_voters', 'female_voters', 'other_voters', 'total_voters']
eligible_voters = pd.read_csv('eligible_voters_NEW.csv', index_col=0)
eligible_voters = eligible_voters.groupby('const_num')[voter_columns].sum()
# ... attach the constituency names through the constituency number ...
merged = eligible_voters.merge(df, left_on='const_num', right_on='Constituency No')
# ... and keep one row per distinct (name, counts) combination
eligible_voters = merged[['Constituency Name'] + voter_columns].drop_duplicates()

In [46]:
# Rename the name column to 'constituency' (the merge key used next) and prefix the voter counts with 'eligible_'
column_names = {
    'Constituency Name': 'constituency',
    'male_voters': 'eligible_male_voters',
    'female_voters': 'eligible_female_voters',
    'other_voters': 'eligible_other_voters',
    'total_voters': 'eligible_total_voters',
}
eligible_voters = eligible_voters.rename(columns=column_names)

In [47]:
# Merge the constituency dataframe with the eligible voters dataframe on the constituency name
const_ndtv23 = pd.merge(const_ndtv23, eligible_voters, on='constituency')
const_ndtv23.head(3)

Unnamed: 0,name,constituency,party,votes,age,gender,is_re_elected,eligible_male_voters,eligible_female_voters,eligible_other_voters,eligible_total_voters
0,Jolle Shashikala Annasaheb,Nippani,BJP,73348,53.0,F,1,113856,111823,9,225688
1,Aihole Duryodhan Mahalingappa,Raybag,BJP,57500,65.0,M,1,108127,102369,9,210505
2,Katti Nikhil Umesh,Hukkeri,BJP,103574,40.0,M,0,103117,103449,8,206574


In [48]:
# Print the structure of the merged table to check the row count and the non-null counts
const_ndtv23.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224 entries, 0 to 223
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    224 non-null    object 
 1   constituency            224 non-null    object 
 2   party                   224 non-null    object 
 3   votes                   224 non-null    int64  
 4   age                     222 non-null    float64
 5   gender                  222 non-null    object 
 6   is_re_elected           224 non-null    int64  
 7   eligible_male_voters    224 non-null    int64  
 8   eligible_female_voters  224 non-null    int64  
 9   eligible_other_voters   224 non-null    int64  
 10  eligible_total_voters   224 non-null    int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 21.0+ KB


In [49]:
# Write the merged table back to the CSV file it was read from.
# NOTE(review): the file now contains the eligible_* columns, so re-running the read-and-merge cells
# against it would presumably merge those columns a second time — confirm before re-running.
const_ndtv23.to_csv('constituency2023NDTV.csv')