prototype_scraping_unsuccessful_pages

# Script for scraping previously unsuccessful pages

When we scraped all the pages for a sector, some of them failed to scrape, apparently at random. We store the URLs of these pages in a CSV file (`unsuccessfully_scraped_pages.csv`). In this notebook, we write code to scrape just these pages.

## imports

In [None]:
#--------DISPLAY SETTINGS FOR THE NOTEBOOK--------
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # show the output of every expression in a cell, not only the last one
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # widen the notebook container to the full browser width (default is 60%)

#-------FOR WORKING WITH DATA IN A DATAFRAME--------

import pandas as pd #To store scraped data

#-------SCRAPING SPECIFIC MODULES--------
import requests #to conduct different forms of HTTP requests
import html5lib #to construct tree structure of HTML data
from bs4 import BeautifulSoup as soup #to parse the html data obtained from the scrape
import time # to add wait time, to keep the website from kicking us out and also to let the page load before grabbing data
from selenium import webdriver #to automate the navigating within the browser
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select #to select the features we want on the website via the scraper
from selenium.webdriver.support.ui import WebDriverWait #again, to add wait times more 'implicitly'
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options #to use properties of the chrome webbrowser

import random
from joblib import Parallel, delayed # for parallelizing
from tqdm import tqdm # this provides a visual progress tracker for loops

## loading list of unsuccessful pages' URLs

In [None]:
# Load the CSV listing the pages that failed to scrape earlier.
unsuccessful_pages = pd.read_csv('./unsuccessfully_scraped_pages.csv')

In [None]:
# Show the loaded table as the cell output.
unsuccessful_pages

## setting up scraper for these pages

In [None]:
# Launch the Chrome session that the scraping functions below use through the global `driver`.
# NOTE(review): `options.headless = ...` and the positional driver-path argument look like
# the Selenium 3-era API; confirm they still work with the Selenium version that is installed.
options = Options()
options.headless = True # True hides the navigating of the browser by the scraper, False shows you the tab/window opening and stuff getting clicked
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

In [None]:
def get_sector_urls(base_url):
    """Collect the URL of every sector linked from the page at `base_url`.

    Loads `base_url` in the global `driver`, reads the ``href`` of every
    element with the CSS class ``bluelink11px`` and appends
    ``?per_page=100`` to each of them.

    Args:
        base_url: address of the page that lists the sectors.

    Returns:
        A list of sector URLs, in the order the links appear on the page.
    """
    driver.get(base_url)
    # `find_elements(By..., ...)` replaces `find_elements_by_class_name`,
    # which newer Selenium releases no longer provide.
    sector_links = driver.find_elements(By.CLASS_NAME, 'bluelink11px')
    return [link.get_attribute('href') + '?per_page=100' for link in sector_links]


def get_num_of_pages_for_sector(sector_num):
    """Return the number of result pages for one sector.

    Opens the sector's first page, clicks the first link whose text contains
    'Last', and reads the page count from the final path segment of the URL
    the browser ends up on.

    Args:
        sector_num: index into the global `sector_urls` list.
            NOTE(review): `sector_urls` is not a parameter; it has to exist at
            module level (presumably the result of `get_sector_urls`) before
            this function is called.

    Returns:
        The page count as an int.

    Raises:
        IndexError: if the page has no link containing 'Last'.
        ValueError: if the final path segment of the resulting URL is not a number.
    """
    sector_url = sector_urls[sector_num]
    driver.get(sector_url)
    # `find_elements(By..., ...)` replaces `find_elements_by_partial_link_text`,
    # which newer Selenium releases no longer provide.
    driver.find_elements(By.PARTIAL_LINK_TEXT, 'Last')[0].click()
    last_page_path = driver.current_url.split('?')[0]  # drop the query string
    return int(last_page_path.rsplit('/', 1)[1])

def get_page_url(sector_url, page_num):
    """Build the URL of page `page_num` of a sector from any of its page URLs.

    The query string of `sector_url` is discarded and the page number at the
    end of the path is replaced by `page_num`; ``?per_page=100`` is appended.

    Args:
        sector_url: URL of one page of the sector, with or without a query string.
        page_num: page number to put at the end of the path.

    Returns:
        The URL of the requested page as a string.
    """
    base_url = sector_url.split('?', 1)[0]
    prefix, slash, last_segment = base_url.rpartition('/')
    if slash and last_segment.isdigit():
        # Replace the whole trailing page number, so a multi-digit page
        # ('.../12') is handled as well as a single-digit one ('.../1').
        stem = prefix + slash
    else:
        # No numeric trailing segment: keep the historical behaviour of
        # dropping just the final character.
        stem = base_url[:-1]
    return f'{stem}{page_num}?per_page=100'

# (output column, id of the element inside the NGO pop-up, attribute to read from it).
# NOTE(review): the previous version also read the '#mobile_n' element but never
# stored it; it is left out here so that the output columns stay unchanged.
_NGO_POPUP_FIELDS = [
    ('ngo_name', 'ngo_name_title', 'innerHTML'),
    ('unique_id', 'UniqueID', 'innerHTML'),
    ('registered_with', 'reg_with', 'innerHTML'),
    ('type_of_ngo', 'ngo_type', 'innerHTML'),
    ('registration_number', 'ngo_regno', 'innerHTML'),
    ('copy_of_registration_certificate', 'rc_upload', 'innerHTML'),
    ('copy_of_pan_card', 'pc_upload', 'innerHTML'),
    ('act_name', 'ngo_act_name', 'innerHTML'),
    ('city_of_registration', 'ngo_city_p', 'innerHTML'),
    ('state_of_registration', 'ngo_state_p', 'innerHTML'),
    ('registration_date', 'ngo_reg_date', 'innerHTML'),
    ('key_issues', 'key_issues', 'innerHTML'),
    ('operational_areas_states', 'operational_states', 'innerHTML'),
    ('operational_areas_districts', 'operational_district', 'innerHTML'),
    ('FCRA_details', 'FCRA_details', 'innerHTML'),
    ('FCRA_registration_num', 'FCRA_reg_no', 'innerHTML'),
    ('details_of_achievement', 'activities_achieve', 'innerHTML'),
    ('contact_details_address', 'address', 'innerHTML'),
    ('contact_details_city', 'city', 'innerHTML'),
    ('contact_details_state', 'state_p_ngo', 'innerHTML'),
    ('contact_details_telephone', 'phone_n', 'innerHTML'),
    ('contact_details_website', 'ngo_web_url', 'innerText'),
    ('contact_details_email', 'email_n', 'innerHTML'),
]


def _read_popup_fields():
    """Return {output column: text} for every field of the currently open NGO pop-up."""
    return {
        column: driver.find_element(By.ID, element_id).get_attribute(attribute)
        for column, element_id, attribute in _NGO_POPUP_FIELDS
    }


def _read_table_columns(table_id, n_columns):
    """Return the cells of the table with id `table_id` as `n_columns` lists, one per column."""
    table = driver.find_element(By.ID, table_id)
    # Query the cells once and slice, instead of re-querying the table per column.
    cells = [cell.get_attribute('innerHTML') for cell in table.find_elements(By.XPATH, './/tr//td')]
    return [cells[offset::n_columns] for offset in range(n_columns)]


def _checkpoint_path(page_url, sector_num, page_num):
    """Return the CSV path used to save a page's progress after each NGO."""
    if sector_num is not None and page_num is not None:
        return f'./sectorno_{sector_num}_pageno_{page_num}_ngo_scraping.csv'
    # Without explicit numbers, derive a unique file name from the URL path.
    url_without_query = page_url.split('?', 1)[0]
    slug = ''.join(char if char.isalnum() else '_' for char in url_without_query)
    return f'./{slug}_ngo_scraping.csv'


def scrape_a_single_page(page_url, unsuccessful_pages_df, sector_num=None, page_num=None):
    """Scrape every NGO listed on one results page into a dataframe.

    For each NGO link on the page, the function opens the NGO's pop-up, reads
    its detail fields, members table and source-of-funds table, and closes the
    pop-up again. After every NGO the rows collected so far are written to a
    CSV file, so a failure part-way through does not lose earlier rows.

    Args:
        page_url: URL of the results page to scrape.
        unsuccessful_pages_df: dataframe with a `page_url` column; its other
            columns are attached to every scraped row via a left merge on
            `page_url`.
        sector_num: optional sector number used in the progress file name.
        page_num: optional page number used in the progress file name.
            When either number is missing, the file name is derived from
            `page_url` instead. (The function used to read both names from
            undefined globals.)

    Returns:
        A dataframe with one row per NGO (empty if the page lists none).
        The members and source-of-funds columns hold dictionaries; the
        members ones are keyed by member name and the source-of-funds ones by
        financial year, so a repeated key keeps only its last row.
    """
    checkpoint_path = _checkpoint_path(page_url, sector_num, page_num)
    driver.get(page_url)
    ngos_on_page = driver.find_elements(By.XPATH, "//a[contains(@onclick,'show_ngo_info')]")

    ngo_frames = []
    page_df = pd.DataFrame()
    for position, ngo in enumerate(ngos_on_page, start=1):
        print(f'scraping NGO number {position}')
        time.sleep(2)
        ngo.click()
        time.sleep(2)  # pause after the click before reading the pop-up

        record = _read_popup_fields()

        member_names, member_designations, member_pan, member_aadhar = _read_table_columns('member_table', 4)
        record['members_names_designations'] = dict(zip(member_names, member_designations))
        record['members_names_pan_availability'] = dict(zip(member_names, member_pan))
        record['members_names_aadhar_availability'] = dict(zip(member_names, member_aadhar))

        dept_name, source, financial_year, amount_sanctioned, purpose = _read_table_columns('source_table', 5)
        record['source_of_funds_amount_sanctioned'] = dict(zip(financial_year, amount_sanctioned))
        record['source_of_funds_department_name'] = dict(zip(financial_year, dept_name))
        record['source_of_funds_source'] = dict(zip(financial_year, source))
        record['source_of_funds_purpose'] = dict(zip(financial_year, purpose))

        # `page_url` is only the merge key for the page-level columns.
        record['page_url'] = page_url
        ngo_df = pd.DataFrame([record])
        ngo_df = ngo_df.merge(unsuccessful_pages_df, on='page_url', how='left')
        ngo_df = ngo_df.drop('page_url', axis=1)

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        ngo_frames.append(ngo_df)
        page_df = pd.concat(ngo_frames)
        page_df.to_csv(checkpoint_path, index=False)

        close_button = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#ngo_info_modal > div.modal-dialog.modal-lg > div > div.modal-header > button')))
        time.sleep(2)
        close_button.click()
    return page_df

In [None]:
# Scrape every page that failed earlier and stack the per-page results.
# The URLs come from the table loaded above (`unsuccessful_pages`); the same
# table is passed on because `scrape_a_single_page` merges each scraped row
# with the matching `page_url` row.
urls_to_scrape = unsuccessful_pages['page_url'].tolist()
page_frames = []
for url in tqdm(urls_to_scrape):
    page_df = scrape_a_single_page(url, unsuccessful_pages)
    if not page_df.empty:
        page_frames.append(page_df)
# pd.concat replaces DataFrame.append (removed in pandas 2.0); it cannot take
# an empty list, hence the fallback to an empty dataframe.
all_pages_df = pd.concat(page_frames) if page_frames else pd.DataFrame()

In [None]:
# Show the combined scrape results as the cell output.
all_pages_df

### code for removing curly brackets from the dictionaries

In [None]:
# Replace the file name with whichever CSV has the columns to strip curly brackets from.
test_df = pd.read_csv('sectorno_0_pageno_10_ngo_scraping.csv')

In [None]:
def remove_curly_brackets(input_dict):
    """Return the string form of `input_dict` with every '{' and '}' removed.

    The argument is converted with `str()` first, so any value is accepted,
    not only dictionaries.
    """
    brace_stripper = str.maketrans('', '', '{}')
    return str(input_dict).translate(brace_stripper)

In [None]:
# Pick the columns to clean by name: the members columns first, then the source-of-funds columns.
member_cols = [column for column in test_df.columns if 'members' in column]
funds_cols = [column for column in test_df.columns if 'source_of_funds' in column]
dictionary_cols = [*member_cols, *funds_cols]

In [None]:
# Strip the curly brackets from every cell of the selected columns, one column at a time.
for col in dictionary_cols:
    cleaned_column = test_df[col].apply(remove_curly_brackets)
    test_df[col] = cleaned_column

In [None]:
# Show the cleaned table as the cell output.
test_df