## Aim

I want to improve the script I wrote yesterday. Specifically, I want to scrape the session information for each paper and fix the mistakes in that script.

In [1]:
import pandas as pd
import numpy as np
import time 
import math
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import Select

In [2]:
def click_on_view_program():
    """Click the first main-menu link whose visible text contains 'Program'.

    Uses the notebook-level ``driver``.

    Raises:
        RuntimeError: if none of the main-menu links contains 'Program'.
            (Previously this case surfaced as a confusing UnboundLocalError.)
    """
    menu_links = driver.find_elements(
        By.CSS_SELECTOR,
        "div.menu_item__icon_text_window__text > a.mainmenu_text"
    )
    # First match wins, mirroring the original loop-and-break
    view_program_btn = next(
        (link for link in menu_links if 'Program' in link.text), None
    )
    if view_program_btn is None:
        raise RuntimeError(
            "No main-menu link containing 'Program' was found on the current page"
        )
    view_program_btn.click()

In [3]:
def click_on_individual_presentations():
    """Open the 'individual presentations' tab.

    Blocks on the notebook-level ``wait`` until the second
    ``td.tab_topped_window__tab_cell`` is clickable, then clicks it.
    """
    tab_locator = (By.XPATH, '//td[@class="tab_topped_window__tab_cell"][2]')
    tab = wait.until(EC.element_to_be_clickable(tab_locator))
    tab.click()

In [4]:
def get_papers():
    """
    Return the list of paper row elements found on the current page.

    A row is matched if it carries either the light or the dark
    worksheet-row class. Uses the notebook-level ``driver``.
    """
    row_selector = ', '.join([
        'tr.worksheet_window__row__light',
        'tr.worksheet_window__row__dark',
    ])
    return driver.find_elements(By.CSS_SELECTOR, row_selector)

def _strip_label(text, label):
    """Return ``text`` without its leading ``label`` (and surrounding leading blanks).

    ``str.lstrip(label)`` must NOT be used for this: it removes any leading
    run of the *characters* in ``label``, so a value such as
    'Issues in Media' after 'In Session Submission: ' would lose 'Issues in '.
    If ``text`` does not start with ``label`` it is returned with only its
    leading whitespace removed.
    """
    text = text.lstrip()
    if text.startswith(label):
        return text[len(label):].lstrip()
    return text


def get_paper_meta(paper, year, paper_meta_dict_list):
    """
    Collect the list-page metadata of one paper row.

    Reads the row index, title, session, division and submission type.
    Author names are also visible here but are collected later from the
    paper's view page.

    Args:
        paper: row element of one paper (as returned by ``get_papers``).
        year: year as a string; it is joined with the row index to build
            the paper id, e.g. '2005' and '1' give '2005-1'.
        paper_meta_dict_list: list that receives the new metadata dict
            (mutated in place).

    Returns:
        The metadata dict that was appended to ``paper_meta_dict_list``.
    """
    idx = paper.find_element(
        By.CSS_SELECTOR, 'td[title="##"]').text
    paper_id = year + '-' + idx
    # summary elements:
    summary = paper.find_element(
        By.CSS_SELECTOR, 'td[title="Summary"]'
    )
    title = summary.find_element(
        By.CSS_SELECTOR, 'a.search_headingtext'
    ).text
    # The first three rows hold, in order: session, division, submission type
    info_rows = summary.find_elements(
        By.CSS_SELECTOR, 'td[style="padding: 5px;"] tr'
    )
    session = _strip_label(info_rows[0].text, 'In Session Submission:')
    division = _strip_label(info_rows[1].text, 'Session Submission Division:')
    submission_type = _strip_label(info_rows[2].text, 'Individual Submission type:')
    paper_meta_dict = {
        'Paper ID': paper_id,
        'Title': title,
        'Session': session,
        'Division': division,
        # NOTE: key spelling ('Sumission') kept as-is so the CSV column name
        # stays the same as in earlier output
        'Sumission Type': submission_type
    }
    # update the dict list
    paper_meta_dict_list.append(paper_meta_dict)
    return paper_meta_dict

def open_view(paper):
    """
    Load a paper's 'view' page in a second browser window.

    Input:
        paper element (one row of the results list)
    Effect:
        reads the link inside the row's "Action" cell, opens a blank window,
        switches the notebook-level ``driver`` to ``window_handles[1]`` and
        navigates there to the link's href.
    """
    action_cell = paper.find_element(By.CSS_SELECTOR, 'td[title="Action"]')
    detail_url = action_cell.find_element(
        By.CSS_SELECTOR, "li.action_list > a.fieldtext"
    ).get_attribute('href')
    # Blank window first, then focus it, then load the page inside it
    driver.execute_script("window.open('');")
    detail_handle = driver.window_handles[1]
    driver.switch_to.window(detail_handle)
    driver.get(detail_url)

def get_title_to_check(paper_meta_dict_list):
    """Read the title shown on the open view page and store it for cross-checking.

    The page has two 'tr.header font.headingtext' elements; the title is the
    second one. It is written to the 'Title to Check' key of the most recent
    entry in ``paper_meta_dict_list`` and also returned.
    """
    headings = driver.find_elements(By.CSS_SELECTOR, 'tr.header font.headingtext')
    detail_title = headings[1].text
    latest_meta = paper_meta_dict_list[-1]
    latest_meta['Title to Check'] = detail_title
    return detail_title


def get_authors(paper_meta_dict, author_dict_list):
    """Append one record per author listed on the currently open view page.

    Each author link's text is expected to look like 'Name (Affiliation)';
    when there is no ' (' the affiliation is recorded as NaN.

    Args:
        paper_meta_dict: metadata dict of the paper; 'Paper ID' and 'Title'
            are copied into every author record.
        author_dict_list: list that receives the author dicts (mutated in place).
    """
    paper_id, title = paper_meta_dict['Paper ID'], paper_meta_dict['Title']
    # find_elements returns a list since there might be multiple authors
    authors = driver.find_elements(
        By.CSS_SELECTOR, 'a.search_fieldtext_name'
    )
    author_count = len(authors)
    for author_idx, author in enumerate(authors, start=1):
        # Split on the FIRST ' (' only, so an affiliation that itself contains
        # parentheses, e.g. 'Univ of X (Main Campus)', is kept whole.
        author_name, separator, author_aff = author.text.partition(' (')
        if separator:
            # drop only the single closing parenthesis that wraps the affiliation
            if author_aff.endswith(')'):
                author_aff = author_aff[:-1]
        else:
            # some entries don't contain '()', i.e., no affiliation info
            author_aff = np.nan
        author_dict_list.append({
            'Paper ID': paper_id,
            'Paper Title': title,
            'Number of Authors': author_count,
            'Author Position': author_idx,
            'Author Name': author_name,
            'Author Affiliation': author_aff,
        })

def get_abstract(paper_meta_dict_list):
    """Read the abstract text from the open view page.

    The text is stored under the 'Abstract' key of the most recent entry in
    ``paper_meta_dict_list`` and also returned.
    """
    abstract_element = driver.find_element(
        By.CSS_SELECTOR, 'blockquote.tight > font.fieldtext'
    )
    abstract_text = abstract_element.text
    latest_meta = paper_meta_dict_list[-1]
    latest_meta['Abstract'] = abstract_text
    return abstract_text

def scrape_one_page(year, page_num, max_papers=1):
    """Scrape papers listed on the current results page.

    For each paper: collect the list-page metadata, open its view page in a
    second window, collect title-to-check / authors / abstract, then close
    that window and return to the list. Results are appended to the
    notebook-level ``paper_meta_dict_list`` and ``author_dict_list``.

    Args:
        year: year string used as the paper-id prefix.
        page_num: page number, used only in the progress message.
        max_papers: how many papers from the top of the page to scrape.
            Defaults to 1, which is what the previously hard-coded
            ``papers[0:1]`` did; pass ``None`` to scrape every paper.
    """
    papers = get_papers()
    if max_papers is not None:
        papers = papers[:max_papers]
    for paper_idx, paper in enumerate(papers, start=1):
        paper_meta_dict = get_paper_meta(paper, year, paper_meta_dict_list)
        open_view(paper)
        try:
            get_title_to_check(paper_meta_dict_list)
            get_authors(paper_meta_dict, author_dict_list)
            get_abstract(paper_meta_dict_list)
        finally:
            # Always close the view window and refocus the list window, so a
            # failed scrape does not leave the driver pointed at the wrong one
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
        print(f'Page {page_num} Paper {paper_idx} is done')
        time.sleep(0.5)

In [5]:
def get_iterators():
    """Return the pager link elements of the current page.

    There are two ``div.iterator`` bars; ``:nth-child(1)`` limits the match
    to one of them, since only one is needed.
    """
    pager_selector = "div.iterator:nth-child(1) > form a.fieldtext"
    return driver.find_elements(By.CSS_SELECTOR, pager_selector)

In [6]:
# Start the Selenium-controlled Firefox session. The helper functions defined
# above read `driver` (and `wait`) as notebook-level globals, so this cell must
# run before any of them is called.
driver = webdriver.Firefox()
# Explicit-wait helper with a 10-second timeout; used by
# click_on_individual_presentations() to wait for the tab to be clickable.
wait = WebDriverWait(driver, 10)

In [7]:
# Entry page used for interactive exploration below. The trailing "05" is a
# two-digit year suffix (presumably the 2005 meeting - the yearly loop further
# down builds the same URL pattern from a base plus a zero-padded year).
url = 'https://convention2.allacademic.com/one/ica/ica05'
driver.get(url)

In [8]:
# Navigate from the entry page to the paper list: first the main-menu link
# containing 'Program', then the 'individual presentations' tab.
click_on_view_program()
click_on_individual_presentations()

In [15]:
# Grab the paper rows of the current page and keep the first one as a sample
# for testing the selectors in the cells below.
papers = get_papers()
paper = papers[0]

In [16]:
# Row index of the sample paper, read from the cell whose title attribute is "##"
# (get_paper_meta combines it with the year to build the paper id).
idx = paper.find_element(
    By.CSS_SELECTOR, 'td[title="##"]').text
idx

'1'

In [17]:
# The "Summary" cell of the row holds the title link and the
# session / division / submission-type rows inspected in the next cells.
summary = paper.find_element(
        By.CSS_SELECTOR, 'td[title="Summary"]'
    )
title = summary.find_element(
    By.CSS_SELECTOR, 'a.search_headingtext'
).text
# NOTE(review): the displayed title contains '\x93' / '\x94' - these look like
# Windows-1252 curly quotes that were decoded with the wrong encoding; confirm
# and normalise before analysis.
title

'\x93A Beard and a Pasty Forehead\x94: Collective Memory of the First Kennedy-Nixon Debate'

In [20]:
# All table rows inside the padded cell of the summary; the first three are
# used, in this order, as session, division and submission type.
session_division_submitType = summary.find_elements(
    By.CSS_SELECTOR, 'td[style="padding: 5px;"] tr'
)
# These are still elements here; their text is extracted in the following cells.
session = session_division_submitType[0]
division = session_division_submitType[1]
submission_type = session_division_submitType[2]

In [21]:
# Remove the "In Session Submission:" label from the row text.
# str.lstrip(chars) strips a *set of characters*, not a prefix, so it would also
# eat the start of any session name made of those letters (e.g. "Issues in ...").
# Reading from session_division_submitType (not from `session` itself) keeps
# this cell re-runnable after `session` has become a string.
session = re.sub(
    r'^\s*In Session Submission:\s*', '', session_division_submitType[0].text
)
session

'Campaign Communications: A Look at the Old and the New'

In [24]:
# Remove the "Session Submission Division:" label from the row text.
# A real prefix match is used because str.lstrip(chars) strips a set of
# characters and would truncate division names that start with those letters.
# Reading from session_division_submitType keeps this cell re-runnable.
division = re.sub(
    r'^\s*Session Submission Division:\s*', '', session_division_submitType[1].text
)
division

'Political Communication'

In [26]:
# Remove the "Individual Submission type:" label from the row text.
# A real prefix match is used because str.lstrip(chars) strips a set of
# characters and would truncate values that start with those letters.
# Reading from session_division_submitType keeps this cell re-runnable.
submission_type = re.sub(
    r'^\s*Individual Submission type:\s*', '', session_division_submitType[2].text
)
submission_type

'Paper'

In [188]:
# Scrape every conference year: for each year walk through all result pages,
# scrape each page, then write one paper CSV and one author CSV for that year.
urlBase = 'https://convention2.allacademic.com/one/ica/ica'
years = range(5,14)
for year in years:
    # The site URL ends in a two-digit year suffix, e.g. "ica05"
    year = str(year).zfill(2)
    url = urlBase + year
    driver.get(url)
    click_on_view_program()
    click_on_individual_presentations()
    # scrape_one_page() appends to these notebook-level lists; start fresh per year
    paper_meta_dict_list = []
    author_dict_list = []
    iterators = get_iterators()
    # The second-to-last pager link is taken to be the last page number
    total_pages = int(iterators[-2].text)
    for i in range(1,total_pages+1):
        print(f'going to solve page {i}')
        # Bring the link for page i into the pager before looking for it.
        # NOTE(review): the 10 / 17 / 27 thresholds are carried over unchanged;
        # the recorded run was interrupted right after page 16, so the '+ 20'
        # and last-page branches have not been seen working - TODO confirm.
        if i < 10:
            jump_label = None
        elif i < 17:
            jump_label = '+ 10'
        elif i < 27:
            jump_label = '+ 20'
        else:
            jump_label = None
            iterators = get_iterators()
            iterators[-2].click()
        if jump_label is not None:
            select = Select(driver.find_element(
                By.XPATH, '//child::div[@class="iterator"][1] // select'
            ))
            select.select_by_visible_text(jump_label)
        # Re-read the pager (it may have been redrawn) and click page i if listed
        iterators = get_iterators()
        for j in iterators:
            if (j.text == str(i)):
                j.click()
                break
        # Paper ids are '<4-digit year>-<row index>', e.g. '2005-1'.
        # (The previous f'20+{year}' produced ids such as '20+05-1'.)
        scrape_one_page(f'20{year}', i)
        print(f'page {i} is done')
        # Click the second pager link before handling the next page
        # (presumably this resets the pager to its first window - TODO confirm)
        iterators = get_iterators()
        iterators[1].click()
    print(f'{year} is done!')
    paper_df = pd.DataFrame(paper_meta_dict_list)
    author_df = pd.DataFrame(author_dict_list)
    paper_df.to_csv(f'../data/interim/paper_df_{year}.csv', index = False)
    author_df.to_csv(f'../data/interim/author_df_{year}.csv', index = False)
# quit() closes every window and ends the session; a separate close() first is redundant
driver.quit()

going to solve page 1
Page 1 Paper 1 is done
page 1 is done
going to solve page 2
Page 2 Paper 1 is done
page 2 is done
going to solve page 3
Page 3 Paper 1 is done
page 3 is done
going to solve page 4
Page 4 Paper 1 is done
page 4 is done
going to solve page 5
Page 5 Paper 1 is done
page 5 is done
going to solve page 6
Page 6 Paper 1 is done
page 6 is done
going to solve page 7
Page 7 Paper 1 is done
page 7 is done
going to solve page 8
Page 8 Paper 1 is done
page 8 is done
going to solve page 9
Page 9 Paper 1 is done
page 9 is done
going to solve page 10
Page 10 Paper 1 is done
page 10 is done
going to solve page 11
Page 11 Paper 1 is done
page 11 is done
going to solve page 12
Page 12 Paper 1 is done
page 12 is done
going to solve page 13
Page 13 Paper 1 is done
page 13 is done
going to solve page 14
Page 14 Paper 1 is done
page 14 is done
going to solve page 15
Page 15 Paper 1 is done
page 15 is done
going to solve page 16
Page 16 Paper 1 is done
page 16 is done
going to solve page

KeyboardInterrupt: 

In [91]:
# def get_all_labels(): 
#     labels = driver.find_elements(
#         By.CSS_SELECTOR, "label.ui-button"
#     )
#     return labels[2:]

In [92]:
# labels = get_all_labels()
# len(labels)
# labels[0].text
# labels[0].click()

In [93]:
# iterators = get_iterators() # used to calculate total pages
# len(iterators)
# total_pages = int(iterators[-2].text)
# total_pages

In [94]:
# for i in range(1,total_pages+1):
#     page_num = i
#     iterators = get_iterators()
#     for j in iterators:
#         if (j.text == str(i)):
#             j.click()
#             break 
#     scrape_one_page('2005',page_num)
#     print(f'page {i} is done')
#     if i < total_pages - 1:
#         labels[0].click()
#     if i == total_pages - 1:
#         iterators = get_iterators()
#         iterators[1].click()

In [95]:
# labels = get_all_labels()
# for char in labels:
#     char.click()
#     papers = get_papers()
#     if papers:
#         iterators = get_iterators() # used to calculate total pages
#         total_pages = int(iterators[-2].text)
#     else:
#         total_pages = np.nan
#     year = '2005'
#     if not math.isnan(total_pages):
#         for i in range(1,total_pages+1):
#             page_num = i
#             iterators = get_iterators()
#             for j in iterators:
#                 if (j.text == str(i)):
#                     j.click()
#                     break 
#             scrape_one_page(year,page_num)
#             print(f'page {i} is done')
#             if i < total_pages - 1:
#                 driver.refresh()
#             if i == total_pages - 1:
#                 iterators = get_iterators()
#                 iterators[1].click()
#     print(f'{char.text} is done')