In [2]:
import requests
from bs4 import BeautifulSoup
import re
import csv

In [3]:
# https://www.dataquest.io/blog/web-scraping-tutorial-python/
def get_titles(weblink, timeout=30):
	"""Download a page and return the text of every listed item title.

	Args:
		weblink: URL of the page to fetch.
		timeout: Seconds to wait for the server before giving up. Without
			a timeout ``requests.get`` may block indefinitely.

	Returns:
		A list of strings, one for each ``<a>`` element matched by the
		CSS selector ``h4.artifact-title a``.

	Raises:
		requests.exceptions.HTTPError: If the response status is 4xx/5xx.
		requests.exceptions.Timeout: If the server does not answer in time.
	"""
	page = requests.get(weblink, timeout=timeout)
	# Fail loudly on an error response instead of parsing the error page
	# as though it were a normal listing.
	page.raise_for_status()
	soup = BeautifulSoup(page.content, 'html.parser')
	return [anchor.get_text() for anchor in soup.select('h4.artifact-title a')]

# The two browse pages to scrape; the URLs differ only in their offset (0 and 20).
page_urls = (
	"https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=0&etal=-1&order=ASC",
	"https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=20&etal=-1&order=ASC",
)
p1titles, p2titles = (get_titles(url) for url in page_urls)

# Single combined list of every title found on the two pages.
web_titles = p1titles + p2titles


In [4]:
def get_alpha_titles(titleslist):
	"""Reduce each string to its word tokens joined by single spaces.

	A token is a run of word characters (letters, digits, underscore) and
	apostrophes; every other character acts as a separator and is dropped.

	Args:
		titleslist: Iterable of strings to normalize.

	Returns:
		A new list with one normalized string per input string.
	"""
	word_pattern = re.compile(r"[\w']+")
	return [' '.join(word_pattern.findall(title)) for title in titleslist]

In [5]:
def load_csv_titles(filename):
	"""Return the ``title`` column of a CSV file as a list of strings.

	The first row of the file is treated as the header.

	Args:
		filename: Path of the CSV file to read.

	Returns:
		A list holding the ``title`` value of every data row, in file order.

	Raises:
		KeyError: If a data row is read and the header has no ``title`` column.
	"""
	# newline='' hands line-ending handling to the csv module, as its docs
	# require, so line breaks inside quoted fields are kept intact.
	# 'utf-8-sig' reads plain UTF-8 and also strips a leading byte-order
	# mark, which would otherwise turn the first header into '\ufefftitle'.
	with open(filename, 'r', encoding='utf-8-sig', newline='') as file:
		reader = csv.DictReader(file)
		return [row['title'] for row in reader]


In [6]:

csv_titles = load_csv_titles('./data/mais.csv')

# Normalize both sides the same way so punctuation differences do not
# cause false mismatches.
alpha_wt = get_alpha_titles(web_titles)
alpha_ct = get_alpha_titles(csv_titles)

# Set copy gives constant-time membership tests; alpha_ct itself stays a list.
alpha_ct_lookup = set(alpha_ct)

# Web titles with no exact normalized match in the CSV, in web order.
missing_titles = [title for title in alpha_wt if title not in alpha_ct_lookup]

In [10]:
# Display the normalized web titles that have no exact match in the CSV.
missing_titles

["Modernizing the Greek Tragedy Clint Eastwood's Impact on the Western",
 'The War With No End Sentencing Disparities in the War on Drugs and National Trends that are Defining a Nation',
 'Building Sustainable Behavior through Social Marketing Encouraging Reusable Shopping Bag Use at Stadium Thriftway in Tacoma WA A Case Study',
 'David and Goliath Individualism and Liberty in the Italian Renaissance and the American Revolution',
 'Master of Interdisaplinary Studies',
 'Sexual Violence in a Native American Community Native American Women Speak Out',
 'These are the Ghettos of Washington Public Housing and Neoliberalization in Tacoma WA',
 "Critical reflections on and in the field The Study of Religion' and the methodology of true reflexive praxis in Puerto Rico",
 'Digital Activism How Social Media Prevalence has Impacted Modern Activism',
 'HIVAIDS Social Stigma and Visual Art']

In [14]:
def get_web_info(weblink, timeout=30):
	"""Scrape title, link and author tokens for each item on a page.

	Args:
		weblink: URL of the page to fetch.
		timeout: Seconds to wait for the server before giving up. Without
			a timeout ``requests.get`` may block indefinitely.

	Returns:
		A list of dicts with the keys ``'title'`` (text normalized by
		``get_alpha_titles``), ``'link'`` (the anchor's ``href`` prefixed
		with the site root) and ``'author'`` (a list of ASCII-letter
		tokens taken from the item's ``div.artifact-info`` text).

	Raises:
		requests.exceptions.HTTPError: If the response status is 4xx/5xx.
		requests.exceptions.Timeout: If the server does not answer in time.
	"""
	page = requests.get(weblink, timeout=timeout)
	# Fail loudly on an error response instead of parsing the error page
	# as though it were a normal listing.
	page.raise_for_status()
	soup = BeautifulSoup(page.content, 'html.parser')

	title_items = soup.select('h4.artifact-title a')
	alpha_titles = get_alpha_titles([i.get_text() for i in title_items])

	# The prefix is hard-coded, so this assumes every href is site-relative.
	link_items = soup.select('h4.artifact-title a[href]')
	full_link_text = ['https://digital.lib.washington.edu' + i['href'] for i in link_items]

	au_items = soup.select('div.artifact-info')
	alphanum_au = get_alpha_titles([i.get_text() for i in au_items])
	# Keeps only runs of ASCII letters, so digits are dropped and names
	# containing apostrophes or non-ASCII letters are split into pieces.
	# NOTE(review): '%' cannot occur here because get_alpha_titles has
	# already removed it - confirm whether it can be dropped from the class.
	aulist_list = [re.findall('[%A-Za-z]+', i) for i in alphanum_au]

	keys = ['title','link','author']
	# zip() stops at the shortest list, so records are paired by position
	# and any surplus entries are silently discarded.
	# NOTE(review): presumably each item has exactly one title anchor and
	# one artifact-info div - verify against the page markup.
	return [dict(zip(keys, v)) for v in zip(alpha_titles, full_link_text, aulist_list)]

In [15]:
# Scrape the two browse pages (offsets 0 and 20) and merge their records.
info_urls = [
	"https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=0&etal=-1&order=ASC",
	"https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=20&etal=-1&order=ASC",
]
p1, p2 = map(get_web_info, info_urls)
web_info = p1 + p2

In [16]:
# Keep only the scraped records whose normalized title is in missing_titles.
missing_info = []
for record in web_info:
	if record['title'] in missing_titles:
		missing_info.append(record)
missing_info

[{'author': ['Williams', 'Jacob', 'A'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/20881',
  'title': "Modernizing the Greek Tragedy Clint Eastwood's Impact on the Western"},
 {'author': ['Campbell', 'Crystal'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/23386',
  'title': 'The War With No End Sentencing Disparities in the War on Drugs and National Trends that are Defining a Nation'},
 {'author': ['Laakso', 'Alysen', 'Kristen'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/24105',
  'title': 'Building Sustainable Behavior through Social Marketing Encouraging Reusable Shopping Bag Use at Stadium Thriftway in Tacoma WA A Case Study'},
 {'author': ['McConnell', 'Jennifer'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/24106',
  'title': 'David and Goliath Individualism and Liberty in the Italian Renaissance and the American Revolution'},
 {'author': ['Panzer', 'Sean', 'Robert'],