In [2]:
import requests
from bs4 import BeautifulSoup
import re
import csv

In [3]:
# https://www.dataquest.io/blog/web-scraping-tutorial-python/
def get_titles(weblink, timeout=30):
	"""Fetch a browse page and return the text of every item-title link on it.

	Parameters:
		weblink: URL of the page to download.
		timeout: seconds to wait for the server before giving up; passed
			straight to ``requests.get``.

	Returns:
		A list with the text of each ``<a>`` found inside an
		``<h4 class="artifact-title">`` element, in document order.

	Raises:
		requests.HTTPError: if the server answers with a 4xx/5xx status.
	"""
	# Without a timeout requests waits indefinitely on an unresponsive server.
	page = requests.get(weblink, timeout=timeout)
	# Fail loudly instead of parsing an error page into an empty title list.
	page.raise_for_status()
	soup = BeautifulSoup(page.content, 'html.parser')
	return [anchor.get_text() for anchor in soup.select('h4.artifact-title a')]

# Scrape the two browse pages (offset=0 and offset=20), first page first.
p1titles, p2titles = (
	get_titles("https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=0&etal=-1&order=ASC"),
	get_titles("https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=20&etal=-1&order=ASC"),
)

# Single list of titles, page 1 entries followed by page 2 entries.
web_titles = [*p1titles, *p2titles]


In [4]:
def get_alpha_titles(titleslist):
	"""Normalise every string in ``titleslist`` for comparison.

	Each string is split into tokens made of word characters and apostrophes
	(the pattern ``[\\w']+``); the tokens are then re-joined with single
	spaces, which drops all other punctuation and collapses whitespace.

	Returns a new list with one normalised string per input string, in the
	same order.
	"""
	token_pattern = re.compile(r"[\w']+")
	return [' '.join(token_pattern.findall(title)) for title in titleslist]

In [5]:
def load_csv_titles(filename):
	"""Return the ``title`` column of a UTF-8 encoded CSV file.

	Parameters:
		filename: path of a CSV file whose first row is a header.

	Returns:
		A list holding the ``title`` value of every data row, in file order.

	Raises:
		KeyError: if a row has no ``title`` column.
	"""
	# newline='' hands line-ending handling to the csv module, as its
	# documentation requires; otherwise "\r\n" inside a quoted field is
	# rewritten to "\n" before the reader sees it.
	with open(filename, 'r', encoding='utf-8', newline='') as file:
		return [row['title'] for row in csv.DictReader(file)]


In [6]:

# Titles already recorded in the local CSV file.
csv_titles = load_csv_titles('./data/mais.csv')

# Normalise both sides the same way so punctuation differences do not matter.
alpha_wt = get_alpha_titles(web_titles)
alpha_ct = get_alpha_titles(csv_titles)

# Web titles (normalised) that have no exact match among the CSV titles.
missing_titles = [web_title for web_title in alpha_wt if web_title not in alpha_ct]

In [10]:
# Display the normalised web titles that were not found among the CSV titles.
missing_titles

["Modernizing the Greek Tragedy Clint Eastwood's Impact on the Western",
 'The War With No End Sentencing Disparities in the War on Drugs and National Trends that are Defining a Nation',
 'Building Sustainable Behavior through Social Marketing Encouraging Reusable Shopping Bag Use at Stadium Thriftway in Tacoma WA A Case Study',
 'David and Goliath Individualism and Liberty in the Italian Renaissance and the American Revolution',
 'Master of Interdisaplinary Studies',
 'Sexual Violence in a Native American Community Native American Women Speak Out',
 'These are the Ghettos of Washington Public Housing and Neoliberalization in Tacoma WA',
 "Critical reflections on and in the field The Study of Religion' and the methodology of true reflexive praxis in Puerto Rico",
 'Digital Activism How Social Media Prevalence has Impacted Modern Activism',
 'HIVAIDS Social Stigma and Visual Art']

In [20]:
def get_web_info(weblink, timeout=30):
	"""Scrape title, full-record link and author tokens for each item on a browse page.

	Parameters:
		weblink: URL of the browse page to download.
		timeout: seconds to wait for the server before giving up; passed
			straight to ``requests.get``.

	Returns:
		A list of dicts with the keys ``'title'`` (normalised with
		``get_alpha_titles``), ``'link'`` (site prefix + the anchor's href +
		``'?show=full'``) and ``'author'`` (list of alphabetic tokens taken
		from the item's ``div.artifact-info`` text).

	Raises:
		requests.HTTPError: if the server answers with a 4xx/5xx status.
	"""
	# Without a timeout requests waits indefinitely on an unresponsive server.
	page = requests.get(weblink, timeout=timeout)
	# Fail loudly instead of parsing an error page into an empty result.
	page.raise_for_status()
	soup = BeautifulSoup(page.content, 'html.parser')

	# Take title and link from the SAME anchors so they cannot drift out of
	# step if some title anchor happens to lack an href attribute.
	anchors = soup.select('h4.artifact-title a[href]')
	alpha_titles = get_alpha_titles([anchor.get_text() for anchor in anchors])
	full_link_text = [
		'https://digital.lib.washington.edu' + anchor['href'] + '?show=full'
		for anchor in anchors
	]

	# NOTE(review): authors are still matched to titles purely by position;
	# this assumes exactly one div.artifact-info per listed item - confirm.
	au_text = [block.get_text() for block in soup.select('div.artifact-info')]
	aulist_list = [re.findall('[%A-Za-z]+', text) for text in get_alpha_titles(au_text)]

	return [
		{'title': title, 'link': link, 'author': author}
		for title, link, author in zip(alpha_titles, full_link_text, aulist_list)
	]

In [21]:
# Scrape the two browse pages (offset=0 and offset=20), first page first.
p1, p2 = (
    get_web_info("https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=0&etal=-1&order=ASC"),
    get_web_info("https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=20&etal=-1&order=ASC"),
)
# All scraped records, page 1 entries followed by page 2 entries.
web_info = [*p1, *p2]

In [22]:
# Keep only the scraped records whose normalised title is in missing_titles,
# then display them.
missing_info = list(filter(lambda record: record['title'] in missing_titles, web_info))
missing_info

[{'author': ['Williams', 'Jacob', 'A'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/20881?show=full',
  'title': "Modernizing the Greek Tragedy Clint Eastwood's Impact on the Western"},
 {'author': ['Campbell', 'Crystal'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/23386?show=full',
  'title': 'The War With No End Sentencing Disparities in the War on Drugs and National Trends that are Defining a Nation'},
 {'author': ['Laakso', 'Alysen', 'Kristen'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/24105?show=full',
  'title': 'Building Sustainable Behavior through Social Marketing Encouraging Reusable Shopping Bag Use at Stadium Thriftway in Tacoma WA A Case Study'},
 {'author': ['McConnell', 'Jennifer'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/24106?show=full',
  'title': 'David and Goliath Individualism and Liberty in the Italian Renaissance and the American Revolution'},
 

In [19]:
# Column names, in order; presumably the header row for the spreadsheet rows
# built elsewhere in this notebook (their keys are a subset of these names).
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python (e.g. 'advisor2' 'advisor3').
fieldnames = ['title', 'publication_date', 'season', 'document_type', 'date_avail',
              'work_type', 'degree_name', 'department', 'advisor1', 'advisor2',
              'advisor3', 'advisor4', 'keywords', 'disciplines', 'abstract',
              'comments', 'fulltext_url', 'author1_fname', 'author1_mname', 'author1_lname',
              'author1_suffix', 'author1_email', 'author1_institution']

In [25]:
# Build one row (a dict keyed by column name) per record in missing_info.
spreadsheet = []
for row in missing_info:
    # Author tokens are treated as [last, first, middle]. Missing parts fall
    # back to '' so a record with an empty or one-token author list no longer
    # raises IndexError.
    name_parts = row['author']
    ex_row = {}
    ex_row['title'] = row['title']
    ex_row['author1_fname'] = name_parts[1] if len(name_parts) > 1 else ''
    # NOTE(review): a middle name is only recorded for exactly three tokens;
    # longer token lists leave it blank - confirm that is intended.
    ex_row['author1_mname'] = name_parts[2] if len(name_parts) == 3 else ''
    ex_row['author1_lname'] = name_parts[0] if name_parts else ''
    ex_row['author1_institution'] = 'University of Washington Tacoma'
    # Placeholder value; the real e-mail address is not available from the scrape.
    ex_row['author1_email'] = 'SOMETHING'
    ex_row['fulltext_url'] = ''
    ex_row['degree_name'] = 'Master of Arts in Interdisciplinary Studies (MAIS)'
    spreadsheet.append(ex_row)

In [27]:
def get_excel_info(weblink, timeout=30):
    """Print the ``content`` attribute of every ``<meta>`` tag on a page.

    Parameters:
        weblink: URL of the page to download.
        timeout: seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        None; the values are written to standard output, one per line.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # Without a timeout requests waits indefinitely on an unresponsive server.
    page = requests.get(weblink, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    for item in soup.find_all('meta'):
        # Some <meta> tags (e.g. <meta charset="...">) carry no content
        # attribute; skip them instead of raising KeyError.
        content = item.get('content')
        if content is None:
            continue
        print(content)
# Try the scraper on a single record's full-metadata page (?show=full).
get_excel_info('https://digital.lib.washington.edu/researchworks/handle/1773/38049?show=full')
# TODO(review): the commented-out lines below mirror the tail of get_web_info;
# presumably a sketch of the value get_excel_info should eventually return.
# Finish or delete.
#     keys = ['title','link','author']
#     values = list(zip(alpha_titles,full_link_text,aulist_list))
#     web_info = [dict(zip(keys, v)) for v in values]
#     return web_info

text/html; charset=UTF-8
text/html; charset=UTF-8
IE=edge,chrome=1
width=device-width,initial-scale=1
DSpace 5.6
Jolly, Natalie
Chamberlain, Ed
Demmings, Naomi
2017-02-14T22:35:34Z
2016-12
Demmings_washington_0250O_16652.pdf
http://hdl.handle.net/1773/38049
Thesis (Master's)--University of Washington, 2016-12
The purpose of this research is to examine the development and progression of HIV/AIDS stigma within a social structure of power and powerlessness from the early 1980s to the 2010s, through a case study of selected visual images. I focus on the social aspect of how HIV/AIDS is given social stigmas that cause as much suffering as the disease’s physical health effects.  To do this, I apply Erving Goffman’s theory on stigma and analyzing visual images from the early 1980s, 1990s and early 2000s to consider how HIV/AIDS has been constructed and reinforced through time. In considering the historical context I show that each of these images responds to stigma as it existed in the early 