In [1]:
import requests
from bs4 import BeautifulSoup
import re
import csv

In [2]:
# https://www.dataquest.io/blog/web-scraping-tutorial-python/
def get_titles(weblink, timeout=30):
    """Return the link text of every record title on a browse page.

    Args:
        weblink: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled connection.

    Returns:
        A list with the text of each element matched by the CSS selector
        ``h4.artifact-title a``, in document order.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    page = requests.get(weblink, timeout=timeout)
    # Fail loudly on an error page instead of quietly returning an empty list.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    return [anchor.get_text() for anchor in soup.select('h4.artifact-title a')]

# Scrape two consecutive browse pages (offset=0 and offset=20 in the query
# string) and concatenate their titles into a single list.
p1titles, p2titles = [
    get_titles(page_url)
    for page_url in (
        "https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=0&etal=-1&order=ASC",
        "https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=20&etal=-1&order=ASC",
    )
]

web_titles = [*p1titles, *p2titles]


In [3]:
def get_alpha_titles(titleslist):
    """Strip punctuation from each title so titles can be compared loosely.

    Every run of word characters and apostrophes is kept; everything else
    (colons, quotes, commas, hyphens, extra whitespace, ...) is dropped and the
    surviving runs are re-joined with single spaces.

    Args:
        titleslist: iterable of title strings.

    Returns:
        A new list of normalised strings, one per input title, in input order.
    """
    word_pattern = re.compile(r"[\w']+")
    return [' '.join(word_pattern.findall(title)) for title in titleslist]

In [62]:
def load_csv_titles(filename):
    """Read the ``title`` column of a CSV file.

    The file is opened with ``newline=''`` as the ``csv`` module requires, so
    line breaks inside quoted fields reach the parser unchanged. It is decoded
    as ``utf-8-sig`` so that a leading byte-order mark (if any) is discarded
    rather than glued onto the first header name; files without a BOM decode
    exactly as plain UTF-8.

    Args:
        filename: path of a CSV file whose header row contains ``title``.

    Returns:
        A list with the ``title`` value of every data row, in file order.

    Raises:
        KeyError: if the header row has no ``title`` column.
    """
    with open(filename, 'r', newline='', encoding='utf-8-sig') as file:
        reader = csv.DictReader(file)
        return [row['title'] for row in reader]

In [64]:
# Titles that are already recorded in the local CSV.
csv_titles = load_csv_titles('./data/mais.csv')
print(csv_titles)

# Punctuation-free versions of the scraped titles and of the CSV titles.
alpha_wt, alpha_ct = [
    get_alpha_titles(titles) for titles in (web_titles, csv_titles)
]

# Scraped titles with no counterpart in the CSV: once compared on the
# normalised text (missing_titles) and once on the raw text (missing_titles2).
missing_titles = [title for title in alpha_wt if title not in alpha_ct]
missing_titles2 = [title for title in web_titles if title not in csv_titles]
missing_titles2

['Experiences of Lesbian-Parented Children: A Journey of Discovery', 'The War With No End: Sentencing Disparities in the War on Drugs" and National Trends that are Defining a Nation""he War With No End: Sentencing Disparities in the "War on Dr""e War Wit"', "Girls, STEM, and Children's Books: A Review of the Literature Concerning Girls' Interest, Motivation and Ability in STEM, Complemented by a Mixed Methods Content Analysis of Award Winning Informational Children's Books", "Roads Less Traveled: Access, Automobiles, and Recreation in Mount Rainier National Park's Wilderness Areas", 'Double Coded Feminist TV- Overlooked Contradictions within Buffy the Vampire Slayer', 'Paradigms of American Identity: and the Struggle for a More Authentic Self', 'Assessment and Monitoring of Water Quality in Lake Waughop as a Service-Learning Project: A Case Study Approach', 'Assuming Rape: The Reproduction of Fear in American Military Female POWs', "Growing a greener Tacoma: The historical roots of Tac

["Modernizing the Greek Tragedy: Clint Eastwood's Impact on the Western",
 'The War With No End: Sentencing Disparities in the "War on Drugs" and National Trends that are Defining a Nation',
 'Building Sustainable Behavior through Social Marketing: Encouraging Reusable Shopping Bag Use at Stadium Thriftway in Tacoma, WA - A Case Study',
 'David and Goliath: Individualism and Liberty in the Italian Renaissance and the American Revolution',
 'Master of Interdisaplinary Studies',
 'Sexual Violence in a Native American Community: Native American Women Speak Out',
 '"These are the Ghettos of Washington": Public Housing and Neoliberalization in Tacoma, WA',
 'Critical reflections on and in "the field": The Study of `Religion\' and the methodology of true (reflexive) praxis in Puerto Rico',
 'Digital Activism: How Social Media Prevalence has Impacted Modern Activism',
 'HIVAIDS Social Stigma and Visual Art']

In [54]:
# Show the punctuation-normalised scraped titles that were not found in the CSV.
missing_titles

["Modernizing the Greek Tragedy Clint Eastwood's Impact on the Western",
 'The War With No End Sentencing Disparities in the War on Drugs and National Trends that are Defining a Nation',
 'Building Sustainable Behavior through Social Marketing Encouraging Reusable Shopping Bag Use at Stadium Thriftway in Tacoma WA A Case Study',
 'David and Goliath Individualism and Liberty in the Italian Renaissance and the American Revolution',
 'Master of Interdisaplinary Studies',
 'Sexual Violence in a Native American Community Native American Women Speak Out',
 'These are the Ghettos of Washington Public Housing and Neoliberalization in Tacoma WA',
 "Critical reflections on and in the field The Study of Religion' and the methodology of true reflexive praxis in Puerto Rico",
 'Digital Activism How Social Media Prevalence has Impacted Modern Activism',
 'HIVAIDS Social Stigma and Visual Art']

In [50]:
def get_web_info(weblink, timeout=30):
    """Scrape title, record link and author tokens for each item on a browse page.

    Args:
        weblink: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled connection.

    Returns:
        A list of dicts, one per record, with the keys
        ``alphatitle`` (title passed through ``get_alpha_titles``),
        ``title`` (link text as shown on the page),
        ``link`` (site root + href + ``?show=full``) and
        ``author`` (list of letter-only tokens taken from the matching
        ``div.artifact-info`` text).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    page = requests.get(weblink, timeout=timeout)
    # Fail loudly on an error page instead of quietly returning an empty list.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Take titles and links from the SAME anchors so the two lists cannot fall
    # out of step when an anchor has no href attribute.
    title_anchors = soup.select('h4.artifact-title a[href]')
    title_names = [anchor.get_text() for anchor in title_anchors]
    alpha_titles = get_alpha_titles(title_names)
    full_link_text = [
        'https://digital.lib.washington.edu' + anchor['href'] + '?show=full'
        for anchor in title_anchors
    ]

    au_text = [block.get_text() for block in soup.select('div.artifact-info')]
    alphanum_au = get_alpha_titles(au_text)
    # NOTE(review): this keeps ASCII letters only, so digits are dropped but
    # accented or apostrophised names are split into fragments -- confirm that
    # is acceptable for the names on this site.
    aulist_list = [re.findall('[%A-Za-z]+', text) for text in alphanum_au]

    # NOTE(review): pairing by position assumes exactly one div.artifact-info
    # per title link; zip() silently stops at the shortest list.
    keys = ['alphatitle', 'title', 'link', 'author']
    records = zip(alpha_titles, title_names, full_link_text, aulist_list)
    return [dict(zip(keys, record)) for record in records]

In [51]:
# Collect the per-record details from the same two browse pages (offset=0 and
# offset=20 in the query string) and merge them into one list.
p1, p2 = [
    get_web_info(page_url)
    for page_url in (
        "https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=0&etal=-1&order=ASC",
        "https://digital.lib.washington.edu/researchworks/handle/1773/20063/browse?rpp=20&sort_by=2&type=dateissued&offset=20&etal=-1&order=ASC",
    )
]
web_info = [*p1, *p2]
# print(web_info)

In [52]:
# Keep only the scraped records whose normalised title is absent from the CSV.
missing_info = [
    record for record in web_info
    if record['alphatitle'] in missing_titles
]
missing_info

[{'alphatitle': "Modernizing the Greek Tragedy Clint Eastwood's Impact on the Western",
  'author': ['Williams', 'Jacob', 'A'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/20881?show=full',
  'title': "Modernizing the Greek Tragedy: Clint Eastwood's Impact on the Western"},
 {'alphatitle': 'The War With No End Sentencing Disparities in the War on Drugs and National Trends that are Defining a Nation',
  'author': ['Campbell', 'Crystal'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/23386?show=full',
  'title': 'The War With No End: Sentencing Disparities in the "War on Drugs" and National Trends that are Defining a Nation'},
 {'alphatitle': 'Building Sustainable Behavior through Social Marketing Encouraging Reusable Shopping Bag Use at Stadium Thriftway in Tacoma WA A Case Study',
  'author': ['Laakso', 'Alysen', 'Kristen'],
  'link': 'https://digital.lib.washington.edu/researchworks/handle/1773/24105?show=full',
  'title': 'Buildin

In [17]:
def get_dc_info(weblink, timeout=30):
    """Scrape the label/value metadata table of a full item record page.

    Args:
        weblink: URL of the record page (the ``?show=full`` view).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled connection.

    Returns:
        A dict keyed by the text of each ``td.label-cell``. A label that occurs
        once maps to a single string; a label that occurs several times maps
        to a list of strings in page order. Callers must handle both shapes.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
        ValueError: if the page does not contain the expected metadata table.
    """
    page = requests.get(weblink, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were a record.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find("table", attrs={"class":"ds-includeSet-table detailtable table table-striped table-hover"})
    if table is None:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError('No metadata table found at ' + weblink)

    table_heads = [cell.get_text() for cell in table.find_all("td", attrs={'class':'label-cell'})]
    table_info = [cell.get_text() for cell in table.find_all("td", attrs={'class':'word-break'})]

    # dict(zip(...)) would keep only the last value of a repeated label, so
    # gather every value of a label into a list first.
    grouped = {}
    for label, value in zip(table_heads, table_info):
        grouped.setdefault(label, []).append(value)

    # Then collapse one-element lists to the bare string.
    return {
        label: (values[0] if len(values) == 1 else values)
        for label, values in grouped.items()
    }
#get_dc_info('https://digital.lib.washington.edu/researchworks/handle/1773/38049?show=full')

{'dc.contributor.advisor': ['Jolly, Natalie', 'Chamberlain, Ed'],
 'dc.contributor.author': 'Demmings, Naomi',
 'dc.date.accessioned': '2017-02-14T22:35:34Z',
 'dc.date.submitted': '2016-12',
 'dc.description': "Thesis (Master's)--University of Washington, 2016-12",
 'dc.description.abstract': 'The purpose of this research is to examine the development and progression of HIV/AIDS stigma within a social structure of power and powerlessness from the early 1980s to the 2010s, through a case study of selected visual images. I focus on the social aspect of how HIV/AIDS is given social stigmas that cause as much suffering as the disease’s physical health effects.  To do this, I apply Erving Goffman’s theory on stigma and analyzing visual images from the early 1980s, 1990s and early 2000s to consider how HIV/AIDS has been constructed and reinforced through time. In considering the historical context I show that each of these images responds to stigma as it existed in the early 1980s but also 

In [47]:
# get_dc_info() returns a bare string for a label that occurs once and a list
# for a label that repeats; a label may also be missing altogether. The helpers
# below let the row-building loop treat all three cases uniformly.

def _as_list(value):
    """Return a metadata value (None, single string, or list) as a list."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


def _joined(value, separator='; '):
    """Return a metadata value as one string; missing values become ''."""
    return separator.join(_as_list(value))


def _display_name(catalog_name):
    """Turn 'Last, First' into 'First Last'; names without ', ' pass through."""
    parts = catalog_name.split(', ')
    if len(parts) < 2:
        return catalog_name
    return parts[1] + ' ' + parts[0]


def _document_type(embargo_terms):
    """Map the embargo-terms text to the spreadsheet's document_type code."""
    for phrase, code in (('1 year', 'restrict_1yr'),
                         ('2 years', 'restrict_2yr'),
                         ('5 years', 'restrict_5yr')):
        if phrase in embargo_terms:
            return code
    return 'open_access'


def _disciplines(subject_other):
    """Join the 'other' subjects, leaving out the program's own name."""
    program = 'interdisciplinary arts and sciences - tacoma'
    # Case-insensitive so both spellings seen in the records are dropped.
    kept = [s for s in _as_list(subject_other) if s.lower() != program]
    return '; '.join(kept)


spreadsheet = []
for row in missing_info:
    full_info = get_dc_info(row['link'])
    # Author tokens are ordered [last, first, (middle)]; guard against short lists.
    author_parts = row['author']
    ex_row = {}

    ex_row['title'] = row['title']
    ex_row['author1_fname'] = author_parts[1] if len(author_parts) > 1 else ''
    ex_row['author1_mname'] = author_parts[2] if len(author_parts) == 3 else ''
    ex_row['author1_lname'] = author_parts[0] if author_parts else ''
    ex_row['author1_institution'] = 'University of Washington Tacoma'
    ex_row['author1_email'] = ''
    ex_row['fulltext_url'] = ''
    ex_row['author1_suffix'] = ''
    ex_row['season'] = ''
    ex_row['comments'] = ''
    ex_row['degree_name'] = 'Master of Arts in Interdisciplinary Studies (MAIS)'
    ex_row['abstract'] = _joined(full_info.get('dc.description.abstract'), '\n\n')
    ex_row['publication_date'] = _joined(full_info.get('dc.date.accessioned'))
    ex_row['department'] = 'Interdisciplinary Arts and Sciences'

    # Records without embargo terms are treated as open access.
    embargo_terms = _joined(full_info.get('dc.embargo.terms'), ' ')
    ex_row['document_type'] = _document_type(embargo_terms)

    if ex_row['document_type'] != 'open_access':
        ex_row['date_avail'] = _joined(full_info.get('dc.embargo.lift'))
    else:
        ex_row['date_avail'] = ''

    # Fill advisor1..advisor4; unused slots stay empty and any advisors beyond
    # the fourth are dropped because the sheet has only four columns.
    advisors = [
        _display_name(name)
        for name in _as_list(full_info.get('dc.contributor.advisor'))
    ]
    for slot in range(4):
        ex_row['advisor%d' % (slot + 1)] = advisors[slot] if slot < len(advisors) else ''

    if full_info.get('dc.type') == 'Thesis':
        ex_row['work_type'] = 'Masters Thesis'
    else:
        ex_row['work_type'] = 'Masters Capstone Project'

    # A single subject string may itself be ';'-separated; a list is joined.
    subject = full_info.get('dc.subject', '')
    if isinstance(subject, str):
        ex_row['keywords'] = subject.replace(';',',')
    else:
        ex_row['keywords'] = ', '.join(subject)

    # Handles the single-string case too, which previously either crashed on
    # .remove() or was joined character by character.
    ex_row['disciplines'] = _disciplines(full_info.get('dc.subject.other'))

    spreadsheet.append(ex_row)

Blood Justice, Eastwood, Theseus
African-American, Discrimination, Incarceration, Offender Rights, War on Drugs, Washington State
community-based social marketing, LCA of shopping bags, reusable bag, shopping bag, shopping bag legislation, strong sustainability
American Revolution, Individualism, Italian Renaissance, Liberty, Philosophy, Religion
third parties
colonialism, feminism, interviews, Native American women, sexual violence, violent victimization
Discourse, Diversity, Mixed-income development, Neoliberalism, Place, Public housing reform
Methodology, Methods, praxis, Puerto Rico, Qualitative research, reflexivity


In [48]:
# Inspect the assembled export rows (one dict per missing record).
spreadsheet

[{'abstract': "A thesis on Clint Eastwood's impact on the Western Film Genre. It discusses how Eastwood's acting and directing relate to classical Greek literature. It compares and contrasts Eastwood's Western hero to classical Greek heores, in particular the Athenian Theseus. And it argues that the Western is not dead, but that it is immortal.",
  'advisor1': 'Claudia Gorbman',
  'advisor2': '',
  'advisor3': '',
  'advisor4': '',
  'author1_email': '',
  'author1_fname': 'Jacob',
  'author1_institution': 'University of Washington Tacoma',
  'author1_lname': 'Williams',
  'author1_mname': 'A',
  'author1_suffix': '',
  'comments': '',
  'date_avail': '',
  'degree_name': 'Master of Arts in Interdisciplinary Studies (MAIS)',
  'department': 'Interdisciplinary Arts and Sciences',
  'disciplines': 'Film studies; Classical studies',
  'document_type': 'open_access',
  'fulltext_url': '',
  'keywords': 'Blood Justice, Eastwood, Theseus',
  'publication_date': '2012-09-13T17:39:15Z',
  'sea

In [49]:
def write(data, output):
    """Write row dicts to a CSV file with a fixed column order.

    The file is encoded as UTF-8 with a byte-order mark (``utf-8-sig``) and
    starts with a header row. Keys missing from a row are written as empty
    cells.

    Args:
        data: iterable of dicts keyed by the column names listed below.
        output: path of the CSV file to create or overwrite.
    """
    columns = [
        'title', 'publication_date', 'season', 'document_type', 'date_avail',
        'work_type', 'degree_name', 'department',
        'advisor1', 'advisor2', 'advisor3', 'advisor4',
        'keywords', 'disciplines', 'abstract', 'comments', 'fulltext_url',
        'author1_fname', 'author1_mname', 'author1_lname',
        'author1_suffix', 'author1_email', 'author1_institution',
    ]
    with open(output, 'w', newline='', encoding='utf-8-sig') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for record in data:
            writer.writerow(record)

# Export the assembled rows to the update CSV next to the source data.
write(spreadsheet, './data/mais_update.csv')