In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [None]:
#get author/artist page links from data and scrape each to get title links
authors = pd.read_csv('sisters_of_tomorrow_names.csv')
links = authors['link']
work_title_link = []

for author_url in links:
    # A timeout keeps one stalled connection from hanging the whole scrape.
    site = requests.get(author_url, timeout=30)
    # Name the parser explicitly so the result does not depend on which parsers
    # happen to be installed (and to avoid bs4's "no parser specified" warning).
    # lxml is already required by the 'xml' parsing later in this notebook.
    soup = BeautifulSoup(site.content, 'lxml')
    # href=True skips italic anchors without a target instead of raising KeyError.
    for a in soup.find_all('a', class_="italic", href=True):
        work_title_link.append(a['href'])

In [None]:
#transform title links to cover page links that include all documented image covers for each title
#I do this because each title has a list of publications (different editions, etc.) but not all publications have cover images. The titlecovers page lists all of the covers for a given title.
work_title_link = pd.read_csv('Title_links.csv')['Links']
coverpages = [
    link.replace("https://www.isfdb.org/cgi-bin/title.cgi?", "https://www.isfdb.org/cgi-bin/titlecovers.cgi?")
    for link in work_title_link
]

#get internal ids for each publication listed on the titlecover page.

# Only the first MAX_COVER_PAGES cover pages are scraped in this run.
# NOTE(review): looks like a batching/debug limit -- raise it to process every title.
MAX_COVER_PAGES = 200

publication = []
image_link = []  #also logging image link and title for debugging
work = []

for cover_url in coverpages[0:MAX_COVER_PAGES]:
    # A timeout keeps one stalled connection from hanging the whole scrape.
    site = requests.get(cover_url, timeout=30)
    # Explicit parser: avoids bs4's "no parser specified" warning and keeps the
    # parse independent of the environment (lxml is needed later for 'xml' anyway).
    soup = BeautifulSoup(site.content, 'lxml')

    # Keep only the ltr anchors that wrap an <img>; href=True / the src check skip
    # malformed entries instead of raising KeyError. Both lists are appended
    # together so they stay aligned index-for-index.
    for a in soup.find_all('a', dir="ltr", href=True):
        image = a.find('img')
        if image is not None and image.get('src'):
            image_link.append(image['src'])
            publication.append(a['href'])

    # At most one work link is logged per cover page, so `work` is NOT aligned
    # with `publication` / `image_link`.
    worklink = soup.find('a', class_="bold", href=True)
    if worklink is not None:
        work.append(worklink['href'])

covers = [{'image_link': image_link, 'publication_link': publication, 'work_link': work}]


In [None]:
#scrape all publications info by internal id for each publication
# The internal id is whatever follows the first "?" in a publication link.
# (Named publication_ids rather than `id` so the builtin id() is not shadowed.)
publication_ids = [link.split("?", 1)[-1] for link in publication]

# CSV column name -> name of the XML element its value is read from.
PUB_FIELDS = {
    "Record": "Record",
    "Title": "Title",
    "Cover Image Link": "Image",
    "Author": "Author",
    "Publisher": "Publisher",
    "Date": "Year",
    "Price": "Price",
    "Pages": "Pages",
    "Binding": "Binding",
    "Tag": "Tag",
    "Publication Type": "Type",
    "Note": "Note",
}


def _field_text(pub, element_name):
    """Return the text of the first `element_name` element inside `pub`.

    Returns None when `pub` itself is None (the response had no <Publication>
    element) or when the element is absent.
    """
    if pub is None:
        return None
    element = pub.find(element_name)
    return element.text if element is not None else None


#BS4 to parse the xml from ISFDB api links
pub_rows = []
for pub_id in publication_ids:
    # A timeout keeps one stalled connection from hanging the whole scrape.
    site = requests.get('https://www.isfdb.org/cgi-bin/rest/getpub_by_internal_ID.cgi?' + pub_id, timeout=30)
    soup = BeautifulSoup(site.content, 'xml')
    pub = soup.find('Publication')
    # One row per requested id, even when the lookup returned nothing, so the
    # output stays aligned with `publication`.
    pub_rows.append({column: _field_text(pub, element) for column, element in PUB_FIELDS.items()})

# columns= fixes the column order and keeps the header even when there are no rows.
All_SF = pd.DataFrame(pub_rows, columns=list(PUB_FIELDS))

All_SF.to_csv('pulp_publications1.csv', index=True, header=True)

In [None]:
#run google vit model for all the images
from transformers import pipeline
pipe = pipeline("image-classification", model="google/vit-base-patch16-224")

# The CSV was written with its index, so read the first column back as the index
# instead of letting it turn into a stray "Unnamed: 0" data column.
pulps = pd.read_csv('pulp_publications1.csv', index_col=0)
# .copy() makes the filtered frame independent, so the label columns added in
# later cells do not trigger SettingWithCopyWarning.
pulps = pulps[pulps['Binding'] == "pulp"].copy()#filter for just the pulp publications
images = pulps['Cover Image Link']

# One result list per row of `pulps`, in row order. A missing cover link comes
# back from the CSV as NaN rather than a string; such rows get an empty result
# so the list stays the same length as `pulps`.
labels_google = [pipe(url) if isinstance(url, str) else [] for url in images]


In [None]:
#run microsoft model for all the images
pipe = pipeline("image-classification", model="microsoft/resnet-50")

# One result list per entry of `images`, in order. Entries that are not strings
# (a missing cover link is read from CSV as NaN) get an empty result instead of
# being passed to the model, so the list stays aligned with `pulps`.
labels_microsoft = [pipe(url) if isinstance(url, str) else [] for url in images]

In [None]:
#run oschamp/vit-artworkclassifier model for all the images

pipe = pipeline("image-classification", model="oschamp/vit-artworkclassifier")

# One result list per entry of `images`, in order. Entries that are not strings
# (a missing cover link is read from CSV as NaN) get an empty result instead of
# being passed to the model, so the list stays aligned with `pulps`.
labels_oschamp = [pipe(url) if isinstance(url, str) else [] for url in images]

In [None]:
#append generated labels to the csv, keeping only the labels with a confidence score of at least 0.3

def extract_labels(data, min_score=0.3):
    """Collapse each image's classifier output into one comma-separated label string.

    Written as a function so it can be reused for each model's output.

    Parameters
    ----------
    data : iterable of lists of dicts
        One list per image; each dict has a 'label' and a 'score' key.
    min_score : float, optional
        Labels whose score is below this value are dropped (the comparison is
        inclusive: a score equal to `min_score` is kept). Defaults to 0.3.

    Returns
    -------
    list of str
        One string per image, labels joined with ', ' in their original order.
        An image with no label at or above the threshold yields ''.
    """
    return [
        ', '.join(item['label'] for item in output if item['score'] >= min_score)
        for output in data
    ]

#append them to the working dataframe
# Reduce each model's raw output to one label string per image ...
google_vit_tags, microsoft_tags, oschamp_tags = map(
    extract_labels, (labels_google, labels_microsoft, labels_oschamp)
)

# ... then store each result under its model's column name.
for column_name, tag_strings in zip(
    ('google_vit', 'microsoft', 'oschamp'),
    (google_vit_tags, microsoft_tags, oschamp_tags),
):
    pulps[column_name] = tag_strings


In [None]:
#load tagging results from csv
outputs = pd.read_csv("outputs.csv")

#extract the record id from image name
outputs['Record'] = outputs['image name'].str.split('.').str[0]


def _record_key(value):
    """Return a record id as a plain string, or None when it is missing.

    123, 123.0 and '123' all become '123', so ids read back from CSV as numbers
    compare equal to ids cut out of a file name.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


# Lookup table: record id -> tag list. If a record id occurs on several rows of
# `outputs`, the last row wins.
wd14_by_record = (
    outputs.dropna(subset=['Record'])
    .assign(record_key=lambda frame: frame['Record'].map(_record_key))
    .drop_duplicates('record_key', keep='last')
    .set_index('record_key')['taglist']
)

#add tags from the wd14 tagging results to pulps by matching record id
# A single vectorised lookup: it does not depend on pulps' index labels (pulps
# was row-filtered, so its labels are not 0..n-1) and records without a match
# get an empty string.
pulps['wd14'] = pulps['Record'].map(_record_key).map(wd14_by_record).fillna("")

In [None]:
# Save the tagged pulp publications, writing both the header row and the index column.
pulps.to_csv(path_or_buf='pulp_publications2.csv', header=True, index=True)