# Scrape DSpace
This file scrapes all theses added to DSpace between 2019 and 2022.

In [2]:
import os
import time
import urllib
import urllib.request
import xml.etree.ElementTree as ET

import pandas as pd
import requests

In [36]:
def getRecordData(record):
    """Extract the handle and the TEXT/ORIGINAL download URLs of one record.

    ``record[1][0]`` is used as the metadata root. Among its children, the one
    whose ``name`` attribute is ``"others"`` provides the handle, and the one
    named ``"bundles"`` provides at most one URL per ``TEXT`` / ``ORIGINAL``
    bundle.

    Returns:
        ``(identifier, urls)``: ``identifier`` is the text of the ``handle``
        field (``None`` if none was seen) and ``urls`` maps the bundle name to
        the text of the first ``url`` field found. When any positional lookup
        fails, ``"Index Error"`` is printed and ``(None, None)`` is returned.
    """
    wanted_bundles = {"TEXT", "ORIGINAL"}
    handle = None
    links = {}

    try:
        for section in record[1][0]:
            section_name = section.attrib['name']

            if section_name == "others":
                for entry in section:
                    if entry.attrib["name"] == "handle":
                        handle = entry.text

            elif section_name == "bundles":
                for bundle in section:
                    bundle_name = bundle[0].text
                    if bundle_name not in wanted_bundles:
                        continue

                    # First child of the bundle's second child
                    # (presumably the bundle's first bitstream).
                    first_entry = bundle[1][0]
                    for entry in first_entry:
                        if entry.attrib['name'] == "url":
                            links[bundle_name] = entry.text
                            break
    except IndexError:
        print("Index Error")
        return None, None

    return handle, links

def parseXML(xmlfile):
    """Parse a harvested XML file into a ``{identifier: urls}`` dictionary.

    Each child of the document root is handed to ``getRecordData``; records
    whose identifier is falsy are left out. The number of records kept is
    printed before the dictionary is returned.
    """
    document_root = ET.parse(xmlfile).getroot()

    records = {}
    for entry in document_root:
        handle, links = getRecordData(entry)
        if not handle:
            continue
        records[handle] = links

    print(len(records), " records")
    return records

# Build the identifier -> URLs mapping from the 2019 harvest file; the
# download loop further down iterates over this dictionary.
records = parseXML("dspace_theses_2019.xml")

Index Error
Index Error
Index Error
Index Error
45638  records


In [43]:
# Print the name of every file in ./full_text whose first 15 characters are
# an HTML doctype, i.e. files that start like an HTML page.
files = set(os.listdir("./full_text"))

for file in files:
    try:
        with open('./full_text/' + file) as f:
            if f.read(15) == "<!DOCTYPE html>": print(file)
    except (OSError, ValueError):
        # Only read/decode failures are reported (UnicodeDecodeError is a
        # ValueError); Ctrl-C is no longer swallowed by a bare except.
        print("ERROR ", file)
    

44870534-MIT.pdf.txt
ERROR  .DS_Store
25053516-MIT.pdf.txt
657327390-MIT.pdf.txt
939674230-MIT.pdf.txt
07346545-MIT.pdf.txt
429904157-MIT.pdf.txt


In [37]:
def download(url, output_folder):
    """Fetch ``url`` into ``output_folder`` unless that is unnecessary.

    The target file name is the last path segment of ``url``. Nothing is
    fetched when that file already exists in ``output_folder`` or when
    ``need_pdf(url)`` is false.

    Args:
        url: Address of the file to download.
        output_folder: Existing directory that receives the file.

    Returns:
        ``True`` when the server replied with HTTP 429, so the caller can back
        off and retry; ``False`` otherwise (skipped, failed or downloaded).
    """
    filename = url.split("/")[-1]
    target = os.path.join(output_folder, filename)

    # A direct existence check avoids listing the whole folder on every call.
    if os.path.exists(target) or not need_pdf(url):
        return False

    try:
        response = requests.get(url)
    except UnicodeDecodeError:
        # Let urllib resolve the final address first, then fetch that one
        # (presumably works around a redirect that requests cannot decode).
        with urllib.request.urlopen(url) as resolved:
            resolved_url = resolved.geturl()
        response = requests.get(resolved_url)

    if response.status_code == 429:
        return True

    if response.status_code != 200:
        # Do not store an error body under the target name: it would make
        # every later run treat the file as already downloaded.
        print(response.status_code)
        time.sleep(15)
        return False

    print("downloading", url)

    with open(target, "wb") as out:
        out.write(response.content)

    # Fixed pause between requests.
    time.sleep(15)

    return False

def need_pdf(url):
    """Tell whether the file behind ``url`` still has to be downloaded.

    The decision is based on ``./full_text/<name>.txt``, where ``<name>`` is
    the last path segment of ``url``.

    Returns:
        ``True`` when that text file starts with ``<!DOCTYPE html>`` or cannot
        be read at all (missing, unreadable, undecodable); ``False`` otherwise.
    """
    filename = url.split("/")[-1]
    try:
        with open('./full_text/' + filename + ".txt") as f:
            return f.read(15) == "<!DOCTYPE html>"
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: undecodable content
        # (UnicodeDecodeError) or an invalid path. Ctrl-C still propagates.
        return True
        

def delete(url, output_folder):
    """Remove the file named after ``url``'s last segment from ``output_folder``.

    Does nothing when the folder listing does not contain that name;
    otherwise prints ``deleting <path>`` and removes the file.
    """
    present = set(os.listdir(output_folder))
    filename = url.split("/")[-1]

    if filename in present:
        path = os.path.join(output_folder, filename)
        print("deleting", path)
        os.remove(path)
    
    

In [38]:
# Download the ORIGINAL bitstream of every parsed record into ./pdfs,
# backing off for a growing number of minutes while the server answers 429.
count = 0

for identifier, urls in records.items():
    try:
        if "ORIGINAL" in urls:
            timed_out = download(urls["ORIGINAL"], "pdfs")
        else:
            timed_out = False

        sleep = 1
        # timed_out can only be True when "ORIGINAL" is present in urls.
        while timed_out:
            print("timed out, sleeping", sleep, "minute(s)")
            time.sleep(60 * sleep)
            sleep += 1
            # Keep the retry's outcome: discarding it ended the loop after a
            # single retry and silently dropped records that hit 429 twice.
            timed_out = download(urls["ORIGINAL"], "pdfs")

        count += 1
    except KeyboardInterrupt:
        print('Interrupted')
        # Remove the current record's file (presumably because the
        # interrupted download may have left it incomplete).
        if "ORIGINAL" in urls: delete(urls["ORIGINAL"], "pdfs")

        break

downloading https://mit.atmire.com/bitstream/1721.1/59104/2/657327390-MIT.pdf
downloading https://mit.atmire.com/bitstream/1721.1/101501/1/939674230-MIT.pdf
downloading https://mit.atmire.com/bitstream/1721.1/65023/2/25053516-MIT.pdf
Interrupted
deleting pdfs/25053516-MIT.pdf


In [8]:
# Harvest one XML dump per year by shelling out to the `oai` command through
# pipenv; the output file is dspace_theses_<year>.xml.
for year in range(2022, 2023):
    command = "".join((
        "pipenv run oai -h https://dspace.mit.edu/oai/request -o dspace_theses_",
        str(year),
        ".xml harvest -f ",
        str(year),
        "-01-01 -u ",
        str(year),
        "-12-31 -m xoai -s com_1721.1_7582",
    ))
    os.system(command)
        
     
        

2023-01-10 10:30:31,608 INFO harvester.cli.main(): Logger 'root' configured with level=INFO
2023-01-10 10:30:31,608 INFO harvester.cli.main(): No Sentry DSN found, exceptions will not be sent to Sentry
2023-01-10 10:30:31,608 INFO harvester.cli.harvest(): OAI-PMH harvesting from source https://dspace.mit.edu/oai/request with parameters: method=list, metadata_format=xoai, from_date=2022-01-01, until_date=2022-12-31, set=com_1721.1_7582, exclude_deleted=False
2023-01-10 10:30:32,453 INFO harvester.cli.harvest(): Writing records to output file: dspace_theses_2022.xml
2023-01-10 10:30:39,987 INFO harvester.oai.write_records(): Status update: 1000 records written to output file so far!
2023-01-10 10:30:48,388 INFO harvester.oai.write_records(): Status update: 2000 records written to output file so far!
2023-01-10 10:30:56,411 INFO harvester.oai.write_records(): Status update: 3000 records written to output file so far!
2023-01-10 10:31:04,773 INFO harvester.oai.write_records(): Status updat

In [44]:


def get_element_fields(element, field_names):
    """Collect the text of ``element``'s children whose name is wanted.

    Returns a dict keyed by each child's ``name`` attribute, limited to the
    names contained in ``field_names``. If a name occurs more than once, the
    last occurrence wins.
    """
    return {
        child.attrib["name"]: child.text
        for child in element
        if child.attrib["name"] in field_names
    }

def get_sub_element(subelement):
    """Return the text of the first ``value`` field under ``subelement[0]``.

    Returns ``None`` when no child of ``subelement[0]`` is named ``value``.
    """
    value_texts = (
        child.text for child in subelement[0] if child.attrib["name"] == "value"
    )
    return next(value_texts, None)
# Flatten every record of the 2019-2022 harvest files into one dict per
# record; the dicts are collected in `rows`.
rows = []

# For these "dc" sub-elements, each listed qualifier is copied into the row
# under the qualifier's own name.
DC_QUALIFIERS = {
    "contributor": {"author", "department"},
    "date": {"issued"},
    "identifier": {"uri", "oclc"},
}

for year in range(2019, 2023):

    tree = ET.parse("dspace_theses_" + str(year) + ".xml")
    root = tree.getroot()
    for record in root:
        try:
            metadata = record[1][0]
        except IndexError:
            # Records without the expected metadata nesting are printed and
            # skipped instead of aborting the whole run.
            print(record)
            continue

        row = {}

        for element in metadata:
            ename = element.attrib['name']

            if ename == "others":
                row |= get_element_fields(element, {"handle", "identifier"})
            elif ename == "bundles":
                for bundle in element:
                    name = bundle[0].text

                    if name == "THUMBNAIL":
                        row["image_url"] = get_element_fields(bundle[1][0], {"url"})["url"]
                    if name == "ORIGINAL":
                        # Name of the matching text file: last URL segment + ".txt"
                        row["filename"] = get_element_fields(bundle[1][0], {"url"})["url"].split("/")[-1] + ".txt"
            elif ename == "dc":
                for subelement in element:
                    sname = subelement.attrib['name']
                    if sname in DC_QUALIFIERS:
                        wanted = DC_QUALIFIERS[sname]
                        for sselement in subelement:
                            qualifier = sselement.attrib['name']
                            if qualifier in wanted:
                                row[qualifier] = get_sub_element(sselement)
                    elif sname == "description":
                        for sselement in subelement:
                            if sselement.attrib['name'] == "abstract":
                                # Join all non-empty "value" fields of the
                                # abstract's first child, one per line.
                                row["abstract"] = "\n".join(
                                    field.text
                                    for field in sselement[0]
                                    if field.attrib["name"] == "value" and field.text
                                )
                    elif sname == "subject": row["subject"] = get_sub_element(subelement)
                    elif sname == "title": row["title"] = get_sub_element(subelement)

        rows.append(row)

<Element '{http://www.openarchives.org/OAI/2.0/}record' at 0x1ed99d170>
<Element '{http://www.openarchives.org/OAI/2.0/}record' at 0x1342cc810>
<Element '{http://www.openarchives.org/OAI/2.0/}record' at 0x23a889c10>
<Element '{http://www.openarchives.org/OAI/2.0/}record' at 0x22da180e0>


In [46]:
# Turn the list of per-record dicts into a table: one row per dict, one
# column per distinct key.
df = pd.DataFrame(rows)
print(df.shape)
# Last expression of the cell: the notebook displays the first rows.
df.head()

(57777, 12)


Unnamed: 0,author,department,issued,uri,oclc,abstract,subject,title,image_url,filename,handle,identifier
0,"Vázquez, Maribel, 1971-",Massachusetts Institute of Technology. Dept. o...,2001,http://hdl.handle.net/1721.1/8302,50444365,Electrophoresis of DNA has become particularly...,Mechanical Engineering.,A study of loading parameters that affect DNA ...,https://mit.atmire.com/bitstream/1721.1/8302/5...,50444365-MIT.pdf.txt,1721.1/8302,oai:mit.atmire.com:1721.1/8302
1,"Dai, Siyu, S.M. Massachusetts Institute of Tec...",Massachusetts Institute of Technology. Departm...,2018,http://hdl.handle.net/1721.1/120230,1083120469,"For high-dimensional robots, motion planning i...",Mechanical Engineering.,Probabilistic motion planning and optimization...,https://mit.atmire.com/bitstream/1721.1/120230...,1083120469-MIT.pdf.txt,1721.1/120230,oai:mit.atmire.com:1721.1/120230
2,"Bury, Mark Eric",Massachusetts Institute of Technology. Dept. o...,1997,http://hdl.handle.net/1721.1/50310,37896579,,Aeronautics and Astronautics,Influence of Reynolds number and blade geometr...,https://mit.atmire.com/bitstream/1721.1/50310/...,37896579-MIT.pdf.txt,1721.1/50310,oai:mit.atmire.com:1721.1/50310
3,"Meng, Xianglin, S.M. Massachusetts Institute o...",Massachusetts Institute of Technology. Departm...,2018,http://hdl.handle.net/1721.1/117814,1051460349,Our goal is to understand the functioning of t...,Electrical Engineering and Computer Science.,Systemic risk in the interbank lending market,https://mit.atmire.com/bitstream/1721.1/117814...,1051460349-MIT.pdf.txt,1721.1/117814,oai:mit.atmire.com:1721.1/117814
4,"Verma, Malvika.",Massachusetts Institute of Technology. Departm...,2019,https://hdl.handle.net/1721.1/123066,1127292014,Lack of medication adherence is a worldwide pr...,Biological Engineering.,Gastric resident systems for large dose drug d...,https://dspace.mit.edu/bitstream/1721.1/123066...,1127292014-MIT.pdf.txt,1721.1/123066,oai:dspace.mit.edu:1721.1/123066


In [47]:
# Write the table to metadata.csv. With pandas' default index=True the row
# index is written as an unnamed first column.
df.to_csv("metadata.csv")