In [None]:
import shutil
from lxml import etree
from glob import glob
import os
import logging
import json
import pandas as pd
import re
from datetime import datetime

# For scraping
import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from datetime import datetime

## Initial Conf Details

In [None]:
# Basic setup
# Timestamp used to give every run its own log file.
now = datetime.now()
dt_string = now.strftime("%Y%m%d%H%M%S")

JRN_CODE = 'afar'                  # journal code used in URLs and file names
LICENCE_FOLDER = 'license_info'    # folder the scraped access CSV is written to / read from
TEST_FOLDER = 'test_input'         # root folder walked by update_files()
ERROR_FOLDER = 'logs'              # folder the log files are written to
WD_PATH = 'venv/bin/chromedriver'  # path handed to webdriver.Chrome

# Create the output folders up front: logging.basicConfig() and the later
# to_csv() call both fail if their target folder does not exist yet.
os.makedirs(ERROR_FOLDER, exist_ok=True)
os.makedirs(LICENCE_FOLDER, exist_ok=True)

logging.basicConfig(
    filename='{}/log_file_{}_{}.log'.format(ERROR_FOLDER, JRN_CODE, dt_string),
    filemode='w',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logging.info('running')


# Code for Scraping the Site ####
## (Skip this section if you already have the CSV files) 

In [None]:

# Start the Chrome session shared by the scraping cells below.
# WD_PATH (set in the config cell) is the chromedriver location passed to selenium.
driver = webdriver.Chrome(WD_PATH)

In [None]:
# Load the journal's list-of-issues page in the browser and parse the rendered HTML.
loi_url = 'https://www.mitpressjournals.org/loi/{}'.format(JRN_CODE)
driver.get(loi_url)
content = driver.page_source
# Name the parser explicitly (as the cover-download code in this notebook does):
# without it BeautifulSoup warns and picks whichever parser happens to be installed.
soup = BeautifulSoup(content, features='lxml')


In [None]:
# Every issue on the list-of-issues page sits in a <div class="js_issue">;
# its direct <a> child carries the site-relative link to the issue TOC.
loi_content = soup.find_all('div', {'class':'js_issue'})
urls = [
    'https://www.mitpressjournals.org{}'.format(issue_div.find('a', recursive=False)['href'])
    for issue_div in loi_content
]

In [None]:
def scrape_toc():
    """Visit every issue URL in ``urls`` and collect DOI / access data.

    Each page is loaded through the shared selenium ``driver`` and handed to
    ``parse_html``, which appends its findings to the module-level
    ``doi_list`` and ``access_list``.

    Returns:
        tuple: ``(doi_list, access_list)`` -- the module-level accumulators.
        They are returned unchanged when ``urls`` is empty or when a page
        could not be parsed.
    """
    for url in urls:
        driver.get(url)
        # Explicit parser, matching the cover-download code in this notebook.
        soup = BeautifulSoup(driver.page_source, features='lxml')
        # parse_html fills the module-level lists in place, so its return value
        # is deliberately not unpacked here: unpacking made the names local to
        # this function (UnboundLocalError with no URLs) and broke on a None return.
        parse_html(soup, url)
    return doi_list, access_list

In [None]:
# Module-level accumulators that parse_html() appends to while the issue pages are scraped.
# Entries at the same index of the two lists are zipped into one DataFrame row later on.
access_list = []
doi_list = []

In [None]:
master_list = []


def parse_html(soup, url):
    """Extract the DOI and access label of every article on one issue TOC page.

    Results are appended to the module-level ``doi_list`` and ``access_list``.
    An article is recorded only when both values were found, so the two lists
    stay index-aligned.

    Args:
        soup: parsed TOC page (BeautifulSoup object).
        url: address of the page; used in log messages only.

    Returns:
        tuple: ``(doi_list, access_list)`` -- always, including when the page
        has no TOC block, so callers can rely on the shape of the result.
    """
    # We want to grab the tocContent element
    toc_content = soup.find("div", {"class": "tocContent"})
    if toc_content is None:
        logging.warning('No tocContent block found on {}'.format(url))
        return doi_list, access_list

    for article in toc_content.find_all("table", {"class": "articleEntry"}, recursive=False):
        try:
            # Read both values before appending either one.
            access = article.find("img", {"class": "accessIcon"}, recursive=True)['title']
            doi = article.find("input", {"class": "tocToolCheckBox"}, recursive=True)['value']
        except (TypeError, KeyError):
            # TypeError: find() returned None; KeyError: the attribute is missing.
            logging.warning('Skipping article entry without access icon or DOI checkbox on {}'.format(url))
            continue
        access_list.append(access)
        doi_list.append(doi)
    return doi_list, access_list


In [None]:
# Run the scrape over every issue URL, then pair each DOI with its access label.
# zip() stops at the shorter list, so rows only exist where both values are present.
doi_list, access_list = scrape_toc()
zipped = list(zip(doi_list, access_list))
df = pd.DataFrame(zipped,  columns=['DOI', 'ACCESS'])



In [None]:
# Scraping is finished: shut the browser session down.
driver.quit()

In [None]:
# Now that we've scraped all the file info,
# we generate a DataFrame that holds all the access data and add a new column for normalized article IDs.

In [None]:
def normalize_doi(row):
    """Return the second '/'-separated segment of ``row['DOI']``, upper-cased."""
    doi_parts = row['DOI'].split('/')
    return doi_parts[1].upper()

# Add the normalized article ID for every row. The column name (spelled 'NORMLIZED_DOI')
# is the one update_files() looks up, so it must stay exactly as written.
df['NORMLIZED_DOI'] = df.apply(normalize_doi, axis=1)

# Save everything to a file, just in case
df.to_csv('{}/access_list_{}.csv'.format(LICENCE_FOLDER, JRN_CODE), index=None)


# Code for Checking and Updating the XML files ####

In [None]:
# Reload the access list from the CSV written by the scraping section,
# so this part of the notebook can run without scraping again.
license_df = pd.read_csv('{}/access_list_{}.csv'.format(LICENCE_FOLDER, JRN_CODE))
print('loaded: {}'.format(license_df))

In [None]:
# Split the access list by access type: one frame for Open Access articles and
# one for Free Access articles. Gated articles need no changes, so they are left out.
oa_articles = license_df.loc[license_df['ACCESS'].eq('Open Access')]
free_articles = license_df.loc[license_df['ACCESS'].eq('Free Access')]


In [None]:
# Eyeball both filtered frames before any XML is touched.
print(oa_articles)
print(free_articles)

In [None]:
# In order for lxml to process the files we need to provide the below info
# Prefix -> namespace URI; the 'xlink' entry is used to build '{uri}href' attribute names.
nsmap = {
'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
'xlink': 'http://www.w3.org/1999/xlink',
}
# Qualified name of xsi:schemaLocation. NOTE(review): not referenced by the cells visible here.
attr_qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation")

# Shared parser for every etree.parse() call below; recover=True asks lxml to
# keep parsing through malformed XML instead of raising.
parser = etree.XMLParser(recover=True)

In [None]:
# First we look for the issue_meta file and update the TOC as needed
# Next we look for OA and Free articles and add the license info as needed

def update_files():
    """Walk TEST_FOLDER and update issue and article XML files.

    * ``issue_meta`` files: the DOIs and TOC headings for the issue are
      looked up with ``get_doi_list()`` and passed to ``convert_issue_xml()``.
    * Article files (name ends in ``xml`` and does not contain ``issue``):
      articles are named after their DOI, so the upper-cased file stem is
      matched against the Open Access / Free Access lists and the matching
      license updater is called.

    Relies on the module-level ``oa_articles`` and ``free_articles`` frames.
    """
    # The access lists do not change during the walk, so build the lookup sets
    # once instead of re-reading the column for every file.
    oa_ids = set(oa_articles['NORMLIZED_DOI'])
    free_ids = set(free_articles['NORMLIZED_DOI'])

    for root, _dirs, files in os.walk(TEST_FOLDER):
        for f in files:
            if 'issue_meta' in f:
                # Issue file: collect the DOIs in this issue plus the TOC headings
                # (scraped from the website and stored in a CSV file), then
                # rebuild the TOC block.
                path_to_issue_file = '{}/{}'.format(root, f)
                issue_folder_name = root.split('/')[1]
                df_filter, unique_headings = get_doi_list('{}/{}/XML'.format(TEST_FOLDER, issue_folder_name))
                # NOTE: the converted tree is not written back to disk here; to
                # persist it, serialize the returned root, e.g.
                #     with open(path_to_issue_file, 'wb') as doc:
                #         doc.write(etree.tostring(converted, pretty_print=True))
                convert_issue_xml(path_to_issue_file, df_filter, unique_headings, issue_folder_name)
            elif f.endswith('xml') and 'issue' not in f:
                # Article file: add the license block if the article is OA or free.
                article_path = '{}/{}'.format(root, f)
                article_name = f.split('.xml')[0].upper()
                if article_name in oa_ids:
                    update_license_oa(article_path)
                    print(article_path)
                if article_name in free_ids:
                    update_license_free(article_path)

In [None]:
def update_license_oa(article_path):
    """Make sure an Open Access article carries an 'open-access' license block.

    If the article already has a ``<license>`` element, its ``license-type``
    is only checked and logged (a wrong type is left for manual review) and
    the file is not modified. Otherwise a CC BY 4.0 license is appended to
    the ``<permissions>`` element and the file is rewritten in place.

    Args:
        article_path: path of the article XML file.
    """
    jats_ns = 'http://specifications.silverchair.com/xsd/1/22/SCJATS-journalpublishing.xsd'
    xlink_href = '{%s}href' % nsmap['xlink']
    legalcode_url = 'https://creativecommons.org/licenses/by/4.0/legalcode'

    tree = etree.parse(article_path, parser=parser)

    # Look for an existing license element. If there is one it SHOULD already be
    # correct, but check anyway and log anything other than 'open-access'.
    license_tag = tree.find('.//{%s}license' % jats_ns)
    if license_tag is not None:
        license_type = license_tag.get('license-type')
        if license_type == 'open-access':
            logging.info('File: {}. license found with correct license-type.'.format(article_path))
        else:
            logging.info('File: {}. license found with incorrect license-type. Expectng \'open-access\' found \'{}\''.format(article_path, license_type))
        return

    # We now have an OA file without a license in the XML, so we add one as a
    # child of the permissions element.
    permissions_tag = tree.find('.//{%s}permissions' % jats_ns)
    if permissions_tag is None:
        # Nothing to attach the license to: log and skip, so one odd file does
        # not abort the whole batch.
        logging.error('File: {}. No permissions element found; OA license not added.'.format(article_path))
        return

    new_license_tag = etree.SubElement(permissions_tag, 'license')
    new_license_tag.attrib['license-type'] = 'open-access'
    new_license_tag.attrib[xlink_href] = 'https://creativecommons.org/licenses/by/4.0/'

    new_license_p = etree.SubElement(new_license_tag, 'license-p')
    new_license_p.text = 'This is an open-access article distributed under the terms of the Creative Commons Attribution 4.0 International License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. For a full description of the license, please visit '
    # Build the link as a real element (text before it, '.' as its tail) rather
    # than writing escaped markup and patching the saved file afterwards.
    ext_link = etree.SubElement(new_license_p, 'ext-link')
    ext_link.attrib['ext-link-type'] = 'uri'
    ext_link.attrib[xlink_href] = legalcode_url
    ext_link.text = legalcode_url
    ext_link.tail = '.'

    logging.info('File: {}. Updating file with OA license.'.format(article_path))

    with open(article_path, 'wb') as doc:
        doc.write(etree.tostring(tree, pretty_print=True))

                   

In [None]:
def update_license_free(article_path):
    """Make sure a Free Access article carries a ``license-type="free"`` block.

    If the article already has a ``<license>`` element, nothing is changed
    and the file is logged for manual review. Otherwise an empty
    ``<license license-type="free">`` is appended to the ``<permissions>``
    element and the file is rewritten in place.

    Args:
        article_path: path of the article XML file.
    """
    jats_ns = 'http://specifications.silverchair.com/xsd/1/22/SCJATS-journalpublishing.xsd'

    tree = etree.parse(article_path, parser=parser)

    license_tag = tree.find('.//{%s}license' % jats_ns)
    if license_tag is not None:
        logging.info('File: {}. No update to free article. license found with some license. Check before updatingcorrect license-type.'.format(article_path))
        return

    permissions_tag = tree.find('.//{%s}permissions' % jats_ns)
    if permissions_tag is None:
        # Nothing to attach the license to: log and skip, so one odd file does
        # not abort the whole batch.
        logging.error('File: {}. No permissions element found; free license not added.'.format(article_path))
        return

    new_license_tag = etree.SubElement(permissions_tag, 'license')
    new_license_tag.attrib['license-type'] = 'free'

    logging.info('File: {}. Updating file with free license.'.format(article_path))

    with open(article_path, 'wb') as doc:
        doc.write(etree.tostring(tree, pretty_print=True))

In [None]:
def convert_issue_xml(path_to_issue_file, df_filter, unique_headings, issue_folder_name):
    """Rebuild the table of contents of an issue_meta file and fetch a missing cover.

    Args:
        path_to_issue_file: path of the issue_meta XML file. The issue's
            ``assets`` folder is expected two directory levels above it.
        df_filter: DataFrame with ``article_doi`` and ``article_title``
            columns for the articles of this issue.
        unique_headings: TOC section headings, in the order they are emitted.
        issue_folder_name: accepted for compatibility with existing callers;
            not used.

    Returns:
        The root element of the parsed issue file with a freshly built
        ``<table-of-contents>`` block (and, if a cover was downloaded, a
        ``<supplementary-material>`` entry). Nothing is written to the issue
        file itself.
    """
    issue_ns = 'http://specifications.silverchair.com/xsd/1/21/SCJATS-journalissue.xsd'
    xlink_href = '{%s}href' % nsmap['xlink']
    assets_folder = '{}/{}'.format(os.path.dirname(os.path.dirname(path_to_issue_file)), 'assets')

    tree = etree.parse(path_to_issue_file, parser=parser)
    root = tree.getroot()

    # Some issues are missing their covers and we need to download them from the site
    # This checks if a cover exists and if it doesn't it grabs it
    def get_cover():
        if glob('{}/*cover.*'.format(assets_folder)):
            print('cover found in {}'.format(path_to_issue_file))
            return
        print('cover missing')

        # The URL to the issue homepage follows a pattern that we can stitch
        # together using the volume and issue. Read them before starting a
        # browser so a malformed issue file cannot leave a session behind.
        volume = root.find('.//{%s}volume' % issue_ns).text
        issue = root.find('.//{%s}issue' % issue_ns).text
        toc_path = 'https://www.mitpressjournals.org/toc/{}/{}/{}'.format(JRN_CODE, volume, issue)

        # A dedicated browser session is used for the cover (the scraping
        # session may already be closed); it is started from the configured
        # WD_PATH rather than a machine-specific absolute path.
        cover_driver = webdriver.Chrome(WD_PATH)
        try:
            cover_driver.get(toc_path)
            soup = BeautifulSoup(cover_driver.page_source, features='lxml')

            cover_tag = soup.find_all('img', {'alt':'Publication Cover'})
            cover_path = 'https://www.mitpressjournals.org/{}'.format(cover_tag[0]['src'])
            cover_name = cover_path.split('/')[-1]

            response = requests.get(cover_path)
            # Do not save an HTTP error page as the cover image.
            response.raise_for_status()
            with open('{}/{}'.format(assets_folder, cover_name), 'wb') as cover_file:
                cover_file.write(response.content)

            # Reference the cover in the XML only once it is really on disk.
            supplementary_material = etree.SubElement(root, 'supplementary-material')
            supplementary_material.attrib[xlink_href] = cover_name
            supplementary_material.attrib['content-type'] = 'cover'
        except Exception as e:
            # Fetching the cover is best-effort; the TOC update still goes ahead.
            print('error getting cover: {}'.format(e))
        finally:
            try:
                cover_driver.quit()
            except Exception as e:
                print(e)

    def add_toc_section(table_of_contents, rows, heading=None):
        # One <toc-sec> with a <title> (left empty when heading is None) and
        # one <toc-content> per article row.
        toc_sec = etree.SubElement(table_of_contents, 'toc-sec')
        toc_title = etree.SubElement(toc_sec, 'title')
        if heading is not None:
            toc_title.text = heading
        for _, row in rows.iterrows():
            toc_content = etree.SubElement(toc_sec, 'toc-content')
            toc_content.attrib['pub-id-type'] = 'doi'
            toc_content.attrib[xlink_href] = row['article_doi']

    get_cover()

    # Now we update the issue_meta file: drop the existing TOC block (if the
    # file has one) and append a new one.
    toc_block = root.find('.//{%s}table-of-contents' % issue_ns)
    if toc_block is not None:
        toc_block.getparent().remove(toc_block)

    table_of_contents = etree.SubElement(root, 'table-of-contents')

    # Articles without a heading in the TOC go first, in an untitled section.
    no_heading = df_filter[pd.isna(df_filter['article_title'])]
    if not no_heading.empty:
        add_toc_section(table_of_contents, no_heading)

    for heading in unique_headings:
        add_toc_section(table_of_contents, df_filter[df_filter.article_title == heading], heading)
    return root

def get_doi_list(path):
    """Return TOC data for the articles found in an issue's XML folder.

    Args:
        path: folder holding the article files of one issue; each file is
            named after its DOI suffix.

    Returns:
        tuple: ``(df_filter, unique_headings)`` where ``df_filter`` holds the
        rows of ``<JRN_CODE>.csv`` whose upper-cased ``article_doi`` belongs
        to a file in ``path``, and ``unique_headings`` is the list of
        distinct non-null ``article_title`` values of those rows, in order of
        first appearance.
    """
    # Remove macOS Finder metadata (anything with '.DS_Store' in its name) so it
    # is not mistaken for an article. Done in Python instead of shelling out to
    # `find`, so paths with spaces or shell metacharacters are handled safely.
    for folder, _dirs, file_names in os.walk(path):
        for file_name in file_names:
            if '.DS_Store' in file_name:
                os.remove(os.path.join(folder, file_name))

    doi_list = ['10.1162/{}'.format(os.path.splitext(item)[0].upper()) for item in os.listdir(path)]

    df = pd.read_csv('{}.csv'.format(JRN_CODE))
    # Upper-case the DOIs so they compare equal to the ones built from file names.
    df['article_doi'] = df['article_doi'].str.upper()
    df_filter = df[df.article_doi.isin(doi_list)]

    unique_headings = [value for value in pd.unique(df_filter['article_title']) if pd.notna(value)]

    return df_filter, unique_headings



<!-- Caused by line breaks in source XML. Need to find *anything* after surname and string-name and remove accordingly. 
- all files need to be formatted and indented. 

Need to join lines and normalize whitespace to start (collapse each run to a single space):
F: \s+
R: ' '

From there we need to fix formatting:
F: </surname> </
R: </surname></

F: </string-name> </
R: </string-name></

F: </given-names> </string-name>
R: </given-names></string-name>

F: </collab> </person-group>
R: </collab></person-group> -->