<h3> Extracting DOIs </h3>

In [36]:
import requests
import time
import json

def fetch_dois(url, params, issn, max_retries=5):
    """Harvest DOIs and titles from a cursor-paginated works endpoint.

    Issues GET requests against ``url``, reads records from
    ``message["items"]`` and follows ``message["next-cursor"]`` until a page
    is empty, no cursor is returned, or the retry budget is used up.
    Nothing is returned; all results are written to disk.

    Args:
        url: Endpoint to query.
        params: Query parameters for the first request. Pagination works on
            a copy, so the caller's dict is never modified.
        issn: Used only to build the output file names.
        max_retries: The run stops at the ``max_retries``-th consecutive
            failed attempt (timeout, HTTP 429, or any other non-200 status).
            The counter resets after every successfully processed page.

    Files written (current working directory, UTF-8):
        dois_{issn}.txt: one DOI per line; always written, even after an
            early exit.
        doi_issues_{issn}_title.json: DOI -> title mapping; always written.
        dois_{issn}_partial.txt: checkpoint refreshed each time at least
            1000 new DOIs have been collected since the previous checkpoint.
        dois_{issn}_error_partial.txt and
        doi_issues_{issn}_title_error_partial.json: snapshot taken when an
            unexpected exception ends the run.
    """
    dois = []
    doi_issue_dict = {}

    def _title_of(item):
        # A record's title may be a list of strings, a bare value, or absent.
        # An empty list/string counts as "no title" instead of being stored
        # as the literal text "[]".
        raw = item.get("title")
        if isinstance(raw, list):
            raw = raw[0] if raw else None
        if raw is None or raw == "":
            return "No title available"
        return raw if isinstance(raw, str) else str(raw)

    def _write_dois(path):
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(f"{doi}\n" for doi in dois)

    def _write_titles(path):
        with open(path, "w", encoding="utf-8") as f:
            json.dump(doi_issue_dict, f, indent=2)

    query = dict(params)  # paginate on a private copy of the caller's params
    retries = 0
    last_checkpoint = 0

    while True:
        try:
            response = requests.get(url, params=query, timeout=10)

            if response.status_code == 429:
                # Rate-limit responses count against the retry budget so a
                # persistent 429 cannot loop forever.
                retries += 1
                if retries >= max_retries:
                    print("Max retries reached. Exiting...")
                    break
                print("Rate limit exceeded. Waiting 10 seconds...")
                time.sleep(10)
                continue
            if response.status_code != 200:
                print(f"Error: {response.status_code}")
                retries += 1
                if retries >= max_retries:
                    print("Max retries reached. Exiting...")
                    break
                time.sleep(5)
                continue

            data = response.json()
            message = data["message"]

            # Records normally live under message.items; fall back to the
            # message itself when the endpoint returns a bare collection.
            items = message["items"] if "items" in message else message

            if not items:
                print("No more items found. Exiting...")
                break

            for item in items:
                doi = item.get("DOI") if isinstance(item, dict) else None
                if not doi:
                    # Skip malformed records rather than aborting the harvest.
                    continue
                dois.append(doi)
                doi_issue_dict[doi] = _title_of(item)

            print(f"Retrieved {len(dois)} DOIs so far...")

            # Checkpoint on progress since the last save, so it still fires
            # when the running total is not an exact multiple of 1000.
            if len(dois) - last_checkpoint >= 1000:
                _write_dois(f"dois_{issn}_partial.txt")
                last_checkpoint = len(dois)

            # Check if there are more pages
            next_cursor = message.get("next-cursor")
            if not next_cursor:
                print("No more pages found. Exiting...")
                break

            query["cursor"] = next_cursor
            time.sleep(1)
            retries = 0

        except requests.exceptions.Timeout:
            print("Request timed out. Retrying...")
            retries += 1
            if retries >= max_retries:
                print("Max retries reached. Exiting...")
                break
            time.sleep(5)
        except Exception as e:
            print(f"An error occurred: {e}")
            # Save what we have so far instead of losing everything
            if dois:
                _write_dois(f"dois_{issn}_error_partial.txt")
                _write_titles(f"doi_issues_{issn}_title_error_partial.json")
                print(f"Saved {len(dois)} DOIs collected before error.")
            break

    # Final outputs are written on every exit path, including early exits.
    _write_dois(f"dois_{issn}.txt")
    _write_titles(f"doi_issues_{issn}_title.json")

    print(f"Retrieved {len(dois)} DOIs. Saved to dois_{issn}.txt and doi_issues_{issn}_title.json")

In [31]:
import requests
import time
import json
import random

def fetch_dois(url, params, issn, max_retries=5, initial_delay=1, max_delay=30):
    """Harvest DOIs and titles from a cursor-paginated works endpoint, with
    adaptive pacing between requests.

    Issues GET requests against ``url``, reads records from
    ``message["items"]`` and follows ``message["next-cursor"]`` until a page
    is empty, no cursor is returned, or the retry budget is used up.
    Nothing is returned; all results are written to disk.

    Args:
        url: Endpoint to query.
        params: Query parameters for the first request. Pagination works on
            a copy, so the caller's dict is never modified.
        issn: Used only to build the output file names.
        max_retries: The run stops at the ``max_retries``-th consecutive
            failed attempt (timeout, HTTP 429, or any other non-200 status).
            The counter resets after every successfully processed page.
        initial_delay: Baseline pause in seconds between requests; the
            actual pause is jittered to 80-120% of the current delay.
        max_delay: Upper bound in seconds for the backed-off delay.

    Pacing: the delay doubles after a 429, grows by 1.5x after a timeout,
    and shrinks by 20% per successful response until it is back at
    ``initial_delay``. Before every 10th, 20th, ... request an extra pause
    of twice the jittered delay is taken.

    Files written (current working directory, UTF-8):
        dois_{issn}.txt and doi_issues_{issn}_title.json: final outputs;
            always written, even after an early exit.
        dois_{issn}_partial.txt and doi_issues_{issn}_title_partial.json:
            checkpoint refreshed each time at least 1000 new DOIs have been
            collected since the previous checkpoint.
        dois_{issn}_error_partial.txt and
        doi_issues_{issn}_title_error_partial.json: snapshot taken when an
            unexpected exception ends the run.
    """
    dois = []
    doi_issue_dict = {}

    def _title_of(item):
        # A record's title may be a list of strings, a bare value, or absent.
        # An empty list/string counts as "no title" instead of being stored
        # as the literal text "[]".
        raw = item.get("title")
        if isinstance(raw, list):
            raw = raw[0] if raw else None
        if raw is None or raw == "":
            return "No title available"
        return raw if isinstance(raw, str) else str(raw)

    def _save(txt_path, json_path):
        with open(txt_path, "w", encoding="utf-8") as f:
            f.writelines(f"{doi}\n" for doi in dois)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(doi_issue_dict, f, indent=2)

    query = dict(params)  # paginate on a private copy of the caller's params
    retries = 0
    current_delay = initial_delay
    request_count = 0
    last_checkpoint = 0

    while True:
        try:
            # Add jitter to avoid predictable patterns that might trigger rate limiting
            actual_delay = current_delay * (0.8 + 0.4 * random.random())  # 80-120% of current delay

            # Every 10 requests, take a longer break to be extra cautious
            if request_count > 0 and request_count % 10 == 0:
                print(f"Taking a short break after {request_count} requests...")
                time.sleep(actual_delay * 2)

            request_count += 1
            response = requests.get(url, params=query, timeout=10)

            if response.status_code == 429:
                # Rate-limit responses count against the retry budget so a
                # persistent 429 cannot loop forever.
                retries += 1
                if retries >= max_retries:
                    print("Max retries reached. Exiting...")
                    break
                print(f"Rate limit exceeded. Waiting {current_delay * 2} seconds...")
                time.sleep(current_delay * 2)
                # Double the delay for next time, up to max_delay
                current_delay = min(current_delay * 2, max_delay)
                continue
            if response.status_code != 200:
                print(f"Error: {response.status_code}")
                retries += 1
                if retries >= max_retries:
                    print("Max retries reached. Exiting...")
                    break
                time.sleep(current_delay)
                continue

            # Gradually reset delay on successful requests
            if current_delay > initial_delay:
                current_delay = max(current_delay * 0.8, initial_delay)

            data = response.json()
            message = data["message"]

            # Records normally live under message.items; fall back to the
            # message itself when the endpoint returns a bare collection.
            items = message["items"] if "items" in message else message

            if not items:
                print("No more items found. Exiting...")
                break

            for item in items:
                doi = item.get("DOI") if isinstance(item, dict) else None
                if not doi:
                    # Skip malformed records rather than aborting the harvest.
                    continue
                dois.append(doi)
                doi_issue_dict[doi] = _title_of(item)

            print(f"Retrieved {len(dois)} DOIs so far...")

            # Checkpoint on progress since the last save, so it still fires
            # when the running total is not an exact multiple of 1000.
            if len(dois) - last_checkpoint >= 1000:
                _save(f"dois_{issn}_partial.txt", f"doi_issues_{issn}_title_partial.json")
                last_checkpoint = len(dois)

            # Check if there are more pages
            next_cursor = message.get("next-cursor")
            if not next_cursor:
                print("No more pages found. Exiting...")
                break

            query["cursor"] = next_cursor
            # Use dynamic delay between requests
            time.sleep(actual_delay)
            retries = 0

        except requests.exceptions.Timeout:
            print(f"Request timed out. Waiting {current_delay} seconds before retrying...")
            retries += 1
            if retries >= max_retries:
                print("Max retries reached. Exiting...")
                break
            time.sleep(current_delay)
            # Increase delay for next attempt
            current_delay = min(current_delay * 1.5, max_delay)
        except Exception as e:
            print(f"An error occurred: {e}")
            # Save what we have so far instead of losing everything
            if dois:
                _save(f"dois_{issn}_error_partial.txt", f"doi_issues_{issn}_title_error_partial.json")
                print(f"Saved {len(dois)} DOIs collected before error.")
            break

    # Final outputs are written on every exit path, including early exits.
    _save(f"dois_{issn}.txt", f"doi_issues_{issn}_title.json")

    print(f"Retrieved {len(dois)} DOIs. Saved to dois_{issn}.txt and doi_issues_{issn}_title.json")

Fetch DOIs of American Sociological Review

In [32]:
# Harvest every work listed under ISSN 0003-1224 from the Crossref journals
# endpoint; results land in dois_0003-1224.txt and
# doi_issues_0003-1224_title.json.
issn = "0003-1224"
url = f"https://api.crossref.org/journals/{issn}/works"
# "*" is the starting cursor; fetch_dois then pages with each response's
# "next-cursor". rows is presumably the page size (the log below advances
# by 1000 per request).
params = {"cursor": "*", "rows": 1000}
fetch_dois(url, params, issn, max_retries=5)

Retrieved 1000 DOIs so far...
Retrieved 2000 DOIs so far...
Retrieved 3000 DOIs so far...
Retrieved 4000 DOIs so far...
Retrieved 5000 DOIs so far...
Retrieved 6000 DOIs so far...
Retrieved 7000 DOIs so far...
Retrieved 8000 DOIs so far...
Retrieved 9000 DOIs so far...
Retrieved 10000 DOIs so far...
Taking a short break after 10 requests...
Retrieved 11000 DOIs so far...
Retrieved 12000 DOIs so far...
Retrieved 13000 DOIs so far...
Retrieved 13177 DOIs so far...
No more items found. Exiting...
Retrieved 13177 DOIs. Saved to dois_0003-1224.txt and doi_issues_0003-1224_title.json


Fetch DOIs of American Journal of Sociology

In [39]:
# Harvest every work listed under ISSN 0002-9602 from the Crossref journals
# endpoint; results land in dois_0002-9602.txt and
# doi_issues_0002-9602_title.json.
# NOTE: this rebinds the notebook-level issn/url/params used by the previous
# fetch cell.
issn = "0002-9602"
url = f"https://api.crossref.org/journals/{issn}/works"
# "*" is the starting cursor; fetch_dois then pages with each response's
# "next-cursor". rows is presumably the page size (the log below advances
# by 1000 per request).
params = {"cursor": "*", "rows": 1000}
fetch_dois(url, params, issn, max_retries=5)

Retrieved 1000 DOIs so far...
Retrieved 2000 DOIs so far...
Retrieved 3000 DOIs so far...
Retrieved 4000 DOIs so far...
Retrieved 5000 DOIs so far...
Retrieved 6000 DOIs so far...
Retrieved 7000 DOIs so far...
Retrieved 8000 DOIs so far...
Retrieved 9000 DOIs so far...
Retrieved 10000 DOIs so far...
Taking a short break after 10 requests...
Retrieved 11000 DOIs so far...
Retrieved 12000 DOIs so far...
Retrieved 13000 DOIs so far...
Retrieved 14000 DOIs so far...
Retrieved 15000 DOIs so far...
Retrieved 16000 DOIs so far...
Retrieved 17000 DOIs so far...
Retrieved 18000 DOIs so far...
Retrieved 19000 DOIs so far...
Retrieved 20000 DOIs so far...
Taking a short break after 20 requests...
Retrieved 21000 DOIs so far...
Retrieved 22000 DOIs so far...
Retrieved 23000 DOIs so far...
Retrieved 24000 DOIs so far...
Retrieved 25000 DOIs so far...
Retrieved 26000 DOIs so far...
Retrieved 27000 DOIs so far...
Retrieved 27612 DOIs so far...
No more items found. Exiting...
Retrieved 27612 DOIs. Sav

Delete all irrelevant DOIs in AJS

In [None]:
import re

# DOIs of the form 10.1086/ajs.<suffix> are treated as irrelevant and blanked.
pattern = re.compile(r'10\.1086/ajs\.[^\s]+')

input_file = 'dois_0002-9602.txt'
output_file = 'dois_0002-9602_clean.txt'

# Stream the DOI list through the pattern: every match is removed from its
# line, and a line left as a bare newline is not copied to the cleaned file.
with open(input_file, 'r', encoding='utf-8') as src:
    with open(output_file, 'w', encoding='utf-8') as dst:
        scrubbed_lines = (pattern.sub('', raw_line) for raw_line in src)
        dst.writelines(kept for kept in scrubbed_lines if kept != '\n')

<h3> Download all ASR PDFs </h3>

In [13]:
def split_list(lst, n):
    """Cut ``lst`` into ``n`` consecutive slices of near-equal length.

    Every slice holds ``len(lst) // n`` elements, and the first
    ``len(lst) % n`` slices each take one extra, so lengths differ by at
    most one. Slices are returned in order; trailing slices are empty when
    ``n`` exceeds ``len(lst)``.
    """
    base_size, extras = divmod(len(lst), n)
    parts = []
    start = 0
    for index in range(n):
        stop = start + base_size + (1 if index < extras else 0)
        parts.append(lst[start:stop])
        start = stop
    return parts

def split_doc(issn, n):
    """Split ``dois_{issn}.txt`` into ``n`` batch files of near-equal size.

    Blank lines are dropped and surrounding whitespace is stripped from each
    DOI. Batch ``i`` (1-based) is written to ``dois_{issn}_batch_{i}.txt``
    with one DOI per line. Progress is printed along the way, followed by a
    confirmation once the batch sizes add up to the number of DOIs read.
    """
    with open(f'dois_{issn}.txt', 'r') as source:
        stripped = (raw.strip() for raw in source)
        dois = [entry for entry in stripped if entry]
        print(f"Total DOIs: {len(dois)}")

    written = 0
    batch_no = 0
    for chunk in split_list(dois, n):
        batch_no += 1
        print(f"Processing chunk {batch_no} with {len(chunk)} DOIs")
        written += len(chunk)
        with open(f'dois_{issn}_batch_{batch_no}.txt', 'w') as batch_file:
            batch_file.write('\n'.join(chunk) + '\n')

    if written == len(dois):
        print("All DOIs processed and saved.")

In [7]:
# Split dois_0003-1224.txt into 6 batch files
# (dois_0003-1224_batch_1.txt ... dois_0003-1224_batch_6.txt).
split_doc('0003-1224', 6)

Total DOIs: 13168
Processing chunk 1 with 2195 DOIs
Processing chunk 2 with 2195 DOIs
Processing chunk 3 with 2195 DOIs
Processing chunk 4 with 2195 DOIs
Processing chunk 5 with 2194 DOIs
Processing chunk 6 with 2194 DOIs
All DOIs processed and saved.


In [14]:
from oafuncs.oa_down.literature import download5doi

In [None]:
# Presumably bulk-downloads the PDFs for the DOIs listed in asr/asr_dois.txt
# into asr_pdf/ (behaviour of oafuncs' download5doi inferred from its
# arguments — confirm).
# NOTE(review): no visible cell creates asr/asr_dois.txt; confirm how it is
# derived from the dois_0003-1224*.txt files produced above.
download5doi(store_path=r'asr_pdf', txt_file=r'asr/asr_dois.txt')

<h3> Download all AJS PDFs </h3>

Trial: download a single PDF to check the manual-login flow

In [12]:
import undetected_chromedriver as uc
import time
import os

# Single-PDF smoke test of the manual-login download flow.
download_dir = os.path.abspath("downloads")
os.makedirs(download_dir, exist_ok=True)

options = uc.ChromeOptions()
prefs = {
    # Save downloads into download_dir instead of Chrome's default folder.
    "download.default_directory": download_dir,
    # Download PDFs rather than opening them in the built-in viewer.
    "plugins.always_open_pdf_externally": True
}
options.add_experimental_option("prefs", prefs)

driver = uc.Chrome(options=options)

# try/finally guarantees the browser is closed even if a navigation step
# raises or the cell is interrupted while waiting for the login.
try:
    driver.get("https://www.journals.uchicago.edu/action/ssostart")
    print("Please log in manually in the opened browser window.")
    time.sleep(60)
    input("After logging in, press Enter to continue...")

    download_url = "https://www.journals.uchicago.edu/doi/pdf/10.1086/733012?download=true"
    driver.get(download_url)
    # Fixed grace period for the download to finish before the browser closes.
    time.sleep(10)

    print("Download should be complete. Check the download folder:", download_dir)
finally:
    driver.quit()


2025-04-16 21:21:53,923 - INFO - patching driver executable /Users/tianleyee/Library/Application Support/undetected_chromedriver/undetected_chromedriver


Please log in manually in the opened browser window.
Download should be complete. Check the download folder: /Users/tianleyee/Desktop/downloads


Implementation: download every AJS PDF after one manual login

In [4]:
import undetected_chromedriver as uc
import time
import os
import random

def login_and_download_papers(download_urls, download_dir="ajs_pdf", 
                              login_url="https://www.journals.uchicago.edu/action/ssostart"):
    """Download a list of PDFs through a browser session the user logs into.

    Opens Chrome via undetected_chromedriver, loads ``login_url`` and waits
    for the user to authenticate by hand (a 60-second pause followed by an
    Enter prompt). Each URL in ``download_urls`` is then visited in order
    with a 2-second pause in between.

    Args:
        download_urls: Iterable of direct PDF download URLs.
        download_dir: Folder that receives the files; created if missing.
        login_url: Page opened first so the user can sign in.

    Returns:
        None. URLs whose navigation raised are printed at the end.

    The browser is always closed, even if the run is interrupted. Before it
    closes, the function waits (bounded) for in-progress downloads to finish
    so that the last requested files are not cut off.
    """
    download_dir = os.path.abspath(download_dir)
    os.makedirs(download_dir, exist_ok=True)

    options = uc.ChromeOptions()
    prefs = {
        # Save downloads into download_dir instead of Chrome's default folder.
        "download.default_directory": download_dir,
        # Download PDFs rather than opening them in the built-in viewer.
        "plugins.always_open_pdf_externally": True
    }
    options.add_experimental_option("prefs", prefs)
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")

    driver = uc.Chrome(options=options)
    failed_downloads = []

    try:
        driver.get(login_url)
        print("Please log in manually in the opened browser window.")
        time.sleep(60)
        input("After successfully logging in, press Enter to continue...")

        for url in download_urls:
            try:
                print(f"Downloading paper from: {url}")
                driver.get(url)
                time.sleep(2)
            except Exception as e:
                # Keep going: one bad URL should not end the whole batch.
                failed_downloads.append(url)
                print(f"Error downloading {url}: {e}")

        # Quitting right after the last request could abort transfers that
        # are still running, so give them a bounded chance to complete.
        if not _wait_for_pending_downloads(download_dir):
            print("Timed out waiting for in-progress downloads to finish.")

        if failed_downloads:
            print(f"{len(failed_downloads)} download(s) failed:")
            for failed_url in failed_downloads:
                print(f"  {failed_url}")

        print("All downloads should be complete. Check the download folder:", download_dir)
    finally:
        driver.quit()


def _wait_for_pending_downloads(directory, timeout=120, poll_interval=1):
    """Wait until ``directory`` holds no ``*.crdownload`` files.

    Chrome is assumed to keep in-progress downloads under a ``.crdownload``
    suffix until they complete (TODO confirm for the Chrome version in use).
    Returns True once none remain, or False if ``timeout`` seconds pass first.
    """
    deadline = time.monotonic() + timeout
    while True:
        if not any(name.endswith(".crdownload") for name in os.listdir(directory)):
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)

In [None]:
# Build one direct-download URL per DOI. Lines are stripped and blank lines
# skipped (the same way split_doc reads DOI files), so stray whitespace or an
# empty line cannot produce a malformed ".../doi/pdf/?download=true" URL.
# NOTE: positions in ajs_urls only line up with line numbers in the file when
# the file contains no blank lines.
with open('ajs/doi/dois_0002-9602.txt', 'r') as f:
    ajs_urls = [
        f"https://www.journals.uchicago.edu/doi/pdf/{doi}?download=true"
        for doi in (line.strip() for line in f)
        if doi
    ]

In [None]:
# Run the browser-driven download for the URLs from index 22000 onward.
# NOTE(review): 22000 looks like a manual resume offset from an earlier,
# partial run — confirm or reset it before re-running from scratch.
login_and_download_papers(ajs_urls[22000:])

Please log in manually in the opened browser window.
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/230277?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/228279?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/210508?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/210936?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/213400?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/210623?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/225917?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/343179?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/211402?download=true
Downloading paper from: https://www.journals.uchicago.edu/doi/pdf/10.1086/230433?download=tr

<h3> Get ASR paper DOIs </h3>

In [43]:
import json

# Load the DOI -> title mapping that fetch_dois wrote for ISSN 0003-1224.
with open('doi_issues_0003-1224_title.json', 'r') as f:
    asr_doi_titles = json.load(f)

In [44]:
# Load a DOI -> issue mapping for ISSN 0003-1224. The cell that filters by
# decade parses the text before "(" in each value as an integer.
# NOTE(review): no visible cell writes asr/doi_issues_0003-1224.json —
# confirm where it comes from.
with open("asr/doi_issues_0003-1224.json", "r") as f:
    asr_doi_issues = json.load(f)

In [47]:
# Re-reads the same DOI -> title file into the same variable as an earlier
# cell of this section; redundant unless the file changed in between.
with open('doi_issues_0003-1224_title.json', 'r') as f:
    asr_doi_titles = json.load(f)

In [51]:
# Offset that turns the leading number of an issue string into a year
# (year = number + 1935). Presumably the number is the volume and volume 1
# appeared in 1936 — TODO confirm.
ASR_VOLUME_YEAR_OFFSET = 1935

# Collect the DOIs whose derived year falls in 2010-2019.
asr_dois_2010s = []

for doi, issue in asr_doi_issues.items():
    try:
        # The integer before the first "(" is taken as the number.
        number = int(issue.split('(')[0])
    except (ValueError, AttributeError):
        # ValueError: unparseable text such as "N/A".
        # AttributeError: non-string value (e.g. a JSON null), which would
        # otherwise abort the whole cell.
        print(f"Skipping DOI {doi} due to parsing error with issue: {issue}")
        continue

    year = number + ASR_VOLUME_YEAR_OFFSET
    if 2010 <= year < 2020:
        asr_dois_2010s.append(doi)

Skipping DOI 10.1177/00031224241298008 due to parsing error with issue: N/A
Skipping DOI 10.1177/00031224241303459 due to parsing error with issue: N/A
Skipping DOI 10.1177/00031224241286480 due to parsing error with issue: N/A
Skipping DOI 10.1177/00031224241305357 due to parsing error with issue: N/A


In [57]:
# Keep only the titles whose DOI was selected for the 2010s. Membership is
# tested against a set so the filter stays linear instead of rescanning the
# asr_dois_2010s list for every title.
asr_dois_2010s_lookup = set(asr_dois_2010s)
asr_dois_2010_with_titles = {
    doi: title
    for doi, title in asr_doi_titles.items()
    if doi in asr_dois_2010s_lookup
}

In [96]:
# Number of 2010s DOIs that have a title entry.
len(asr_dois_2010_with_titles)

484

In [92]:
# Separate research papers from front matter, errata and similar items by
# dropping every entry whose title contains one of the phrases below.
# NOTE(review): the result names say "2020", but the input is the 2010-2019
# selection (asr_dois_2010_with_titles).
asr_paper_2020_dois = []
asr_paper_2020_dois_with_titles = {}

# Case-sensitive substrings; a single hit anywhere in the title excludes it.
excluded_patterns = [
    "Acknowledgment of Referees",
    "ASR 2010 to 2012",
    "Comments",
    "Correction",
    "Corrigendum",
    "Editorial Transition",
    "Editors' Comment",
    "Further Data",
    "Errata",
    "Erratum",
    "Index",
    "Meeting the Challenges of a 21st-Century Flagship Journal",
]

for doi, title in asr_dois_2010_with_titles.items():
    if any(pattern in title for pattern in excluded_patterns):
        continue
    asr_paper_2020_dois.append(doi)
    asr_paper_2020_dois_with_titles[doi] = title

In [97]:
# Print the retained ASR paper DOIs, one per line, for inspection.
for doi in asr_paper_2020_dois:
    print(doi)

10.1177/0003122419832497
10.1177/0003122411420814
10.1177/0003122413480362
10.1177/0003122411407736
10.1177/0003122411420815
10.1177/0003122412448050
10.1177/0003122411407748
10.1177/0003122412470829
10.1177/0003122413505198
10.1177/0003122413505588
10.1177/0003122419844992
10.1177/0003122414528936
10.1177/0003122415589170
10.1177/0003122418811112
10.1177/0003122412469204
10.1177/0003122419831228
10.1177/0003122411428221
10.1177/0003122410396195
10.1177/0003122418759544
10.1177/0003122416670655
10.1177/0003122415596999
10.1177/0003122418806284
10.1177/0003122414553657
10.1177/0003122410382639
10.1177/0003122418784909
10.1177/0003122417712729
10.1177/0003122411420816
10.1177/0003122410363563
10.1177/0003122410363564
10.1177/0003122419846849
10.1177/0003122413476034
10.1177/0003122413512316
10.1177/0003122413494759
10.1177/0003122414531596
10.1177/0003122416683394
10.1177/0003122415613078
10.1177/0003122418792836
10.1177/0003122417703087
10.1177/0003122419877135
10.1177/0003122417718165


<h3> Get AJS paper DOIs </h3>

In [19]:
import json

# Load the DOI -> title mapping that fetch_dois wrote for ISSN 0002-9602.
with open('doi_issues_0002-9602_title.json', 'r') as f:
    ajs_doi_titles = json.load(f)

In [28]:
# Separate research papers from reviews, front/back matter, notices and
# similar items using the title text alone.
ajs_paper_dois = []
ajs_paper_dois_with_titles = {}

# Case-sensitive substrings; a single hit anywhere in the title excludes it.
# NOTE: this rebinds the excluded_patterns name also used by the ASR filter.
excluded_patterns = [
    "<i>",  # Italics tags
    "A Letter from",
    "Abstracts of",
    "Acknowledgments",
    "Annual Meeting",
    "Appendix",
    "Announcement",
    "Author Index",
    "Back Matter",
    "Bibliography",
    'Bibliographical Note',
    "Book Notes",
    "Book Review",
    "BOOK REVIEWERS",
    "Book Received",
    "Books Received",
    "Comment on",
    "Contents of Volume",
    "Contributors",
    "Current Books",
    "Current Research Projects",
    "Editor's",
    "Editor’s",
    "Editorial",
    "Erratum",
    "Errata",
    "Foreword",
    "Front Matter",
    "In Memoriam",
    "Index to Volume",
    "Introduction to Symposium",
    "Letter to the Editor",
    "Letters to the Editor",
    "Masthead",
    "News and Notes",
    "News from Abroad",
    "No title available",
    "Notes and Abstracts",
    "Obituary",
    "Preface",
    "Recent Literature",
    "Rejoinder",
    "Reply",
    "Replies",
    "Municipal Review",
    "Review Essay",
    "Subject Index",
    "Students' Dissertations in Sociology",
    "the President of the American Sociological Society",
    "The Family",
    "Volume Information",
]

# Titles excluded only when they match exactly (whole-title comparison).
absolute_excluded_patterns = [
    "Communication",  # marked "?" by the author: exclusion still undecided
    "Comment",  # marked "?" by the author: exclusion still undecided
]

for doi, title in ajs_doi_titles.items():
    has_excluded_phrase = any(pattern in title for pattern in excluded_patterns)
    is_excluded_title = title in absolute_excluded_patterns
    if has_excluded_phrase or is_excluded_title:
        continue
    ajs_paper_dois.append(doi)
    ajs_paper_dois_with_titles[doi] = title

In [98]:
# Number of AJS entries kept after title-based filtering.
len(ajs_paper_dois_with_titles)

5714

In [29]:
# Show the retained titles to eyeball what the filter let through.
list(ajs_paper_dois_with_titles.values())

['Community Organization',
 'The Basis of Sociality',
 'When Truth Trumps Facts: Studies on Partisan Moral Flexibility in American Politics',
 'The Literary Interests of Chicago. I and II',
 'Education',
 'The First German Municipal Exposition. (Dresden, 1903.) III',
 'Attitudinal Multivalence in Relation to Culture and Personality',
 'A Movement to Preserve Social Science Source Materials',
 'The Relation of Sex to Primitive Social Control',
 'The Race-Preservation Dogma',
 'How Much Post-War Migration?',
 'The Russian Experiment',
 'The Structure of Intergenerational Exchanges in American Families',
 'Escape from the GDR, 1961–1989: Hybrid Exit Repertoires in a Disintegrating Leninist Regime',
 'Talking City Trouble: Interactional Vandalism, Social Inequality, and the “Urban Interaction Problem”',
 'Community Change and Patterns of Delinquency',
 'Introduction to Sociology. X',
 'Population of the United States, 1925 to 1975',
 'The Development of Rural Sociology',
 'The Sad Estate o

In [99]:
# Persist the retained AJS paper DOIs, one per line.
with open('ajs_paper_dois.txt', 'w') as f:
    f.writelines(f"{doi}\n" for doi in ajs_paper_dois)