# Description
The purpose of this script is to crawl your Reader account for potential duplicate articles, where a duplicate is defined as two or more articles sharing the same title. Because multiple articles can have the same title without being duplicates, you can add a note reading "not a dupe" to an article and it will be excluded. For my purposes, further limiting the match by author etc. resulted in too many false negatives (mainly with articles from Medium, or articles from certain feeds), but future enhancements could be made to suit your personal use cases.

# Note
This does not automatically delete duplicates, because
1. The Reader API does not currently allow that, and
2. Even if it did, you'd want to have control over what *you* consider the right copy to delete

# Prerequisites
- Your own API key for your Reader account. The API key can be obtained at https://readwise.io/reader_api
- A file named .env in the same directory, containing "API_KEY=yourkeyhere"

In [None]:
# Set the Reader api key from the .env file in the same directory. 

from dotenv import dotenv_values

# Parse the key/value pairs stored in the ".env" file (path is relative, so
# presumably resolved against the notebook's working directory — confirm).
config = dotenv_values(".env")
# Reader API token used in the Authorization header of every request below.
# A missing API_KEY entry surfaces here as a KeyError.
token = config['API_KEY']


In [None]:
import datetime
import requests
import time

# In VS, it may be helpful to suppress warnings
#requests.packages.urllib3.disable_warnings()


def fetch_reader_document_list_api():
    """Download every document in the Reader library via the list endpoint.

    Pages through the API by following ``nextPageCursor`` until the response
    no longer contains one. An HTTP 429 (rate limited) response is not treated
    as an error: the function sleeps for the advertised ``Retry-After`` period
    and then repeats the same page request.

    Uses the module-level ``token`` for authentication.

    Returns:
        list: The concatenated ``results`` entries of every page.

    Raises:
        requests.HTTPError: If the API answers with an error status other than
            429 (for example 401 for an invalid token).
        requests.Timeout: If a single request gets no answer within 60 seconds.
    """
    full_data = []
    next_page_cursor = None
    print("Beginning export api request...")
    print('--------------------------------------------------')
    while True:
        params = {}
        if next_page_cursor:
            params['pageCursor'] = next_page_cursor
        print("Making export api request with params " + str(params) + "...")
        response = requests.get(
            url="https://readwise.io/api/v3/list/",
            params=params,
            headers={"Authorization": f"Token {token}"},
            verify=None,  # None leaves requests' default certificate handling in place
            timeout=60,  # avoid hanging forever on a stalled connection
        )

        # According to the official Reader documentation, the rate limiting's
        # default base rate is 20 requests per minute. To account for this when
        # dealing with a large library, wait until it's clear to keep going.
        if response.status_code == 429:
            # Fall back to a full minute if the server omits the header.
            retry_after = int(response.headers.get("Retry-After", 60))
            print("Sleeping for: ", retry_after)
            time.sleep(retry_after)
            continue  # retry the same page; the cursor has not advanced

        # Fail loudly on auth/server errors instead of crashing later on a
        # payload that has no 'results'.
        response.raise_for_status()

        payload = response.json()  # parse the body once per page
        full_data.extend(payload.get('results') or [])
        next_page_cursor = payload.get('nextPageCursor')

        if not next_page_cursor:
            break
    print("import complete")
    return full_data



# Fetch the whole library once; later cells filter and compare this list.
allData = fetch_reader_document_list_api()




In [None]:

# Since the API returns all things (not just articles, but highlights, etc. as
# well), this keeps only the entries that actually carry a title.
def getArticles(fulllist):
    """Return the entries of *fulllist* that have a title.

    Each kept entry is annotated in place with a ``'lowercasetitle'`` key
    holding the lower-cased title, which later steps use for case-insensitive
    comparison. Entries whose ``'title'`` is ``None`` or missing altogether
    are skipped.

    Args:
        fulllist: Iterable of dicts as returned by the Reader list endpoint.

    Returns:
        list: The same dict objects (not copies), in their original order.
    """
    allArticles = []

    for listitem in fulllist:
        # .get() so an entry without a 'title' key is skipped, not a KeyError.
        title = listitem.get('title')
        if title is None:
            continue
        listitem['lowercasetitle'] = title.lower()
        allArticles.append(listitem)

    return allArticles

# Titled entries only, each annotated with a 'lowercasetitle' key.
allArticles=getArticles(allData)


In [None]:

# Assumes that the list input will have been cleaned up by getArticles().
# Returns the list of titles that are duplicates to search for.
def getDupeTitles(allArticles):
    """Return the sorted lower-cased titles that occur more than once.

    Articles whose note is exactly ``"not a dupe"`` and articles with an empty
    title are ignored entirely, so the result does not depend on the order in
    which the API happened to list the articles.

    Args:
        allArticles: Iterable of article dicts carrying a ``'lowercasetitle'``
            key (as added by ``getArticles``) and, optionally, ``'notes'``.

    Returns:
        list: Duplicate titles (lower-cased), sorted ascending, each once.
    """
    seen = set()
    dupes = set()

    for article in allArticles:
        title = article['lowercasetitle']
        # Skip untitled entries and anything the user explicitly cleared.
        if not title or article.get('notes') == "not a dupe":
            continue
        if title in seen:
            dupes.add(title)
        else:
            seen.add(title)

    return sorted(dupes)

# Sorted lower-cased titles that appear on more than one article.
dupeTitles=getDupeTitles(allArticles)




In [None]:


# Print the results in an easy to read format.
def printDetails(dupeTitles,allArticles):
    """Print location, url and title of every article sharing a duplicate title.

    Articles are grouped per entry of *dupeTitles*; each group is followed by
    a dashed separator line. When *dupeTitles* is empty a single
    "no dupes found!" message is printed instead.

    Args:
        dupeTitles: Lower-cased titles to report, in output order.
        allArticles: Article dicts with 'lowercasetitle', 'location', 'url'
            and 'title' keys.
    """
    if not dupeTitles:
        print('no dupes found!')
        return

    for wanted in dupeTitles:
        # Lazily pick out the articles whose lower-cased title matches.
        matching = (entry for entry in allArticles if entry['lowercasetitle'] == wanted)
        for entry in matching:
            print(f'location: {entry["location"]} ,url: {entry["url"]} ,title: "{entry["title"]}"')
        print('--------------------------------------------------')


# Show every article whose title was flagged as a duplicate.
printDetails(dupeTitles,allArticles)
