In [None]:
import csv
import os
import pickle
import time
from csv import writer
from os.path import exists

import numpy as np
import requests
from dotenv import load_dotenv

In [None]:
# Documentation at : https://github.com/berkmancenter/lumendatabase/wiki/Lumen-API-documentation
# Read the API Terms of Use at : https://lumendatabase.org/pages/api_terms

# Output files: the collected notices and the exported list of infringing URLs.
destFile, infringing = "output.csv", "infringing_urls.csv"

# Credentials are read from the environment (load_dotenv() picks up a .env file first).
load_dotenv()
APIkey = os.environ.get("API_KEY")
userAgent = os.environ.get("USER_AGENT")

# Request settings for the notice search endpoint.
endpoint = "https://lumendatabase.org/notices/search.json?"
typeSearch = "term"
headers = {'User-Agent': userAgent}

# Column order of every row written to destFile.
fieldnames = [
    'id', 'type', 'date_received', 'sender_name', 'jurisdictions',
    'copyrighted_urls', 'infringing_urls', 'searched_url', 'score',
]


In [None]:
 # Search entry or MODE 1
init_url = [
    "https://newsworldinfo.over-blog.com/",
]

# URLs or other words to skip.
to_skip = [
    "No URL submitted",
]

# Substrings used to recognize a blogging platform (MODE 3, SAVE_ONLY_BAD_URLS).
bad_words = [
    "blogspot",
    "issuu",
    "livejournal",
    "weebly",
    "wordpress",
    "tumblr",
    "over-blog",
    "food.blog",
]


In [None]:
# MODES (default = 1) -- selects where main() takes its search terms from.
#
# 1 : Search the init_url list
# 2 : Search the URLs returned by get_infringing_urls(), which reads them back from destFile
# 3 : Search the bad URLs stored in bad_urls.pkl by earlier runs (found using bad_words)
# 4 : Save the infringing urls found in a CSV file (save_inf_urls) and stop; only when destFile exists
#
# main() falls back to init_url when destFile does not exist yet, or when
# mode 3 is selected but bad_urls.pkl is missing.

GET_URLS = 1

# SAVE_ONLY_BAD_URLS
#
# True : Only saves results where one of the bad_words is found (decrease noise)
# False: Saves all results without discriminating (increase noise)
#
# NOTE(review): main() runs the bad_words check on the copyrighted_urls entries
# of each work, not on the infringing_urls entries -- confirm this is the intended field.

SAVE_ONLY_BAD_URLS = True

def contains_bad_word(url):
    """Report whether ``url`` contains one of the ``bad_words`` substrings.

    Returns:
        ``(True, word)`` for the first entry of ``bad_words`` found in ``url``,
        otherwise ``(False, None)``.
    """
    # NOTE(review): this function is defined again in a later cell; when the
    # notebook runs top to bottom, that later definition replaces this one.
    hit = next((word for word in bad_words if word in url), None)
    if hit is None:
        return False, None
    return True, hit

def get_infringing_urls():
    """Return the unique infringing URLs stored in ``destFile``, in first-seen order.

    The file is parsed with ``csv.reader`` so that quoted fields (for example
    a sender name containing a comma) do not shift the columns. The column is
    looked up by name in ``fieldnames`` rather than by a hard-coded index.

    Returns:
        list[str]: de-duplicated URLs from the ``infringing_urls`` column. The
        header row, rows that are too short and empty entries are skipped.
    """
    # NOTE(review): this function is defined again in a later cell; when the
    # notebook runs top to bottom, that later definition replaces this one.
    url_col = fieldnames.index("infringing_urls")

    urls = []
    with open(destFile, newline="") as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row; tolerate an empty file
        for row in rows:
            if len(row) <= url_col:
                continue
            # Several URLs are stored in one cell, separated by ';'.
            urls.extend(u for u in row[url_col].split(";") if u)

    # dict.fromkeys removes duplicates while keeping the original order.
    return list(dict.fromkeys(urls))

def save_inf_urls():
    """Append infringing URLs from ``destFile`` that were not exported before to ``infringing``.

    URLs already exported are remembered in ``infringing.pkl`` so that repeated
    runs only append new ones. The ``infringing_urls`` column is located by name
    in ``fieldnames`` and the file is parsed with ``csv.reader`` so quoted fields
    containing commas do not shift the columns.

    Side effects:
        Appends one row per new URL to the ``infringing`` CSV file, rewrites
        ``infringing.pkl`` and prints the number of URLs saved.
    """
    # NOTE(review): this function is defined again in a later cell; when the
    # notebook runs top to bottom, that later definition replaces this one.
    url_col = fieldnames.index("infringing_urls")

    # SECURITY: pickle.load can execute arbitrary code; only load an
    # infringing.pkl that this notebook produced itself.
    saved_urls = set()
    if exists("infringing.pkl"):
        with open("infringing.pkl", "rb") as pkl:
            saved_urls = pickle.load(pkl)

    found_urls = set()
    with open(destFile, newline="") as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row; tolerate an empty file
        for row in rows:
            if len(row) > url_col:
                # Several URLs are stored in one cell, separated by ';'.
                found_urls.update(u for u in row[url_col].split(";") if u)

    # Sorted so the export order is reproducible between runs.
    new_urls = sorted(found_urls.difference(saved_urls))

    # newline='' as required by the csv module (avoids blank rows on Windows).
    with open(infringing, "a", newline="") as f:
        writer_obj = csv.writer(f)
        for u in new_urls:
            writer_obj.writerow([u])
    saved_urls.update(new_urls)

    with open("infringing.pkl", "wb") as pkl:
        pickle.dump(saved_urls, pkl)
    print("Saved " + str(len(new_urls)) + " Infringing Urls")

In [None]:
def contains_bad_word(url):
    """Look for a blogging-platform keyword inside ``url``.

    Returns:
        A ``(found, keyword)`` tuple: ``(True, keyword)`` for the first entry of
        ``bad_words`` that is a substring of ``url``, else ``(False, None)``.
    """
    result = (False, None)
    for keyword in bad_words:
        if keyword in url:
            result = (True, keyword)
            break
    return result

def get_infringing_urls():
    """Return the unique infringing URLs stored in ``destFile``, in first-seen order.

    The file is parsed with ``csv.reader`` so that quoted fields (for example
    a sender name containing a comma) do not shift the columns. The column is
    looked up by name in ``fieldnames`` rather than by a hard-coded index.

    Returns:
        list[str]: de-duplicated URLs from the ``infringing_urls`` column. The
        header row, rows that are too short and empty entries are skipped.
    """
    url_col = fieldnames.index("infringing_urls")

    urls = []
    with open(destFile, newline="") as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row; tolerate an empty file
        for row in rows:
            if len(row) <= url_col:
                continue
            # Several URLs are stored in one cell, separated by ';'.
            urls.extend(u for u in row[url_col].split(";") if u)

    # dict.fromkeys removes duplicates while keeping the original order.
    return list(dict.fromkeys(urls))

def save_inf_urls():
    """Append infringing URLs from ``destFile`` that were not exported before to ``infringing``.

    URLs already exported are remembered in ``infringing.pkl`` so that repeated
    runs only append new ones. The ``infringing_urls`` column is located by name
    in ``fieldnames`` and the file is parsed with ``csv.reader`` so quoted fields
    containing commas do not shift the columns.

    Side effects:
        Appends one row per new URL to the ``infringing`` CSV file, rewrites
        ``infringing.pkl`` and prints the number of URLs saved.
    """
    url_col = fieldnames.index("infringing_urls")

    # SECURITY: pickle.load can execute arbitrary code; only load an
    # infringing.pkl that this notebook produced itself.
    saved_urls = set()
    if exists("infringing.pkl"):
        with open("infringing.pkl", "rb") as pkl:
            saved_urls = pickle.load(pkl)

    found_urls = set()
    with open(destFile, newline="") as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row; tolerate an empty file
        for row in rows:
            if len(row) > url_col:
                # Several URLs are stored in one cell, separated by ';'.
                found_urls.update(u for u in row[url_col].split(";") if u)

    # Sorted so the export order is reproducible between runs.
    new_urls = sorted(found_urls.difference(saved_urls))

    # newline='' as required by the csv module (avoids blank rows on Windows).
    with open(infringing, "a", newline="") as f:
        writer_obj = csv.writer(f)
        for u in new_urls:
            writer_obj.writerow([u])
    saved_urls.update(new_urls)

    with open("infringing.pkl", "wb") as pkl:
        pickle.dump(saved_urls, pkl)
    print("Saved " + str(len(new_urls)) + " Infringing Urls")

In [None]:
def _blog_root(copyurl, platform):
    """Cut ``copyurl`` down to the part that is stored in the bad-URL set.

    The URL is split on "/" and a platform-specific number of leading segments
    is kept (``platform`` is the keyword returned by ``contains_bad_word``).
    """
    splits = copyurl.split("/")

    if platform == "tumblr":
        if len(splits) < 4:
            return copyurl
        if splits[3] == "blog":
            return "/".join(splits[:6])
        if splits[3] == "post":
            return "/".join(splits[:3])
        return "/".join(splits[:4])

    if platform == "wordpress":
        if splits[2] == "wordpress.com":
            # URLs hosted on wordpress.com itself: drop only the last segment.
            return "/".join(splits[:-1])
        return "/".join(splits[:3])

    if platform == "issuu":
        return "/".join(splits[:4])

    # Every other platform: scheme + host only.
    return "/".join(splits[:3])


def main():
    """Search Lumen for each pending URL and append the matching notices to ``destFile``.

    Behaviour by ``GET_URLS``:
        1 -- search the entries of ``init_url``;
        2 -- search the URLs returned by ``get_infringing_urls()``;
        3 -- search the bad URLs stored in ``bad_urls.pkl``;
        4 -- call ``save_inf_urls()`` and return (only when ``destFile`` exists).
    When ``destFile`` does not exist yet the search always starts from ``init_url``.

    For every URL the result pages are fetched one by one until a page without
    notices is returned. A work of a notice is written as a CSV row when the
    notice id is new, its type is "DMCA", its score is at least 20 and, with
    ``SAVE_ONLY_BAD_URLS`` enabled, one of its copyrighted URLs contains a bad word.

    Side effects:
        Appends rows to ``destFile`` (creating it with a header row if needed),
        rewrites ``bad_urls.pkl`` and ``read_urls.pkl`` after every successfully
        fetched page, sleeps one second before each request and prints progress.
    """
    min_score = 20        # notices scoring below this are ignored
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run

    if GET_URLS == 4 and exists(destFile):
        save_inf_urls()
        # Return instead of exit(): exit() would shut down the Jupyter kernel.
        return

    read_url = set()
    bad_urls = set()

    # SECURITY: pickle.load can execute arbitrary code; only load .pkl files
    # that this notebook produced itself.
    # Load the list of urls already read
    if exists("read_urls.pkl"):
        with open("read_urls.pkl", "rb") as pkl:
            read_url = pickle.load(pkl)

    # Load the list of bad urls found
    if exists("bad_urls.pkl"):
        with open("bad_urls.pkl", "rb") as pkl:
            bad_urls = pickle.load(pkl)

    # Mode selection
    if not exists(destFile) or GET_URLS == 1:
        urls = init_url
    elif GET_URLS == 2:
        urls = get_infringing_urls()
    elif exists("bad_urls.pkl") and GET_URLS == 3:
        urls = bad_urls
    else:
        urls = init_url

    # Counters
    cnt_inserted = 0
    cnt_total = 0

    # Without a destination file the previous bookkeeping is discarded.
    destFExist = exists(destFile)
    if not destFExist:
        bad_urls = set()
        read_url = set()

    # Ids of the notices already stored; a set keeps the membership test cheap.
    ids = set()
    if destFExist:
        with open(destFile, newline='') as f:
            rows = csv.reader(f)
            next(rows, None)  # skip the header row; tolerate an empty file
            for row in rows:
                if not row:
                    continue
                ids.add(int(row[0]))
                # The total number of rows starts from the rows already in destFile
                cnt_total += 1

    # Open destFile. If it does not exist it is created automatically
    with open(destFile, 'a', newline='') as f:
        writer_obj = writer(f)
        # If the file didn't exist initialize the field names row
        if not destFExist:
            writer_obj.writerow(fieldnames)

        # Remove the urls already read (and the ones to skip) from the list of
        # urls, so that only unread urls are searched.
        read_url = set(read_url).union(to_skip)
        pending = list(set(urls) - read_url)

        for url in pending:
            print("########")
            print("Start fetching data about "+url)

            page_num = 1

            while True:

                time.sleep(1)

                # Let requests build and URL-encode the query string; the token
                # is deliberately left out of the printed query so it does not
                # end up in the notebook output.
                params = {
                    typeSearch: url,
                    "term-require-all": "true",
                    "authentication_token": APIkey,
                    "page": page_num,
                }
                print(endpoint+typeSearch+"="+url+"&term-require-all=true"+"&page="+str(page_num))

                response = requests.get(endpoint, params=params, headers=headers,
                                        timeout=request_timeout)
                print(response)

                # A failed request carries no usable data: stop paging this url
                # and do not mark it as read, so a later run can retry it.
                if response.status_code != 200:
                    print("Request failed with status " + str(response.status_code)
                          + ", skipping the remaining pages of " + url)
                    break

                notices = response.json().get('notices', [])

                print("fetching relevant data")
                for notice in notices:
                    for work in notice.get("works", []):
                        try:
                            # Store just the notices with unseen id, type "DMCA"
                            # and a high enough score
                            if not (notice.get("id") not in ids
                                    and notice.get("type") == "DMCA"
                                    and notice.get("score") >= min_score):
                                continue

                            copyrighted_urls = work.get("copyrighted_urls")
                            infringing_urls = work.get("infringing_urls")

                            # Skip works that miss one of the two url lists
                            if copyrighted_urls is None or infringing_urls is None:
                                continue

                            # Keep the first copyrighted url that passes the filter.
                            copyrighted = None
                            for entry in copyrighted_urls:
                                copyurl = entry.get("url")
                                cbw, platform = contains_bad_word(copyurl)
                                if SAVE_ONLY_BAD_URLS and not cbw:
                                    continue
                                copyrighted = copyurl
                                # Remember the blog a bad url belongs to.
                                if cbw:
                                    bad_urls.add(_blog_root(copyurl, platform))
                                break

                            if copyrighted is None:
                                continue

                            # All the infringing urls of the work, ';'-separated.
                            infringing_joined = ";".join(
                                entry.get("url") for entry in infringing_urls
                            )

                            writer_obj.writerow([
                                notice.get("id"),
                                notice.get("type"),
                                notice.get("date_received")[0:10],
                                notice.get("sender_name"),
                                notice.get("jurisdictions")[0].upper(),
                                copyrighted,
                                infringing_joined,
                                url,
                                notice.get("score"),
                            ])
                            # Remember the id so the notice is not stored twice
                            ids.add(notice.get("id"))
                            cnt_inserted += 1
                        except Exception as e:
                            # Best effort: a malformed notice must not stop the run.
                            print("Error writing to CSV file:", e)

                read_url.add(url)

                # Store in .pkl files the read urls and the bad urls
                with open('bad_urls.pkl', 'wb') as pkl:
                    pickle.dump(bad_urls, pkl)
                with open('read_urls.pkl', 'wb') as pkl:
                    pickle.dump(read_url, pkl)

                # An empty page means there is nothing left for this url.
                if len(notices) == 0:
                    break
                page_num += 1
        cnt_total += cnt_inserted

    print("Total number of rows: " + str(cnt_total))
    print("Inserted rows: "+ str(cnt_inserted))

# Run the scraper with the mode and filters configured in the cells above.
main()
