In [1]:
import time
from datetime import datetime as dt
from datetime import timedelta
from datetime import timezone
from email.utils import format_datetime
from xml.sax.saxutils import escape

import pandas as pd
import requests #http://docs.python-requests.org/en/master/ 
from bs4 import BeautifulSoup

In [2]:
# Listing page that is scraped for upcoming USPTO events.
url = "https://www.uspto.gov/about-us/events"

# Identify this script (and a contact address) in every request that is sent.
headers = {
    "User-Agent": (
        "kekoziar_ptrc/0.0 "
        "(Language=Python/3.9.7; Host=https://github.com/kekoziar/ptrc; email=katherine.koziar@ucr.edu) "
        "python-requests/2.26.0"
    )
}

# Empty table that the scraped events are added to, one row per event.
events = pd.DataFrame(
    columns=[
        'title',
        'link',
        'pubDate',
        'description',
        'page_owner',
        'date_index',
    ]
)

In [3]:
# Download the events listing page.
# The timeout keeps a stalled connection from hanging the notebook indefinitely, and
# raise_for_status() stops the run on an HTTP error response instead of letting the
# error page be parsed as if it were the events table.
webpage = requests.get(url, headers=headers, timeout=30)
webpage.raise_for_status()

In [4]:
# Parse the downloaded listing page and gather every table row (<tr>) it contains.
soup = BeautifulSoup(markup=webpage.content, features='html.parser')
tr = soup.find_all(name='tr')

# Create dataframe

In [5]:
# Collect every listed event dated tomorrow or later by visiting its detail page.
# Rows are walked from the bottom of the listing upwards, and the walk stops at the
# first dated row that falls before the cutoff.
url_start = "https://www.uspto.gov"

# Cutoff as a YYYYMMDD integer. It is tomorrow's date, so events happening today are excluded.
current_date = int((dt.now() + timedelta(days=1)).strftime("%Y%m%d"))

workshop_frames = []  # one single-row DataFrame per qualifying event

# Iterating over reversed(tr) needs no starting sentinel date and ends cleanly at the top
# row even when every listed event is still upcoming.
for row in reversed(tr):
    string = str(row)
    date_index = string.find("date=")
    if date_index == -1:
        continue  # no date marker in this row, so it is not treated as an event

    # The 8-digit date starts 6 characters after the match: the 5 characters of `date=`
    # plus one more, presumably an opening quote -- TODO confirm against the page markup.
    date_index += 6
    date = int(string[date_index:date_index+8])
    if date < current_date:
        break  # reached an event before the cutoff; rows above it are not visited

    # The first link in the row is a site-relative path to the event's own page.
    workshop_url = row.find(href=True)['href']

    workshop_page = requests.get(url_start+workshop_url, headers=headers, timeout=30)
    workshop_page.raise_for_status()
    soup_workshop = BeautifulSoup(workshop_page.content, 'html.parser')

    workshop_title = soup_workshop.find('meta', {"name":"dcterms.title"})
    workshop_pageowner = soup_workshop.find('meta', {"name":"uspto.pageowner"})

    # Keep only the first line of the description. split() is used rather than slicing up
    # to find('\n'), because find() returns -1 when there is no newline and the slice
    # would then drop the last character.
    workshop_description = soup_workshop.select_one('var.atc_description').text
    workshop_description = workshop_description.split('\n', 1)[0].strip().replace('\xa0', ' ')
    workshop_description = workshop_description.replace('\u200b','')  # zero-width space

    # The start time is written with a space between date and time; fromisoformat()
    # is given the "T"-separated form.
    workshop_time = soup_workshop.select_one('var.atc_date_start').text
    workshop_time = dt.fromisoformat(workshop_time.replace(" ", "T"))

    workshop_frames.append(pd.DataFrame({
        "title": [workshop_title["content"]],
        "link": [url_start+workshop_url],
        # NOTE(review): the "EST" label is hard-coded; confirm which zone the page's
        # start times are in and whether daylight time needs handling.
        "pubDate": [workshop_time.strftime("%a, %d %b %Y %X EST")],
        "description": [workshop_description],
        "page_owner": [workshop_pageowner["content"]],
        "date_index": [date]
    }))
    time.sleep(1)  # pause between detail-page requests

# Add all collected rows to the events table in a single concatenation.
events = pd.concat([events, *workshop_frames])

In [6]:
# Put the events in order of their YYYYMMDD integer key, then save the table as CSV.
events = events.sort_values(by="date_index")
events.to_csv("../data/events.csv")

# Create XML files

In [7]:
# Names of the RSS feeds that are generated, one output file per name.
xml_files = [
    "All",
    "Patent",
    "Trademark",
    "Other",
]

In [8]:
# Every feed begins with the same XML declaration and opening <rss>/<channel> tags.
_rss_preamble = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<rss version=\"2.0\">\n<channel>\n"

# Map each feed name to its own list of text fragments; later cells append to these lists.
xml_dict = {
    feed_name: [_rss_preamble]
    for feed_name in ("All", "Patent", "Trademark", "Other")
}

In [9]:
# Append the channel-level metadata (title, link, description, language, build date,
# webmaster) to every feed.
#
# The build date is taken once, in UTC, and rendered by format_datetime() as an RFC 2822
# date with an explicit numeric offset. This avoids attaching a fixed "PST" label to
# whatever the local clock happens to be and avoids the locale-dependent %X format.
build_date = format_datetime(dt.now(timezone.utc))

for category in xml_files:
    xml_dict[category].append("<title>USPTO "+category+" Events</title>\n<link>https://guides.lib.ucr.edu/c.php?g=932732</link>\n")
    xml_dict[category].append("<description>This feed provides easy access to USPTO "+category+" events.</description>\n<language>en-us</language>\n")
    xml_dict[category].append("<lastBuildDate>"+build_date+"</lastBuildDate>\n<webMaster>katherine.koziar@ucr.edu</webMaster>\n\n")

In [10]:
# Turn every event into an RSS <item> and file it under the "All" feed plus exactly one
# of the "Patent", "Trademark" or "Other" feeds, chosen by the event's page owner.
for event in events.itertuples(index=False):
    # The text comes from scraped pages and may contain &, < or >. escape() converts
    # those so the generated feed stays well-formed XML.
    title = "\n<item> <title>"+escape(event.title)+"</title>\n"
    link = "\t<link>"+escape(event.link)+"</link>\n"
    guid = "\t<guid>"+escape(event.link)+"</guid>\n"  # the event URL doubles as its unique id
    pubDate = "\t<pubDate>"+event.pubDate+"</pubDate>\n"
    description = "\t<description>"+escape(event.description)+"</description>\n"
    description = description.replace("’","'")  # typographic apostrophe -> plain ASCII
    item = title+link+guid+pubDate+description+"</item>\n\n"

    xml_dict["All"].append(item)

    # Page owners 'All Regions' and 'ecommerce' are filed under the Patent feed,
    # 'Trademarks' under the Trademark feed, and everything else under Other.
    if event.page_owner in ('All Regions', 'ecommerce'):
        xml_dict["Patent"].append(item)
    elif event.page_owner == 'Trademarks':
        xml_dict["Trademark"].append(item)
    else:
        xml_dict["Other"].append(item)


In [11]:
# Close each feed and write it to ../rss-<name>.xml, with the feed name lower-cased.
for category in xml_files:
    xml_dict[category].append("\n</channel>\n</rss>")
    # The fragments are joined with a single space between them.
    temp = " ".join(xml_dict[category])
    # The XML declaration states utf-8, so the file is written as utf-8 explicitly rather
    # than in the platform's default encoding. The with-block closes the file even if the
    # write fails.
    with open("../rss-"+category.lower()+".xml", "w", encoding="utf-8") as xml_file:
        xml_file.write(temp)

In [12]:
# TODO: read previously saved events from file, compare the events page against them to identify new events, and add those to the events dataframe
# use url