In [1]:
from openai import OpenAI
import time
import json
from nltk import tokenize
from difflib import get_close_matches
from datetime import datetime
from dotenv import load_dotenv
from dotenv import dotenv_values
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
# Load the settings stored in the local .env file; `config` is indexed by key
# further down (the OpenAI client reads config['OPENAI_KEY']).
config = dotenv_values(".env")

### Initialize Open AI client

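* Create a `.env` file in the notebook's working directory (do not store the key in the notebook itself) containing a line of the form `OPENAI_KEY=<your API key>`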

In [2]:
# Build the OpenAI client with the API key stored under OPENAI_KEY in `config`
client = OpenAI(api_key=config['OPENAI_KEY'])

### Load in our data on .Gov Domains

In [3]:
# Location of the .gov domain export (with WHOIS fields), relative to this notebook
domains_csv_path = '../R Scripts/Government Domains/Government Domains with WHOIS Data from Cybersecurity & Infrastructure Security Agency.csv'
gov_domains = pd.read_csv(domains_csv_path)
# Number of websites associated with each agency, showing the top 10
gov_domains['Agency'].value_counts().head(10)

Agency
Department of Health and Human Services    131
General Services Administration            106
Department of the Treasury                  94
Department of Justice                       79
Department of the Interior                  67
Department of Commerce                      66
Department of Energy                        65
Department of Homeland Security             44
Executive Office of the President           39
Department of Defense                       39
Name: count, dtype: int64

In [4]:
# Preview the first rows to check which columns were loaded
gov_domains.head()

Unnamed: 0,Domain.name,Domain.type,Agency,Organization.name,City,State,Security.contact.email,Domain.Name,Registrar.WHOIS.Server,Registrar.URL,...,Security.Postal.Code,Security.Country,Security.Phone,Security.Email,Name.Server,DNSSEC,Registrant.Fax,Admin.Fax,president,simple_date
0,achp.gov,Federal - Executive,Advisory Council on Historic Preservation,Advisory Council on Historic Preservation,Washington,DC,domainsecurity@achp.gov,achp.gov,whois.nic.gov,https://get.gov,...,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,domainsecurity@achp.gov,ns-605.awsdns-11.net,signedDelegation,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,Bill Clinton,1997-10-02
1,arc.gov,Federal - Executive,Appalachian Regional Commission,Appalachian Regional Commission,Washington,DC,(blank),arc.gov,whois.nic.gov,https://get.gov,...,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,karl.ns.cloudflare.com,unsigned,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,Bill Clinton,1997-10-02
2,asc.gov,Federal - Executive,Appraisal Subcommittee of the Federal Financia...,Appraisal Subcommittee,Washington,DC,(blank),asc.gov,whois.nic.gov,https://get.gov,...,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,edns4.ultradns.org,signedDelegation,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,Bill Clinton,1997-10-02
3,aftac.gov,Federal - Executive,Department of Defense,Air Force Technical Applications Center,Patrick AFB,FL,dco@aftac.gov,aftac.gov,whois.nic.gov,https://get.gov,...,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,dco@aftac.gov,dstork.aftac.gov,unsigned,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,Bill Clinton,1997-10-02
4,ameslab.gov,Federal - Executive,Department of Energy,Ames Laboratory,Ames,IA,cybersec@ameslab.gov,ameslab.gov,whois.nic.gov,https://get.gov,...,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,cybersec@ameslab.gov,scsds.ameslab.gov,signedDelegation,,,Bill Clinton,1997-10-02


### Create a base message prompt 

* The objective is to iterate through the .gov domains we collected and extract information from each one, so that we can analyze the similarity between websites and eliminate those that are no longer active. This is part of the data augmentation and cleaning step.

In [6]:
def process_response(record_data):
    """Ask the chat model to summarise one government domain record.

    The record is converted with ``str()`` and sent as the user message,
    preceded by a fixed system prompt that describes the JSON fields to return.
    The call goes through the module-level ``client`` and requests a JSON
    object response from the ``gpt-4o-mini`` model.

    Parameters
    ----------
    record_data : any
        The record to analyse (the caller passes a dict of domain fields plus
        the scraped page text under ``'html'``).

    Returns
    -------
    The message content of the first choice in the completion. It is returned
    as-is and is not parsed here.
    """
    # Fixed instructions for the model; kept verbatim because the wording is
    # part of the request that is sent.
    system_prompt = '''You are a helpful assistant that is assisting in analyzing the efficiency of the structure of government.
        You will be given a domain name, html associated with the domain, domain type, parent agency,
        organization name, city, state, date of creation of the website, and a last updated domain date. 
        If the website html is "No information found" assume that the organization is no longer active.
        If the website html points to another agency that original oranization name may be replaced by a new organization name.
        We want to get some key information for the organization and we want to create a set of keywords that can later be used to form a similarity matrix between organizations.
        You can use your own knowledge along with the information the user provides to generate your response. 
        Please return in a JSON format the following information:
        {
        original_organization_name: str,
        new_organization_name: str,
        organization_status: ['Active','Not Active'],
        summary_of_organization: str,
        ten_keywords_describing_organization: [str],
        estimated_funding:str,
        source_estimated_funding:str,
        estimated_number_of_employees:str,
        source_estimated_number_of_employees:str,
        }
        '''
    conversation = [
        {"role": "system", "content": system_prompt},
        {'role': 'user', 'content': str(record_data)},
    ]
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=conversation,
        response_format={"type": "json_object"},
    )
    return completion.choices[0].message.content

### Given the above prompt we will extract only the core columns we plan to feed into ChatGPT

In [7]:
# Core columns that are passed to the model for every domain
core_columns = [
    'Domain.name', 'Domain.type', 'Agency', 'Organization.name',
    'City', 'State', 'Creation.Date', 'Updated.Date',
]
# One dict per row of gov_domains, keyed by column name
records = gov_domains[core_columns].to_dict(orient='records')

### Now we will iterate through each domain, using requests and BS4 to scrape the text from the webpage. The webpage text will be combined with the above records and passed to ChatGPT. We will store the results in a list that we can later further clean and merge into the original data.

In [8]:
# Collects one model response per record, in record order (filled by the loop below)
store_responses = []

In [None]:
# Start after the last stored response: store_responses[i] always belongs to
# records[i], so re-running this cell after an interruption resumes where it
# stopped instead of appending duplicate responses from index 0.
for i in range(len(store_responses), len(records)):
    # Generate the https:// url to collect the webpage data
    url = 'https://' + records[i]['Domain.name']
    # Try to read the data
    try:
        # Timeout set to 5 seconds
        response = requests.get(url, timeout=5)
        # Read HTML into a BS4 object
        soup = BeautifulSoup(response.text, "lxml")
        # Keep only the body text and collapse every run of whitespace
        # (newlines, tabs, repeated spaces) into a single space to minimize the
        # tokens sent to OpenAI. Deleting double spaces outright would glue
        # neighbouring words together.
        website_text = ' '.join(soup.body.text.split())
    except Exception:
        # Best effort: any fetch/parse failure (timeout, connection error, page
        # without a <body>, ...) is recorded as no information for the domain.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt stop the loop.
        website_text = 'No information found'
    # Merge the page text into the existing record data we have
    records[i]['html'] = website_text
    # Feed the combined data into ChatGPT and store the response
    store_responses.append(process_response(records[i]))
    # Pause for an average of 6.5 seconds (random uniform between 3 and 10)
    time.sleep(np.random.uniform(3, 10))
    # Print status as the fraction of records completed (reaches 1.0 at the end)
    print((i + 1) / len(records))

0.0
0.00072992700729927
0.00145985401459854
0.0021897810218978104
0.00291970802919708
0.0036496350364963502
0.004379562043795621
0.0051094890510948905
0.00583941605839416
0.006569343065693431
0.0072992700729927005
