In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv

import os
import pandas as pd
import openai

# Locate the .env file and load it, then hand the key to the OpenAI client.
# `os.getenv` yields None when OPENAI_KEY is not set.
env_file = find_dotenv("./.env")
load_dotenv(env_file)

openai.api_key = os.getenv("OPENAI_KEY")

In [2]:
# Parse the locally saved page. The encoding is passed explicitly so decoding
# does not depend on the platform's default locale.
# NOTE(review): assumes the page was saved as UTF-8 -- confirm for this file.
with open("./Royal.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# Every victim entry is a <div class="post active"> card.
cards = soup.find_all('div', class_='post active')

# One record per card: posting date, organisation name (<h2>) and the
# free-text description (<main>).
data_list = []
for card in cards:
    # Look the inner "card" container up once; both the <h2> and <main> live in it.
    card_body = card.find('div', class_='card')
    data_list.append({
        'Date': card.find('div', class_='time').text,
        'Company': card_body.find('h2').text,
        'Info': card_body.find('main').text,
    })

# Name the columns explicitly so the frame keeps its schema even when no
# cards were found (later cells index df['Company'] / df['Info']).
df = pd.DataFrame(data_list, columns=['Date', 'Company', 'Info'])

# Show the resulting table.
df

Unnamed: 0,Date,Company,Info
0,19July2023,Braintree Public Schools,FOR IMMEDIATE RELEASERoyal\n Data Services off...
1,11June2023,Tachi-S Engineering USA,"Global Seat System CreatorAt\n Tachi-S, we hel..."
2,9June2023,PENNCREST School District,PENNCREST\n School District provides resources...
3,26May2023,Volt,"Volt\n Information Sciences, Inc. provides sta..."
4,26May2023,AFG Holdings,"AFG\n Holdings, Inc. is a fully integrated OEM..."
...,...,...,...
103,12January2023,T A Supply,PROOFPACK - W-9 / internal documents
104,12January2023,Chinery and Douglas,PROOFPACK - working documents
105,12January2023,Ruhrpumpen,PROOF PACK - Passports \ Finance \ Internal do...
106,6January2023,LEK / HABO,Lek/Habo\n realiseert al tientallen jaren met ...


In [3]:
# Closed set of industry labels offered to the model. The list is interpolated
# into the classification prompt as-is, so the model sees its Python repr.
industry_categories = [
    "Technology and IT",
    "Healthcare",
    "Finance and Insurance",
    "Manufacturing",
    "Retail and Commerce",
    "Government and Public Sector",
    "Energy and Utilities",
    "Media and Entertainment",
    "Agriculture and Food",
    "Automotive and Transportation",
    "Legal and Professional Services",
    "Environment and Natural Resources",
    "Sports and Recreation",
    "Research and Biotechnology",
    "Education",
    "Security and Surveillance"
]

# Closed set of labels for the kind of data that was stolen; interpolated into
# the classification prompt the same way as the industry list.
data_categories = [
    "Intellectual Property",
    "Financial Data",
    "Customer Data",
    "Employee Data",
    "Strategic Plans",
    "Sensitive Communications",
    "Healthcare Records",
    "Classified Information",
    "Research and Development Data",
    "Legal and Compliance Information"
]

In [10]:
def build_prompt(company, info):
    """Return the classification prompt for one organisation.

    Parameters
    ----------
    company : the organisation name (the 'Company' column).
    info    : the free-text description scraped for it (the 'Info' column).

    The prompt asks for a reply of the form
    "<Country>|<Industry>|[<What was stolen>]", with "UNK" for unknown fields,
    and embeds `industry_categories` and `data_categories` as the allowed labels.
    """
    return f'''

    I will be providing you with a body of text. The text starts off with the name of the organization, followed by information related to the organization.

    Your job is to extract 3 different information from the given text.

    1. Location of the organization (e.g. Country associated with it, e.g. Philadelphia = USA)
    2. Industry of the organization
    3. What was stolen

    For the 2nd answer, the list of industries you can choose from is given below. Stricly only choose 1 of the category that is given below.

    Industries to choose from: "{industry_categories}"

    For the 3rd answer, the categories regarding what was stolen, is also given below. Stricly only choose from the categories given below.

    Stolen categories: "{data_categories}"

    In the event where there is no information to answer a certain question, give your answer as "UNK". But try your best to give an answer (unless there is really none).

    I want your answer to be in a fixed format: "<Country>|<Industry>|[<What was stolen>]". You should strictly follow this format for your answer. It is also very important to encapsulate the third answer in square brackets [].
    An example of your answer would be "Malaysia|Technology and IT|[Intellectual Property, Sensitive Communications]".

    The text is given below,
    "{company}, {info}"
    '''


# Raw model reply per company name.
# NOTE: the dict is keyed by company name, so rows that share a name collapse
# into a single entry (the reply for the later row overwrites the earlier one).
ans = {}

# Walk the two columns in lockstep instead of chained df[col][idx] lookups.
for company, info in zip(df['Company'], df['Info']):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that is able to read through a body of text, and extract required information"},
            {"role": "user", "content": build_prompt(company, info)},
        ],
    )

    # Strip surrounding whitespace: the downstream parser inspects the first
    # and last characters of the reply to remove the square brackets.
    ans[company] = response['choices'][0]['message']['content'].strip()

In [11]:
# Dump the raw model replies (company name -> reply string) for a manual check.
print(ans)

{'Braintree Public Schools': 'UNK|UNK|[UNK]', 'Tachi-S Engineering USA': 'USA|Automotive and Transportation|UNK', 'PENNCREST School District': 'UNK|UNK|[UNK]', 'Volt': 'UNK|Technology and IT|[UNK]', 'AFG Holdings': 'UNK|Manufacturing|UNK', 'Mitutoyo': 'UNK|Manufacturing|[UNK]', 'The Best Connection': 'United Kingdom|Retail and Commerce|[UNK]', 'DirectViz Solutions': 'UNK|Technology and IT|UNK', 'BM Precision': 'UNK|Manufacturing|[UNK]', 'Groupe Sovitrat Interim and Recrutement': 'France|Human Resources|[UNK]', 'Grange Packing Solutions': 'UNK|Manufacturing|UNK', 'Ruhrpumpen': 'UNK|UNK|[UNK]', 'Haworth Tompkins': 'UNK|UNK|UNK', 'Colrich': 'UNK|UNK|UNK', 'Coos Bay': 'UNK|UNK|[UNK]', 'Dotcom Distribution': 'UNK|UNK|UNK', 'Westside': 'UNK|UNK|UNK', 'TA Supply': 'UNK|UNK|UNK', 'Agostini Insurance Brokers': 'UNK|Finance and Insurance|UNK', 'Trinity Exploration and Production': 'UNK|Energy and Utilities|[UNK]', 'Chinery and Douglas': 'UNK|UNK|UNK', 'Livingston': 'UNK|UNK|[Employee Data, Finan

In [12]:
# Add three empty placeholder columns; a later cell fills them from the
# model replies stored in `ans`.
for new_column in ('Country', 'Industry', 'Stolen'):
    df[new_column] = None

df

Unnamed: 0,Date,Company,Info,Country,Industry,Stolen
0,19July2023,Braintree Public Schools,FOR IMMEDIATE RELEASERoyal\n Data Services off...,,,
1,11June2023,Tachi-S Engineering USA,"Global Seat System CreatorAt\n Tachi-S, we hel...",,,
2,9June2023,PENNCREST School District,PENNCREST\n School District provides resources...,,,
3,26May2023,Volt,"Volt\n Information Sciences, Inc. provides sta...",,,
4,26May2023,AFG Holdings,"AFG\n Holdings, Inc. is a fully integrated OEM...",,,
...,...,...,...,...,...,...
103,12January2023,T A Supply,PROOFPACK - W-9 / internal documents,,,
104,12January2023,Chinery and Douglas,PROOFPACK - working documents,,,
105,12January2023,Ruhrpumpen,PROOF PACK - Passports \ Finance \ Internal do...,,,
106,6January2023,LEK / HABO,Lek/Habo\n realiseert al tientallen jaren met ...,,,


In [14]:
def parse_answer(reply):
    """Split one "<Country>|<Industry>|[<Stolen>]" reply into its three fields.

    Returns a (country, industry, stolen) tuple. Whitespace around each field
    is trimmed and one pair of enclosing square brackets is removed from the
    third field. Fields missing from a malformed reply, and every field when
    `reply` is not a string (no answer for that company), come back as None.
    """
    if not isinstance(reply, str):
        return (None, None, None)

    # Keep the first three fields and pad short replies instead of raising
    # IndexError on them.
    fields = [part.strip() for part in reply.split("|")][:3]
    fields += [None] * (3 - len(fields))
    country, industry, stolen = fields

    if stolen is not None:
        if stolen.startswith("["):
            stolen = stolen[1:]
        if stolen.endswith("]"):
            stolen = stolen[:-1]

    return (country, industry, stolen)


# Look every row's answer up by company name. `ans` can hold fewer entries
# than `df` has rows (duplicate names share one key), so pairing rows and
# answers by position would leave every row after the first duplicate empty.
result_columns = ['Country', 'Industry', 'Stolen']
parsed = pd.DataFrame(
    [parse_answer(ans.get(company)) for company in df['Company']],
    index=df.index,
    columns=result_columns,
)

# Whole-column assignment on `df` itself; chained df[col][idx] = value is not
# guaranteed to write through to the frame.
df[result_columns] = parsed

print(len(ans))

105


In [15]:
# Lift the display limits so the whole table is rendered:
# None is passed for both the row limit and the column limit.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

df

Unnamed: 0,Date,Company,Info,Country,Industry,Stolen
0,19July2023,Braintree Public Schools,FOR IMMEDIATE RELEASERoyal\n Data Services off...,UNK,UNK,UNK
1,11June2023,Tachi-S Engineering USA,"Global Seat System CreatorAt\n Tachi-S, we hel...",USA,Automotive and Transportation,UNK
2,9June2023,PENNCREST School District,PENNCREST\n School District provides resources...,UNK,UNK,UNK
3,26May2023,Volt,"Volt\n Information Sciences, Inc. provides sta...",UNK,Technology and IT,UNK
4,26May2023,AFG Holdings,"AFG\n Holdings, Inc. is a fully integrated OEM...",UNK,Manufacturing,UNK
5,26May2023,Mitutoyo,Mitutoyo\n is one of the world's leading manuf...,UNK,Manufacturing,UNK
6,26May2023,The Best Connection,"Headquartered\n in Bromsgrove, United Kingdom,...",United Kingdom,Retail and Commerce,UNK
7,26May2023,DirectViz Solutions,Directviz\n Solutions (DVS) provides the infor...,UNK,Technology and IT,UNK
8,26May2023,BM Precision,"B\n & M Precision, Inc. is capable of producin...",UNK,Manufacturing,UNK
9,26May2023,Groupe Sovitrat Interim and Recrutement,Groupe\n Sovitrat Interim & Recrutement is a c...,France,Human Resources,UNK


In [16]:
# Write the enriched table (index included) to an Excel workbook in the working directory.
df.to_excel(excel_writer='output.xlsx')