# Handling ABC Articles

In [1]:
from unstructured.partition.auto import partition
from unstructured.documents.elements import *
from unstructured.staging.weaviate import create_unstructured_weaviate_class, stage_for_weaviate

import weaviate
from weaviate.util import generate_uuid5

from dotenv import load_dotenv
import os
import json
from datetime import date
import tqdm
load_dotenv()

True

In [2]:
weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")
openai_api_key=os.getenv("OPENAI_API_KEY")

## Weaviate schemas
Schemas are patterns for data. The template that exists is a good base but this layer is ripe for customization. 

Allowing user input here can assist with searching later on. The recommendation engine can use tags to help put useful information in front of me. Auto-ingesting data could also use tags for reliable sources (e.g. #media)

A blank text box (upload note - i like the name reverie) which would be useful for experimenting and curating a database.

What follows is the definition a generalized 'unstructured document' class for Weaviate. Once this is working, next step is to make one for a specific use-case, e.g. abc online articles. 

In [3]:
article_url = "https://www.abc.net.au/news/2023-07-11/covid-response-shows-indigenous-voice-needed-fiona-stanley-says/102582244"
import requests
from unstructured.partition.auto import partition
from unstructured.staging.weaviate import stage_for_weaviate
from unstructured.partition.html import partition_html

# Fetch data from URL
html_content = requests.get(article_url).text


# Assuming html_content is the HTML content you got from the URL
elements = partition_html(text=html_content)
data_objects = stage_for_weaviate(elements)

In [4]:
data_objects

[{'filetype': 'text/html',
  'page_number': 1,
  'text': "Indigenous community's handling of COVID shows why Voice to Parliament needed, Fiona Stanley says",
  'category': 'UncategorizedText'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'By Daryna Zadvirna',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'Posted',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'updated',
  'category': 'NarrativeText'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'ABC News: Daryna Zadvirna)',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'Help keep family & friends informed by sharing this article',
  'category': 'NarrativeText'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'abc.net.au/news/covid-response-shows-indigenous-voice-needed-fiona-stanley-says/102582244',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'Link copied',
  '

In [5]:
abc_class = {
    'class': 'ABCNewsArticle',
    'description': 'Class for ABC News articles',
    'properties': [
        {'name': 'text', 'dataType': ['text']},
        {'name': 'category', 'dataType': ['text']},
        {'name': 'article_name', 'dataType': ['text']},
        {'name': 'date', 'dataType': ['text']},
        {'name': 'url', 'dataType': ['text']},
        {'name': 'tags', 'dataType': ['text']},
        {'name': 'upload_note', 'dataType': ['text']},
    ],
    'vectorizer': 'text2vec-openai', 
    "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False
        }
    },

}


client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key),
    additional_headers= {
        "X-OpenAI-Api-Key": openai_api_key,
    }
)


In [6]:
unstructured_class = {
    'class': 'UnstructuredDocument',
    'description': 'General class for all documents (todo: add more specific classes)',
    'properties': [
        {'name': 'text', 'dataType': ['text']},
        {'name': 'category', 'dataType': ['text']},
        {'name': 'filename', 'dataType': ['text']},
        {'name': 'file_directory', 'dataType': ['text']},
        {'name': 'date', 'dataType': ['text']},
        {'name': 'filetype', 'dataType': ['text']},
        {'name': 'attached_to_filename', 'dataType': ['text']},
        {'name': 'page_number', 'dataType': ['int']},
        {'name': 'page_name', 'dataType': ['text']},
        {'name': 'url', 'dataType': ['text']},
        {'name': 'sent_from', 'dataType': ['text']},
        {'name': 'sent_to', 'dataType': ['text']},
        {'name': 'subject', 'dataType': ['text']},
        {'name': 'header_footer_type', 'dataType': ['text']},
        {'name': 'text_as_html', 'dataType': ['text']},
        {'name': 'regex_metadata', 'dataType': ['text']},
        {'name': 'tags', 'dataType': ['text']},
        {'name': 'upload_note', 'dataType': ['text']},
    ],
    'vectorizer': 'text2vec-openai', 
    "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False
        }
    },

}

## Modifying the schema
Below code blocks are used to update the schema in Weaviate. Don't need to run this every time.

### Create schema with the defined classes

In [7]:
schema = {"classes": [unstructured_class, abc_class]}
# add the ABCNewsArticle class to the schema
#schema["classes"].append(abc_class)
client.schema.create(schema)

### Print defined schemas

In [21]:

schema = {"classes": [abc_class]}

print(schema)
client.schema.create(schema)


{'classes': [{'class': 'ABCNewsArticle', 'description': 'Class for ABC News articles', 'properties': [{'name': 'text', 'dataType': ['text']}, {'name': 'category', 'dataType': ['text']}, {'name': 'article_name', 'dataType': ['text']}, {'name': 'date', 'dataType': ['text']}, {'name': 'url', 'dataType': ['text']}, {'name': 'tags', 'dataType': ['text']}, {'name': 'upload_note', 'dataType': ['text']}], 'vectorizer': 'text2vec-openai', 'moduleConfig': {'text2vec-openai': {'vectorizeClassName': False}}}]}


### Delete all classes in schema

In [11]:
schema = client.schema.get()

for class_info in schema['classes']:
    class_name = class_info['class']
    client.schema.delete_class(class_name)

### Show properties of the first class in the schema

In [8]:
schema = client.schema.get()
schema['classes'][1]

{'class': 'ABCNewsArticle',
 'description': 'Class for ABC News articles',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'text2vec-openai': {'model': 'ada',
   'modelVersion': '002',
   'type': 'text',
   'vectorizeClassName': False}},
 'multiTenancyConfig': {'enabled': False},
 'properties': [{'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-openai': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'text',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-openai': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'category',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-openai': {'skip'

## Ingest document data using Unstructured
Unstructured allows essentially any file to be uploaded and text data extracted. Testing on documents in `../data/`.

In [16]:
doc_elements = partition("../data/Politics and the English Language - George Orwell.pdf")
data_objects = stage_for_weaviate(doc_elements)

for key in ['filename', 'file_directory', 'filetype', 'page_number', 'text', 'category']:
    print("{0}: {1}".format(key, data_objects[0][key]))

filename: Politics and the English Language - George Orwell.pdf
file_directory: ../data
filetype: application/pdf
page_number: 1
text: Politics and the English Language - George Orwell
category: Title


In [25]:
data_objects[0].keys()

dict_keys(['filetype', 'page_number', 'text', 'category'])

Not sure how much of the above data actually needs to be stored. Store it all for now, but there's likely more interesting metadata to be added here. Examples might be a sentiment analysis score, GPT-generated summary, external links and internal links, etc. 

Upload to Weaviate with the uploaded data fitting the defined schema:

In [9]:
# show the first data object of UncategorizedTest 
data_objects[0]['text']

"Indigenous community's handling of COVID shows why Voice to Parliament needed, Fiona Stanley says"

In [10]:
upload_note = "Fiona Stanley praises the Aboriginal response to COVID-19, calling it the best in the world and a strong argument for an Indigenous Voice to Parliament. She highlighted that contrary to expectations, there were significantly fewer COVID-19 cases among Indigenous Australians compared to their non-Indigenous counterparts at the start of the pandemic. Stanley attributes this success to the high level of input and self-determination in health responses during the pandemic, stating that the Aboriginal response is the best example of why a Voice is going to make such a difference."
tags = "voice to parliament, covid-19"


with client.batch(batch_size=10) as batch:
    for i, d in enumerate(data_objects):  
        properties = {
            'category': d['category'],
            'text': d['text'],
            'article_name' : data_objects[0]['text'],
            'url': article_url,
            'filetype': d['filetype'],
            'date': date.today().strftime("%Y-%m-%d"),
            'upload_note': upload_note, # testing
            'tags': tags # testing
        }
        batch.add_data_object(
            properties,
            'ABCNewsArticle',
            uuid=generate_uuid5(properties),
        )

## Querying Weaviate
Once the data is in Weaviate, a NearText search (to research) can be performed to find concepts similar to user input:

In [35]:
nearText = {"concepts": ["renters", "cost of living"]}

client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key),
    additional_headers= {
        "X-OpenAI-Api-Key": openai_api_key,
    }
)


response = (
    client.query  # start a new query
    .get("ABCNewsArticle", ["text", "article_name", "upload_note", "tags"])  # get objects of the "UnstructuredDocument" class and retrieve their "text" property
    .with_near_text(nearText)  # find objects that are semantically similar to the text in "nearText"
    .with_limit(6)  # limit the results to the top 2 most similar objects
    #.with_where({
    #    "path": ["tags"],
    #    "operator": "Equal",
    #    "valueString": "updatedtest"
    #})
    .do()  # execute the query
)



In [12]:
client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key),
    additional_headers= {
        "X-OpenAI-Api-Key": openai_api_key,
    }
)

response = (
    client.query  # start a new query
    .get("ABCNewsArticle", ["text", "article_name", "category"])  # limit the results to the top 2 most similar objects
    .with_where({
        "path": ["tags"],
        "operator": "Equal",
        "valueString": "test"
    })
    .do()  # execute the query
)

In [20]:
# extract text from response
print("title: " + response['data']['Get']['ABCNewsArticle'][0]['text'])
for i, r in enumerate(response['data']['Get']['ABCNewsArticle']):
    print("\ncategory: {0}\ncontent: {1}"
    .format(r['category'], r['text']))

title: Former Australian of the Year Fiona Stanley believes Aboriginal people had the best response to COVID-19 "in the world", and says it proves an Indigenous Voice to Parliament will improve health outcomes for First Nations people.

category: NarrativeText
content: Former Australian of the Year Fiona Stanley believes Aboriginal people had the best response to COVID-19 "in the world", and says it proves an Indigenous Voice to Parliament will improve health outcomes for First Nations people.

category: UncategorizedText
content: Indigenous community's handling of COVID shows why Voice to Parliament needed, Fiona Stanley says

category: NarrativeText
content: Help keep family & friends informed by sharing this article

category: Title
content: Copy link

category: NarrativeText
content: Link copied

category: Title
content: By Daryna Zadvirna

category: Title
content: abc.net.au/news/covid-response-shows-indigenous-voice-needed-fiona-stanley-says/102582244

category: Title
content: AB