# Handling ABC Articles

In [1]:
from unstructured.partition.auto import partition
from unstructured.documents.elements import *
from unstructured.staging.weaviate import create_unstructured_weaviate_class, stage_for_weaviate

import weaviate
from weaviate.util import generate_uuid5

from dotenv import load_dotenv
import os
import json
from datetime import date
import tqdm
load_dotenv()

True

In [2]:
weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")
openai_api_key=os.getenv("OPENAI_API_KEY")

## Weaviate schemas
Schemas are patterns for data. The template that exists is a good base but this layer is ripe for customization. 

Allowing user input here can assist with searching later on. The recommendation engine can use tags to help put useful information in front of me. Auto-ingesting data could also use tags for reliable sources (e.g. #media)

A blank text box (upload note - i like the name reverie) which would be useful for experimenting and curating a database.

What follows is the definition a generalized 'unstructured document' class for Weaviate. Once this is working, next step is to make one for a specific use-case, e.g. abc online articles. 

In [7]:
article_url = "https://www.abc.net.au/news/2023-07-11/australian-renters-mould-exposure-impacting-mental-health/102563872?sf267830207=1"
import requests
from unstructured.partition.auto import partition
from unstructured.staging.weaviate import stage_for_weaviate
from unstructured.partition.html import partition_html

# Fetch data from URL
html_content = requests.get(article_url).text


# Assuming html_content is the HTML content you got from the URL
elements = partition_html(text=html_content)
data_objects = stage_for_weaviate(elements)

In [8]:
data_objects

[{'filetype': 'text/html',
  'page_number': 1,
  'text': 'Renters more likely than owner-occupiers to face health risks from house quality, expert says',
  'category': 'UncategorizedText'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'ABC RN',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'By Sam Nichols for The Money',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'Posted',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'ABC News: Jack Fisher)',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'Help keep family & friends informed by sharing this article',
  'category': 'NarrativeText'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'abc.net.au/news/australian-renters-mould-exposure-impacting-mental-health/102563872',
  'category': 'Title'},
 {'filetype': 'text/html',
  'page_number': 1,
  'text': 'Link copied',
  'category': 'N

In [32]:
abc_class = {
    'class': 'ABCNewsArticle',
    'description': 'Class for ABC News articles',
    'properties': [
        {'name': 'text', 'dataType': ['text']},
        {'name': 'category', 'dataType': ['text']},
        {'name': 'article_name', 'dataType': ['text']},
        {'name': 'date', 'dataType': ['text']},
        {'name': 'url', 'dataType': ['text']},
        {'name': 'tags', 'dataType': ['text']},
        {'name': 'upload_note', 'dataType': ['text']},
    ],
    'vectorizer': 'text2vec-openai', 
    "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False
        }
    },

}


client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key),
    additional_headers= {
        "X-OpenAI-Api-Key": openai_api_key,
    }
)


## Modifying the schema
Below code blocks are used to update the schema in Weaviate. Don't need to run this every time.

### Create schema with the defined classes

In [None]:
schema = {"classes": [unstructured_class]}
# add the ABCNewsArticle class to the schema
schema["classes"].append(abc_class)
client.schema.create(schema)

### Print defined schemas

In [21]:

schema = {"classes": [abc_class]}

print(schema)
client.schema.create(schema)


{'classes': [{'class': 'ABCNewsArticle', 'description': 'Class for ABC News articles', 'properties': [{'name': 'text', 'dataType': ['text']}, {'name': 'category', 'dataType': ['text']}, {'name': 'article_name', 'dataType': ['text']}, {'name': 'date', 'dataType': ['text']}, {'name': 'url', 'dataType': ['text']}, {'name': 'tags', 'dataType': ['text']}, {'name': 'upload_note', 'dataType': ['text']}], 'vectorizer': 'text2vec-openai', 'moduleConfig': {'text2vec-openai': {'vectorizeClassName': False}}}]}


### Delete all classes in schema

In [11]:
schema = client.schema.get()

for class_info in schema['classes']:
    class_name = class_info['class']
    client.schema.delete_class(class_name)

### Show properties of the first class in the schema

In [24]:
schema = client.schema.get()
schema['classes'][1]

{'class': 'ABCNewsArticle',
 'description': 'Class for ABC News articles',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'text2vec-openai': {'model': 'ada',
   'modelVersion': '002',
   'type': 'text',
   'vectorizeClassName': False}},
 'properties': [{'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-openai': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'text',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-openai': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'category',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-openai': {'skip': False,
     'vectorizePropertyName': Fals

## Ingest document data using Unstructured
Unstructured allows essentially any file to be uploaded and text data extracted. Testing on documents in `../data/`.

In [16]:
doc_elements = partition("../data/Politics and the English Language - George Orwell.pdf")
data_objects = stage_for_weaviate(doc_elements)

for key in ['filename', 'file_directory', 'filetype', 'page_number', 'text', 'category']:
    print("{0}: {1}".format(key, data_objects[0][key]))

filename: Politics and the English Language - George Orwell.pdf
file_directory: ../data
filetype: application/pdf
page_number: 1
text: Politics and the English Language - George Orwell
category: Title


In [25]:
data_objects[0].keys()

dict_keys(['filetype', 'page_number', 'text', 'category'])

Not sure how much of the above data actually needs to be stored. Store it all for now, but there's likely more interesting metadata to be added here. Examples might be a sentiment analysis score, GPT-generated summary, external links and internal links, etc. 

Upload to Weaviate with the uploaded data fitting the defined schema:

In [29]:
# show the first data object of UncategorizedTest 
data_objects[0]['text']

'Renters more likely than owner-occupiers to face health risks from house quality, expert says'

In [34]:
upload_note = "hello weaviate"
tags = "test, weaviate, python"


with client.batch(batch_size=10) as batch:
    for i, d in enumerate(data_objects):  
        properties = {
            'category': d['category'],
            'text': d['text'],
            'article_name' : data_objects[0]['text'],
            'url': article_url,
            'filetype': d['filetype'],
            'date': date.today().strftime("%Y-%m-%d"),
            'upload_note': upload_note, # testing
            'tags': tags # testing
        }
        batch.add_data_object(
            properties,
            'ABCNewsArticle',
            uuid=generate_uuid5(properties),
        )

## Querying Weaviate
Once the data is in Weaviate, a NearText search (to research) can be performed to find concepts similar to user input:

In [35]:
nearText = {"concepts": ["renters", "cost of living"]}

client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key),
    additional_headers= {
        "X-OpenAI-Api-Key": openai_api_key,
    }
)


response = (
    client.query  # start a new query
    .get("ABCNewsArticle", ["text", "article_name", "upload_note", "tags"])  # get objects of the "UnstructuredDocument" class and retrieve their "text" property
    .with_near_text(nearText)  # find objects that are semantically similar to the text in "nearText"
    .with_limit(6)  # limit the results to the top 2 most similar objects
    #.with_where({
    #    "path": ["tags"],
    #    "operator": "Equal",
    #    "valueString": "updatedtest"
    #})
    .do()  # execute the query
)



In [36]:
# extract text from response
for i, r in enumerate(response['data']['Get']['ABCNewsArticle']):
    print("{0}: \nupload note: {1}\ntags: {2}\n{3}\n"
    .format(i+1, r['upload_note'], r['tags'], r['text'][:100] + '...'))

1: 
upload note: hello weaviate
tags: test, weaviate, python
Cost of Living...

2: 
upload note: hello weaviate
tags: test, weaviate, python
What is inflationary psychology and how is it impacting our rising cost of living?...

3: 
upload note: hello weaviate
tags: test, weaviate, python
'You're talking about some people paying 3 or 4 per cent more on their mortgage ... overnight'...

4: 
upload note: hello weaviate
tags: test, weaviate, python
'You're talking about some people paying 3 or 4 per cent more on their mortgage ... overnight'...

5: 
upload note: hello weaviate
tags: test, weaviate, python
Professor Bentley says that's partly because when people are paying more than they can afford, it "g...

6: 
upload note: hello weaviate
tags: test, weaviate, python
"Just the experience of financial hardship and stress that's associated with being in unaffordable r...

