Figuring out how to handle Unstructured data

In [1]:
from unstructured.partition.auto import partition
from unstructured.documents.elements import *
from unstructured.staging.weaviate import create_unstructured_weaviate_class, stage_for_weaviate

import weaviate
from weaviate.util import generate_uuid5

from dotenv import load_dotenv
import os
import json
from datetime import date
import tqdm
load_dotenv()

True

In [2]:
weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")
openai_api_key=os.getenv("OPENAI_API_KEY")

### Create a schema in Weaviate

In [35]:
unstructured_class = {
    'class': 'UnstructuredDocument',
    'description': 'General class for all documents (todo: add more specific classes)',
    'properties': [
        {'name': 'text', 'dataType': ['text']},
        {'name': 'category', 'dataType': ['text']},
        {'name': 'filename', 'dataType': ['text']},
        {'name': 'file_directory', 'dataType': ['text']},
        {'name': 'date', 'dataType': ['text']},
        {'name': 'filetype', 'dataType': ['text']},
        {'name': 'attached_to_filename', 'dataType': ['text']},
        {'name': 'page_number', 'dataType': ['int']},
        {'name': 'page_name', 'dataType': ['text']},
        {'name': 'url', 'dataType': ['text']},
        {'name': 'sent_from', 'dataType': ['text']},
        {'name': 'sent_to', 'dataType': ['text']},
        {'name': 'subject', 'dataType': ['text']},
        {'name': 'header_footer_type', 'dataType': ['text']},
        {'name': 'text_as_html', 'dataType': ['text']},
        {'name': 'regex_metadata', 'dataType': ['text']}
    ],
    'vectorizer': 'text2vec-openai', 
    "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False
        }
    },

}


client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key),
    additional_headers= {
        "X-OpenAI-Api-Key": openai_api_key,
    }
)


In [12]:

schema = {"classes": [unstructured_class]}
client.schema.create(schema)
# print the schema

schema = client.schema.get()
print(schema)


{'classes': [{'class': 'UnstructuredDocument', 'description': 'General class for all documents (todo: add more specific classes)', 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2}, 'cleanupIntervalSeconds': 60, 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}}, 'moduleConfig': {'text2vec-openai': {'model': 'ada', 'modelVersion': '002', 'type': 'text', 'vectorizeClassName': False}}, 'properties': [{'dataType': ['text'], 'indexFilterable': True, 'indexSearchable': True, 'moduleConfig': {'text2vec-openai': {'skip': False, 'vectorizePropertyName': False}}, 'name': 'text', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexSearchable': True, 'moduleConfig': {'text2vec-openai': {'skip': False, 'vectorizePropertyName': False}}, 'name': 'category', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexSearchable': True, 'moduleConfig': {'text2vec-openai': {'skip': False, 'vectorizePropertyName': False}}, 'name': 'file

In [38]:
schema = client.schema.get()
print(schema)

{'classes': [{'class': 'UnstructuredDocument', 'description': 'General class for all documents (todo: add more specific classes)', 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2}, 'cleanupIntervalSeconds': 60, 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}}, 'moduleConfig': {'text2vec-openai': {'model': 'ada', 'modelVersion': '002', 'type': 'text', 'vectorizeClassName': False}}, 'properties': [{'dataType': ['text'], 'indexFilterable': True, 'indexSearchable': True, 'moduleConfig': {'text2vec-openai': {'skip': False, 'vectorizePropertyName': False}}, 'name': 'text', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexSearchable': True, 'moduleConfig': {'text2vec-openai': {'skip': False, 'vectorizePropertyName': False}}, 'name': 'category', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexSearchable': True, 'moduleConfig': {'text2vec-openai': {'skip': False, 'vectorizePropertyName': False}}, 'name': 'file

# Delete all classes

In [11]:
# get the schema
schema = client.schema.get()

for class_info in schema['classes']:
    class_name = class_info['class']
    client.schema.delete_class(class_name)



In [16]:
schema['classes'][0]['properties']

[{'dataType': ['text'],
  'indexFilterable': True,
  'indexSearchable': True,
  'moduleConfig': {'text2vec-openai': {'skip': False,
    'vectorizePropertyName': False}},
  'name': 'text',
  'tokenization': 'word'},
 {'dataType': ['text'],
  'indexFilterable': True,
  'indexSearchable': True,
  'moduleConfig': {'text2vec-openai': {'skip': False,
    'vectorizePropertyName': False}},
  'name': 'category',
  'tokenization': 'word'},
 {'dataType': ['text'],
  'indexFilterable': True,
  'indexSearchable': True,
  'moduleConfig': {'text2vec-openai': {'skip': False,
    'vectorizePropertyName': False}},
  'name': 'filename',
  'tokenization': 'word'},
 {'dataType': ['text'],
  'indexFilterable': True,
  'indexSearchable': True,
  'moduleConfig': {'text2vec-openai': {'skip': False,
    'vectorizePropertyName': False}},
  'name': 'file_directory',
  'tokenization': 'word'},
 {'dataType': ['text'],
  'indexFilterable': True,
  'indexSearchable': True,
  'moduleConfig': {'text2vec-openai': {'skip

In [23]:


for key in ['filename', 'file_directory', 'filetype', 'page_number', 'text', 'category']:
    print("{0}: {1}".format(key, data_objects[0][key]))

filename: Politics and the English Language - George Orwell.pdf
file_directory: ../data
filetype: application/pdf
page_number: 1
text: Politics and the English Language - George Orwell
category: Title


In [28]:
# get current date as string in format YYYY-MM-DD
today = date.today()
today = today.strftime("%Y-%m-%d")
print("Today's date:", today)


Today's date: 2023-07-04


In [32]:
doc_elements = partition("../data/Politics and the English Language - George Orwell.pdf")
data_objects = stage_for_weaviate(doc_elements)



In [36]:

with client.batch(batch_size=10) as batch:
    for i, d in enumerate(data_objects):  
        properties = {
            'category': d['category'],
            'text': d['text'],
            'filename': d['filename'],
            'page_number': d['page_number'],
            'filetype': d['filetype'],
            'date': date.today().strftime("%Y-%m-%d")
        }
        batch.add_data_object(
            properties,
            'UnstructuredDocument',
            uuid=generate_uuid5(properties),
        )

In [41]:
nearText = {"concepts": ["cliche"]}

response = (
    client.query
    .get("UnstructuredDocument", ["text"])
    .with_near_text(nearText)
    .with_limit(2)
    .do()
)

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "UnstructuredDocument": [
                {
                    "text": "Dying metaphors. A newly invented metaphor assists thought by evoking a visual image, while on the other hand a metaphor which is technically \"dead\" (e.g. iron resolution) has in effect reverted to being an ordinary word and can generally be used without loss of vividness. But in between these two classes there is a huge dump of worn-out metaphors which have lost all evocative power and are merely used because they save people the trouble of inventing phrases for themselves. Examples are: Ring the changes on, take up the cudgel for, toe the line, ride roughshod over, stand shoulder to shoulder with, play into the hands of, no axe to grind, grist to the mill, fishing in troubled waters, on the order of the day, Achilles' heel, swan song, hotbed. Many of these are used without knowledge of their meaning (what is a \"rift,\" for instance?), and incompatible metaphors are

In [37]:

# iterate over all classes and get all objects
for class_info in schema['classes']:
    class_name = 'UnstructuredDocument'
    
    # construct the GraphQL query
    query = f"""
    {{
      Get {{
        {class_name} {{
          _additional {{
            id
          }}
          ... on {class_name} {{
            text
            page_number
          }}
        }}
      }}
    }}
    """
    
    # run the query
    result = client.query(query)
    
    # print the result
    print(result)

TypeError: 'Query' object is not callable