In [None]:
import os
import weaviate
import requests
import json
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment,
# then read the two API keys this notebook needs.
load_dotenv()
weaviate_api = os.getenv('WEAVIATE_API')
huggingface_api = os.getenv('HUGGINGFACE_API')

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, which would otherwise only surface later as an opaque
# authentication or vectorization error.
_missing = [
    var_name
    for var_name, var_value in (
        ('WEAVIATE_API', weaviate_api),
        ('HUGGINGFACE_API', huggingface_api),
    )
    if not var_value
]
if _missing:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

# The endpoint can be overridden through WEAVIATE_URL; the default is the
# cluster this notebook was originally written against.
weaviate_url = os.getenv(
    'WEAVIATE_URL',
    "https://corprep-vdb-bk49tyjx.weaviate.network",
)

# Client authenticated with the Weaviate API key; the HuggingFace key is
# forwarded as a request header for the text2vec-huggingface vectorizer.
client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api),
    additional_headers={
        "X-HuggingFace-Api-Key": huggingface_api
    }
)

In [None]:
def _newscatcher_property(name, data_type):
    """Return the schema entry for one property of the ``Newscatcher`` class.

    Args:
        name: Property name as it should appear in the schema.
        data_type: Weaviate data type name (e.g. ``'text'``, ``'date'``).

    Returns:
        A property definition dict whose text2vec-huggingface module config
        sets ``vectorizePropertyName`` to the boolean ``False``.
    """
    return {
        'name': name,
        'dataType': [data_type],
        'moduleConfig': {
            'text2vec-huggingface': {
                # Must be a real boolean; the string 'false' is not one
                # (and is truthy in Python).
                'vectorizePropertyName': False
            }
        }
    }


# (property name, Weaviate data type), in schema order.
_NEWSCATCHER_PROPERTIES = [
    ('title', 'text'),
    ('publishedDate', 'date'),
    ('summary', 'text'),
    ('topic', 'text'),
    ('isOpinion', 'boolean'),
    ('country', 'text'),
    ('language', 'text'),
]

# Class definition: vectorized with text2vec-huggingface using the
# all-MiniLM-L6-v2 sentence-transformers model.
class_obj = {
  "class": "Newscatcher",
  "vectorizer": "text2vec-huggingface",
  "moduleConfig": {
    "text2vec-huggingface": {
      "model": "sentence-transformers/all-MiniLM-L6-v2",
      # Boolean, not the string "false" -- see note in _newscatcher_property.
      "vectorizeClassName": False
    }
  },
  "properties": [
    _newscatcher_property(prop_name, prop_type)
    for prop_name, prop_type in _NEWSCATCHER_PROPERTIES
  ]
}

# Register the class with the connected Weaviate instance.
client.schema.create_class(class_obj)

In [None]:
# DESTRUCTIVE: asks the connected instance to delete its whole schema.
# NOTE(review): presumably a manual reset step -- running it right after the
# class-creation cell undoes that cell; confirm the intended run order.
client.schema.delete_all()

In [None]:
# Fetch the schema definition of the 'Newscatcher' class; as the last
# expression of the cell, the returned value is shown as the cell output.
client.schema.get('Newscatcher')

In [None]:
# Aggregate query on the 'Newscatcher' class requesting the meta count;
# the response is displayed as the cell output.
(
    client.query
    .aggregate("Newscatcher")
    .with_meta_count()
    .do()
)

In [None]:
import glob
import pprint

# Directory containing the Newscatcher JSON exports. Can be overridden with the
# CORPREP_DATA_DIR environment variable; the default is the original location.
path = os.getenv('CORPREP_DATA_DIR', 'C:/mldev/corprep/data')

# Import every article from every *.json file in `path` into the
# 'Newscatcher' class, using the client's batching context manager.
with client.batch(
    batch_size=100,
    dynamic=True,
    timeout_retries=5,
    connection_error_retries=5
) as batch:
    for filename in glob.glob(os.path.join(path, '*.json')):
        # Open in binary mode so json.load detects the file's UTF encoding
        # itself instead of relying on the platform's default text encoding.
        with open(filename, 'rb') as current_file:
            data = json.load(current_file)

        # The file is already closed here; only the parsed data is needed.
        for v in data['articles']:
            properties = {
                'title': v['title'],
                # TODO: 'publishedDate' is not imported yet. Presumably it maps
                # to the article's 'published_date' field and needs converting
                # to the date format Weaviate expects -- confirm against the data.
                'summary': v['summary'],
                'topic': v['topic'],
                'isOpinion': v['is_opinion'],
                'country': v['country'],
                'language': v['language']
            }

            batch.add_data_object(properties, "Newscatcher")
            


In [319]:
# Fetch objects through the data-object API (no arguments passed) and
# pretty-print the response.
obj = client.data_object.get()
pprint.pprint(obj)

{'deprecations': None, 'objects': []}


In [None]:
# Batch-delete 'Newscatcher' objects matched by the filter: 'country' Equal 'US'.
client.batch.delete_objects(
    class_name='Newscatcher',
    where=dict(path=['country'], operator='Equal', valueText='US'),
)