In [None]:
from haystack.document_stores import WeaviateDocumentStore
from haystack import Pipeline
import os
from dotenv import load_dotenv
import kaggle
import pandas as pd
from haystack.nodes import EmbeddingRetriever, TextConverter, PreProcessor

In [2]:
# Load environment variables (if any)
load_dotenv("../.env")
openai_key = os.getenv("OPENAI_API_KEY")
weaviate_key = os.getenv("WEAVIATE_API_KEY")

In [22]:
# Download the data
kaggle.api.authenticate()
kaggle.api.dataset_download_files('papercool/organics-purchase-indicator', path='./', unzip=True)

df = pd.read_csv('organics.csv')
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')

# for all values under all columns, transform all values to string
df = df.applymap(str)

df.to_json('organics.json', orient='records', lines=False)

  _warn(f"unclosed running multiprocessing pool {self!r}",
  response_data.getheaders())
  df = df.applymap(str)


In [29]:
import json
with open('organics.json') as f:
    data = json.load(f)

data[0]

{'customer_loyalty_id': '140',
 'gender': 'U',
 'geographic_region': 'Midlands',
 'loyalty_status': 'Gold',
 'neigborhood_cluster-55_level': '16.0',
 'neighborhood_cluster-7_level': 'C',
 'television_region': 'Wales & West',
 'affluence_grade': '10',
 'age': '76',
 'frequency': '1',
 'frequency_percent': '0.00%',
 'loyalty_card_tenure': '4',
 'organics_purchase_count': '0',
 'organics_purchase_indicator': '0',
 'total_spend': '16000.0'}

In [33]:
# Step 1: Extract and Transform Data
transformed_documents = []

for i in range(len(data)):

    transformed_documents.append( {"content":data[i]})


In [34]:
from haystack.document_stores import FAISSDocumentStore


document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)

# Step 2: Write Transformed Data to Document Store
document_store.write_documents(transformed_documents)

ValidationError: 2 validation errors for Document
content
  str type expected (type=type_error.str)
content
  instance of DataFrame expected (type=type_error.arbitrary_type; expected_arbitrary_type=DataFrame)

In [17]:
import weaviate

auth_config = weaviate.AuthApiKey(api_key=weaviate_key)

client = weaviate.Client(
  url="https://kaggle-dataset-jxnlf95q.weaviate.network",
  auth_client_secret=auth_config
)

client.schema.get()  # Get the schema to test connection

{'classes': []}

In [24]:
class_obj = {
    "class": "Organics3",
    "description": "Information from a customer's organic purchases",  # description of the class
    "properties": [
        {
            "dataType": ["text"],
            "description": "Unique identifier for customer loyalty",
            "name": "customer_loyalty_id",
        },
        {
            "dataType": ["text"],
            "description": "Gender of the customer",
            "name": "gender",
        },
        {
            "dataType": ["text"],
            "description": "Geographic region of the customer",
            "name": "geographic_region",
        },
        {
            "dataType": ["text"],
            "description": "Loyalty status of the customer",
            "name": "loyalty_status",
        },
        {
            "dataType": ["text"],
            "description": "Neighborhood cluster at 55 level",
            "name": "neigborhood_cluster_55_level",
        },
        {
            "dataType": ["text"],
            "description": "Neighborhood cluster at 7 level",
            "name": "neighborhood_cluster_7_level",
        },
        {
            "dataType": ["text"],
            "description": "Television region of the customer",
            "name": "television_region",
        },
        {
            "dataType": ["text"],
            "description": "Affluence grade of the customer",
            "name": "affluence_grade",
        },
        {
            "dataType": ["text"],
            "description": "Age of the customer",
            "name": "age",
        },
        {
            "dataType": ["text"],
            "description": "Frequency of purchases",
            "name": "frequency",
        },
        {
            "dataType": ["text"],
            "description": "Frequency percent of purchases",
            "name": "frequency_percent",
        },
        {
            "dataType": ["text"],
            "description": "Tenure of loyalty card",
            "name": "loyalty_card_tenure",
        },
        {
            "dataType": ["text"],
            "description": "Count of organic purchases",
            "name": "organics_purchase_count",
        },
        {
            "dataType": ["text"],
            "description": "Indicator of organic purchases",
            "name": "organics_purchase_indicator",
        },
        {
            "dataType": ["text"],
            "description": "Total spend by the customer",
            "name": "total_spend",
        },
    ],
    "vectorizer": "text2vec-openai",
}


# add the schema
client.schema.create_class(class_obj)

# get the schema
schema = client.schema.get()

In [None]:
client.schema.get() 

In [None]:
document_store = WeaviateDocumentStore(host='https://kaggle-dataset-jxnlf95q.weaviate.network',
                                       embedding_dim=768,
                                       port=8080,
                                       api_key=weaviate_key)


In [None]:
converter = TextConverter(remove_numeric_tables=False, valid_languages=["en"])
doc_txt = converter.convert(file_path="organics.txt", meta=None)[0]

In [None]:

preprocessor = PreProcessor()
retriever = EmbeddingRetriever(document_store = document_store,
                               embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1")

indexing_pipeline = Pipeline()
indexing_pipeline.add_node(component=converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["PDFConverter"])
indexing_pipeline.add_node(component=retriever, name="Retriever", inputs=["PreProcessor"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])

indexing_pipeline.run(file_paths=["filename.pdf"])