In [61]:
from langchain_community.vectorstores import Neo4jVector
import os 
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase
import re
from typing import List
import polars as pl

def remove_html_tags(text):
    clean = re.sub(r'<.*?>', '', text)
    return clean



URI = "bolt://localhost:7687"
AUTH = ("neo4j", "fairusecases")

load_dotenv()

    
class Custom_Embeddings:
    def __init__(self):
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', token = os.environ["HUGGING_FACE"])
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.model.encode(t)[0] for t in texts]
            
    def embed_query(self, query: str) -> List[float]:
        return self.model.encode([query])[0]



In [58]:
## Dealing with Conclusion
def opinion_does_not_have_conclusion(tx):
    
    results = tx.run("""
        MATCH (f)-[:OF]-(o:Opinion)
        RETURN o.URL as url, f.Document as Document, labels(f) AS Label
        """
    )
         
    return results.to_df()


with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
            df = session.execute_read(opinion_does_not_have_conclusion)

In [62]:
df = pl.from_pandas(df)

In [None]:
df.with_columns(
    pl.col("Document").map_elements(remove_html_tags),
    pl.col("Label").map_elements(lambda x: x[0])
).with_columns(
    pl.col("Document").map_elements(lambda x: [i for i in x.split("\n") if i != ""]).alias("Chunks")
).explode("Chunks").with_columns()

url,Document,Label,Chunks
str,str,str,str
"""https://www.co…","""**(Overall Ass…","""Conclusion""","""**(Overall Ass…"
"""https://www.co…","""**(Overall Ass…","""Conclusion""","""With respect t…"
"""https://www.co…","""**(4) the effe…","""Market""","""**(4) the effe…"
"""https://www.co…","""**(4) the effe…","""Market""","""The fourth fac…"
"""https://www.co…","""**(4) the effe…","""Market""","""The district c…"
"""https://www.co…","""**(4) the effe…","""Market""","""What this sugg…"
"""https://www.co…","""The extent of …","""Amount""","""The extent of …"
"""https://www.co…","""The extent of …","""Amount""","""Quantitatively…"
"""https://www.co…","""The second fac…","""Nature""","""The second fac…"
"""https://www.co…","""The second fac…","""Nature""","""Big League Sal…"


In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Define chunking strategy
text_splitter = CharacterTextSplitter(separator="\n")
# Chunk the document
documen

TypeError: expected string or bytes-like object

In [36]:
set(no_conclusion["Length"])

{384}

In [19]:
# Create the vectorstore for our existing graph
Neo4jVector.from_existing_graph(
    embedding = Custom_Embeddings(),
    url="bolt://localhost:7687",
    username="neo4j",
    password="fairusecases",
    index_name="OpinionIndex",
    node_label="Opinion",
    text_node_properties=["Document"],
    embedding_node_property="OpinionEmbedding",
)

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `db.create.setVectorProperty`: Caused by: java.lang.IllegalArgumentException: 'vector' must be a non-null numerical array}

In [54]:
embedding = model.embed_query("HI")[0]

In [None]:
## Manual Embeddings
def opinion_does_not_have_conclusion(tx, embedding):
    
    results = tx.run("""
        MATCH (o:Opinion)
        SET o.OpinionEmbeddings = $embedding
        """, embedding = embedding
    )
         
    return results


with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
            df = session.execute_write(opinion_does_not_have_conclusion, embedding)