In [1]:
!pip install llama-index --quiet
!pip install neo4j --quiet
!pip install llama-index-graph-stores-neo4j --quiet
!pip install llama-parse --quiet
!pip install qdrant_client --quiet
!pip install llama-index-vector-stores-qdrant --quiet
!pip install llama-index-embeddings-fastembed --quiet
!pip install llama-index-llms-groq --quiet
!pip install aiofiles --quiet

In [2]:
# HuggingFace Data Preprocessing Script
########################################

import os
import re
import sys
import ast
import pandas as pd
import numpy as np
from datetime import datetime
from google.colab import drive

# Set numpy and pandas display options
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset
df = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/caselaw_rag_data/opinions_2015_before_2024.csv')
dataset = df.copy()

# Extract multiple attorney IDs
def extract_attorneys_ids(text):
    if not isinstance(text, str):
        return None
    attorneys_pattern = r'<attorneys id="(.*?)">'
    attorneys_ids = re.findall(attorneys_pattern, text)
    return ', '.join(attorneys_ids) if attorneys_ids else None

dataset['attorneys_id'] = dataset['headmatter'].apply(extract_attorneys_ids)

def remove_specific_xml_tags(text):
    """
    Remove specific XML tags from the text while retaining certain attributes.

    Args:
        text (str): The text containing XML tags.

    Returns:
        str: Text with specified XML tags removed.
    """
    if isinstance(text, str):
        # Retain specified opening tags with their attributes and remove angle brackets
        text = re.sub(r'<(parties|docketnumber|decisiondate) id="([^"]*)">', r'\1 id="\2"', text)
        text = re.sub(r'<attorneys id="([^"]*)">', r'attorneys id="\1"', text)
        text = re.sub(r'<span citation-index="([^"]*)" class="([^"]*)" label="([^"]*)">', r'span citation-index="\1" class="\2" label="\3"', text)
        # Remove specific closing tags
        text = re.sub(r'</(attorneys|court|decisiondate|docketnumber|parties|span)>', '', text)
        # Remove all other tags
        text = re.sub(r'<.*?>', '', text)
        # Normalize spaces but keep newlines intact
        text = re.sub(r'[ ]{2,}', ' ', text)  # Replace multiple spaces with a single space
        return text.strip()  # Clean up any leading/trailing whitespace
    return text  # Return unchanged if not a string

# Clean the 'headmatter' column by removing specified tags
dataset['headmatter'] = dataset['headmatter'].apply(remove_specific_xml_tags)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
dataset.head()

Unnamed: 0,id,judges,date_filed,date_filed_is_approximate,slug,case_name_short,case_name,case_name_full,attorneys,nature_of_suit,posture,syllabus,headnotes,summary,disposition,history,other_dates,cross_reference,correction,citation_count,precedential_status,arguments,citations,opinions,court_short_name,court_full_name,court_type,court_jurisdiction,opinion_id,type,per_curiam,author_id,author_str,opinion_text,parties,docketnumber,court,decisiondate,attorneys_headmatter,p,otherdate,seealso,judges_headmatter,citation,summary_headmatter,disposition_headmatter,headmatter,num_opinions,0_0,1_1,2_2,3_3,4_4,5_5,6_6,7_7,8_8,year_filed,attorneys_id
0,7311944,Spatt,2015-01-02,False,placide-eugene-v-visiting-nurse-service,Placide-Eugene,Placide-Eugene v. Visiting Nurse Service,Marie PLACIDE-EUGENE v. VISITING NURSE SERVICE...,"White Ricotta & Marks, P.C., by: Thomas Ricott...",,,,,,,,,,,0,Published,,['86 F. Supp. 3d 132'],"[{'author_str': 'Spatt', 'per_curiam': False, ...",E.D. New York,"District Court, E.D. New York",FD,"New York, NY",7229848,020lead,False,,Spatt,"\nMEMORANDUM OF DECISION AND ORDER\nSPATT, Dis...",,,,,,,,,,,,,{},1,"{'author_str': 'Spatt', 'per_curiam': False, '...",,,,,,,,,2015,
1,7311182,Hillman,2015-01-02,False,thirkield-v-neary-hunter-obgyn-llc,Thirkield,"Thirkield v. Neary & Hunter Ob/Gyn, LLC","Kimberly THIRKIELD v. NEARY & HUNTER OB/GYN, L...","Michael O. Shea, Law Office of Michael Q_ ^ Wi...",,,,,,,,,,,0,Published,,['76 F. Supp. 3d 339'],"[{'author_str': 'Hillman', 'per_curiam': False...",D. Massachusetts,"District Court, D. Massachusetts",FD,"Massachusetts, MA",7229086,020lead,False,,Hillman,\n*342ORDER AND MEMORANDUM OF DECISION ON DEFE...,,,,,,,,,,,,,{},1,"{'author_str': 'Hillman', 'per_curiam': False,...",,,,,,,,,2015,
2,2766830,"Brown, Edwards, Rogers",2015-01-06,False,janet-howard-v-penny-pritzker,,Janet Howard v. Penny Pritzker,"Janet HOWARD, Appellee. Joyce MEGGINSON, Appel...","Elizabeth C. Bullock, appointed by the court, ...",,,,,,,,"Argued Oct. 10, 2014.",,,1,Published,,"['413 U.S. App. D.C. 389', '775 F.3d 430']","[{'author_str': 'Rogers', 'per_curiam': False,...",D.C. Circuit,Court of Appeals for the D.C. Circuit,F,"USA, Federal",2766830,010combined,False,,Rogers,United States Court of Appeals\n FOR ...,"Janet HOWARD, Appellee. Joyce MEGGINSON, Appel...","Nos. 12-5370, 12-5392.","United States Court of Appeals, District of Co...","Decided Jan. 6, 2015.","Brian P. Hudak, Assistant U.S. Attorney, argue...",,"Argued Oct. 10, 2014.",,"Before: ROGERS and BROWN, Circuit Judges, and ...",,,,"{'parties': 'Janet HOWARD, Appellee. Joyce MEG...",1,"{'author_str': 'Rogers', 'per_curiam': False, ...",,,,,,,,,2015,
3,7311845,Smith,2015-01-07,False,keeton-v-big-lots-stores-inc,Keeton,"Keeton v. Big Lots Stores, Inc.","Sondra KEETON v. BIG LOTS STORES, INC.","Kimberly R. Dodson, Law Offices of Kimberly R....",,,,,,,,,,,0,Published,,['84 F. Supp. 3d 1290'],"[{'author_str': 'Smith', 'per_curiam': False, ...",N.D. Alabama,"District Court, N.D. Alabama",FD,"Alabama, AL",7229749,020lead,False,,Smith,"\nMEMORANDUM OPINION AND ORDER\nLYNWOOD SMITH,...",,,,,,,,,,,,,{},1,"{'author_str': 'Smith', 'per_curiam': False, '...",,,,,,,,,2015,
4,7311254,"Kollar, Kotelly",2015-01-07,False,king-v-holder,King,King v. Holder,"Willard T. KING, Jr. v. Eric Himpton HOLDER, Jr.","Donald B. Terrell, Donald B. Terrell Associate...",,,,,,,,,,,3,Published,,['77 F. Supp. 3d 146'],"[{'author_str': 'Kollarkotelly', 'per_curiam':...",District of Columbia,"District Court, District of Columbia",FD,"USA, Federal",7229158,020lead,False,,Kollarkotelly,"\nMEMORANDUM OPINION\n(January 7, 2015)\nCOLLE...",,,,,,,,,,,,,{},1,"{'author_str': 'Kollarkotelly', 'per_curiam': ...",,,,,,,,,2015,


In [4]:
aggregated_df = dataset.copy()

def safe_clean_text(text):
    try:
        if pd.isna(text):
            return ""
        if not isinstance(text, str):
            text = str(text)
        # Remove curly braces, square brackets, and single quotes
        text = re.sub(r'[\{\}\[\]\']', '', text)
        # Remove any leading/trailing whitespace
        text = text.strip()
        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text)
        return text
    except Exception as e:
        return ""

def safe_apply_clean_text(series):
    return series.apply(safe_clean_text)

# Applying safe_clean_text to individual columns before concatenation
columns_to_clean = ['headmatter', 'date_filed', 'author_str', 'opinion_id', 'type', 'opinion_text']

for col in columns_to_clean:
    if col in aggregated_df.columns:
        aggregated_df[col] = safe_apply_clean_text(aggregated_df[col])

# Function to concatenate fields, skipping empty ones
def concatenate_fields(row):
    fields = []
    if row['headmatter']:
        fields.append(row['headmatter'])
    fields.extend([
        f"Date Filed: {row['date_filed']}",
        f"Author: {row['author_str']}",
        f"Opinion ID: {row['opinion_id']}",
        f"Type: {row['type']}",
        f"Opinion Text:\n{row['opinion_text']}"
    ])
    return '\n'.join(filter(None, fields))

# Apply the concatenation function
aggregated_df['case_opinions'] = aggregated_df.apply(concatenate_fields, axis=1)

# Reset index and select only 'id' and 'case_opinions' columns
aggregated_df = aggregated_df[['id', 'case_opinions']].reset_index(drop=True)

In [5]:
aggregated_df.sort_values(by='id', ascending=True).head()

Unnamed: 0,id,case_opinions
2,2766830,"parties: Janet HOWARD, Appellee. Joyce MEGGINS..."
6,2767581,"parties: Tony REAVES, Appellant, v. PENNSYLVAN..."
8,2768601,"parties: Leroy HENDRIX, Plaintiff, v. Janet NA..."
9,2768709,Date Filed: 2015-01-09\nAuthor: \nOpinion ID: ...
11,2768889,"parties: Lawrence WRIGHT, Plaintiff, v. WASTE ..."


In [6]:
print(aggregated_df['case_opinions'][0])

Date Filed: 2015-01-02
Author: Spatt
Opinion ID: 7229848
Type: 020lead
Opinion Text:
MEMORANDUM OF DECISION AND ORDER SPATT, District Judge. On August 25, 2011, the Plaintiff Marie Placide-Eugene (the “Plaintiff’) filed a claim of discrimination with the New York State Division of Human Rights (the “NYSDHR”), alleging that her former employer, the Defendant Visiting Nurse Service of New York (“VNSNY”) had subjected her to discrimination due to her national origin. On March 7, 2012, the United States Equal Employment Opportunity Commission issued to the Plaintiff a right to sue letter. On June 4, 2012, the Plaintiff commenced this action against VNSNY and certain former supervisors, Eloise Goldberg (“Goldberg”), Jill Mendelson (“Mendelson”) and Marian Haas (“Haas”) pursuant to Title VII of the Civil Rights Act of 1964, 42 U.S.C. § 2000e et seq. (“Title VH”), 42 U.S.C. §§ 1981 and 1983, and the New York State Human Rights Law, New York State Executive Law § 290 et seq. (the “NYSHRL”). Th

In [7]:
dataset['id'].isna().sum(), aggregated_df['id'].isna().sum()

(0, 0)

In [8]:
dataset['id'].nunique(), aggregated_df['id'].nunique()

(2735, 2735)

In [9]:
column_stats = pd.DataFrame()
for col in dataset.columns:
  # if int((dataset[col].isna().sum()/len(dataset))*100) < 95:
  column_stats = pd.concat(
      [column_stats,
       pd.DataFrame([[col, int(dataset[col].isna().sum()),
                      str(int((dataset[col].isna().sum()/len(dataset))*100))+'%',
                      int(dataset[col].notna().sum()),
                      str(int((dataset[col].notna().sum()/len(dataset))*100))+'%']],
                           columns=['Column Name', 'Null Values', 'Null %', 'Not Null Values', 'Not Null Value %'])], ignore_index=True
      )

pd.DataFrame(column_stats).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
Column Name,id,judges,date_filed,date_filed_is_approximate,slug,case_name_short,case_name,case_name_full,attorneys,nature_of_suit,posture,syllabus,headnotes,summary,disposition,history,other_dates,cross_reference,correction,citation_count,precedential_status,arguments,citations,opinions,court_short_name,court_full_name,court_type,court_jurisdiction,opinion_id,type,per_curiam,author_id,author_str,opinion_text,parties,docketnumber,court,decisiondate,attorneys_headmatter,p,otherdate,seealso,judges_headmatter,citation,summary_headmatter,disposition_headmatter,headmatter,num_opinions,0_0,1_1,2_2,3_3,4_4,5_5,6_6,7_7,8_8,year_filed,attorneys_id
Null Values,0,661,0,0,0,1092,0,1112,1142,2077,2924,2883,2924,2911,2922,2922,2524,2912,2924,0,0,2916,1013,0,0,0,0,0,0,0,0,2478,1301,0,2281,2314,2306,2295,2372,2864,2677,2922,2537,2874,2921,2922,0,0,0,2643,2663,2867,2919,2924,2924,2924,2924,0,2924
Null %,0%,22%,0%,0%,0%,37%,0%,38%,39%,71%,100%,98%,100%,99%,99%,99%,86%,99%,100%,0%,0%,99%,34%,0%,0%,0%,0%,0%,0%,0%,0%,84%,44%,0%,78%,79%,78%,78%,81%,97%,91%,99%,86%,98%,99%,99%,0%,0%,0%,90%,91%,98%,99%,100%,100%,100%,100%,0%,100%
Not Null Values,2924,2263,2924,2924,2924,1832,2924,1812,1782,847,0,41,0,13,2,2,400,12,0,2924,2924,8,1911,2924,2924,2924,2924,2924,2924,2924,2924,446,1623,2924,643,610,618,629,552,60,247,2,387,50,3,2,2924,2924,2924,281,261,57,5,0,0,0,0,2924,0
Not Null Value %,100%,77%,100%,100%,100%,62%,100%,61%,60%,28%,0%,1%,0%,0%,0%,0%,13%,0%,0%,100%,100%,0%,65%,100%,100%,100%,100%,100%,100%,100%,100%,15%,55%,100%,21%,20%,21%,21%,18%,2%,8%,0%,13%,1%,0%,0%,100%,100%,100%,9%,8%,1%,0%,0%,0%,0%,0%,100%,0%


In [10]:
dataset.drop(['headmatter', 'opinion_text', '0_0',	'1_1',	'2_2',	'3_3',	'4_4',	'5_5',	'6_6',	'7_7',	'8_8',	'year_filed'], axis=1, inplace=True)

In [11]:
node_columns = ['id', 'judges', 'author_id', 'author_str', 'court_type', 'court_jurisdiction', 'case_name_full', 'slug', 'date_filed', 'decisiondate', 'nature_of_suit',
                'posture', 'disposition', 'citation_count', 'precedential_status', 'attorneys', 'case_name', 'citations', 'opinion_id', 'type', 'docketnumber']

# Drop columns not needed for graph nodes
df_nodes = dataset[node_columns].copy()

dataset_nodes = df_nodes.drop_duplicates(subset='id', keep='first')

In [None]:
import pandas as pd
import numpy as np
from neo4j import AsyncGraphDatabase
import asyncio
from google.colab import userdata, drive
import logging
from typing import List
from qdrant_client import QdrantClient
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex, Document, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.groq import Groq
from concurrent.futures import ThreadPoolExecutor

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Mount Google Drive
drive.mount('/content/drive')

def safe_convert(v):
    if isinstance(v, (list, np.ndarray)):
        return [str(item) for item in v if pd.notna(item)]
    elif pd.isna(v):
        return None
    else:
        return str(v)

class AsyncNeo4jLoader:
    def __init__(self, uri, username, password):
        self.driver = AsyncGraphDatabase.driver(uri, auth=(username, password))

    async def close(self):
        await self.driver.close()

    async def test_connection(self):
        try:
            async with self.driver.session() as session:
                result = await session.run("RETURN 1 AS num")
                record = await result.single()
                assert record["num"] == 1
                logger.info("Successfully connected to Neo4j database")
        except Exception as e:
            logger.error(f"Failed to connect to Neo4j database: {str(e)}")
            raise

    async def clear_database(self):
        query = "MATCH (n) DETACH DELETE n"
        async with self.driver.session() as session:
            await session.run(query)
        logger.info("Database cleared")

    async def create_constraints(self):
        constraints = [
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Case) REQUIRE n.id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Judge) REQUIRE n.id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Attorney) REQUIRE n.id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Court) REQUIRE n.id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Party) REQUIRE n.id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Citation) REQUIRE n.id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Opinion) REQUIRE n.id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Docket) REQUIRE n.id IS UNIQUE"
        ]
        async with self.driver.session() as session:
            for constraint in constraints:
                await session.run(constraint)
        logger.info("Constraints created")

    async def load_data(self, df):
        async with self.driver.session() as session:
            for i, row in df.iterrows():
                try:
                    logger.info(f"Processing row {i}")
                    await self._create_case_graph(session, row)
                except Exception as e:
                    logger.error(f"Error processing row {i}: {str(e)}")
                    logger.error(f"Problematic row data: {row.to_dict()}")
        logger.info(f"Loaded {len(df)} cases")

    async def _create_case_graph(self, session, row):
        # Create Case node
        case_query = """
        MERGE (c:Case {id: $id})
        SET c.case_name = $case_name,
            c.date_filed = $date_filed,
            c.decisiondate = $decisiondate,
            c.slug = $slug,
            c.nature_of_suit = $nature_of_suit,
            c.posture = $posture,
            c.disposition = $disposition,
            c.citation_count = $citation_count,
            c.precedential_status = $precedential_status,
            c.case_name_full = $case_name_full
        """
        case_params = {k: safe_convert(v) for k, v in dict(row).items()}

        try:
            await session.run(case_query, case_params)

        except Exception as e:
            logger.error(f"Error processing row {row['id']}: {str(e)}")
            logger.error(f"Problematic query: {case_query}")
            logger.error(f"Problematic parameters: {case_params}")
            raise

        # Create Judge nodes and relationships
        if isinstance(row['judges'], str) and row['judges']:
            judges = [judge.strip() for judge in row['judges'].split(',')]
            judge_query = """
            UNWIND $judges AS judge
            MERGE (j:Judge {name: judge})
            WITH j
            MATCH (c:Case {id: $id})
            MERGE (c)-[:DECIDED_BY]->(j)
            """
            await session.run(judge_query, {'id': str(row['id']), 'judges': judges})

        # Create Author (Judge) node and relationship
        if pd.notna(row['author_id']) and pd.notna(row['author_str']):
            author_query = """
            MERGE (a:Judge {id: $author_id})
            SET a.name = $author_str
            WITH a
            MATCH (c:Case {id: $id})
            MERGE (c)-[:AUTHORED_BY]->(a)
            """
            await session.run(author_query, {
                'id': str(row['id']),
                'author_id': str(row['author_id']),
                'author_str': str(row['author_str'])
            })

        # Create Court node and relationship
        if pd.notna(row['court_type']) and pd.notna(row['court_jurisdiction']):
            court_query = """
            MERGE (ct:Court {id: $court_id})
            SET ct.type = $court_type,
                ct.jurisdiction = $court_jurisdiction
            WITH ct
            MATCH (c:Case {id: $id})
            MERGE (c)-[:HEARD_IN]->(ct)
            """
            await session.run(court_query, {
                'id': str(row['id']),
                'court_id': f"court_{row['id']}",
                'court_type': str(row['court_type']),
                'court_jurisdiction': str(row['court_jurisdiction'])
            })

        # Create Attorney nodes and relationships
        if pd.notna(row['attorneys']):
            attorneys = str(row['attorneys']).split(', ')
            attorney_query = """
            UNWIND $attorneys AS attorney
            MERGE (a:Attorney {name: attorney})
            WITH a
            MATCH (c:Case {id: $id})
            MERGE (c)-[:REPRESENTED_BY]->(a)
            """
            await session.run(attorney_query, {'id': str(row['id']), 'attorneys': attorneys})

        # Create Party nodes (Plaintiff and Defendant) and relationships
        if pd.notna(row['case_name']):
            parties = str(row['case_name']).split(' v. ')
            if len(parties) == 2:
                plaintiff, defendant = parties
                party_query = """
                MERGE (p:Party:Plaintiff {name: $plaintiff_name})
                MERGE (d:Party:Defendant {name: $defendant_name})
                WITH p, d
                MATCH (c:Case {id: $id})
                MERGE (p)-[:FILED_CASE]->(c)
                MERGE (c)-[:AGAINST]->(d)
                """
                await session.run(party_query, {
                    'id': str(row['id']),
                    'plaintiff_name': plaintiff,
                    'defendant_name': defendant
                })

        # Create Citation nodes and relationships
        citations = row['citations']
        if isinstance(citations, (list, np.ndarray)) and len(citations) > 0:
            citations = [str(c) for c in citations if pd.notna(c)]
            if citations:
                citation_query = """
                UNWIND $citations AS citation
                MERGE (cit:Citation {text: citation})
                WITH cit
                MATCH (c:Case {id: $id})
                MERGE (c)-[:CITED_BY]->(cit)
                """
                await session.run(citation_query, {'id': str(row['id']), 'citations': citations})

        # Create Opinion node and relationship
        if pd.notna(row['opinion_id']) and pd.notna(row['type']):
            opinion_query = """
            MERGE (o:Opinion {id: $opinion_id})
            SET o.type = $type
            WITH o
            MATCH (c:Case {id: $id})
            MERGE (c)-[:HAS_OPINION]->(o)
            """
            await session.run(opinion_query, {
                'id': str(row['id']),
                'opinion_id': str(row['opinion_id']),
                'type': str(row['type'])
            })

        # Create Docket node and relationship
        if 'docketnumber' in row and pd.notna(row['docketnumber']):
            docket_query = """
            MERGE (d:Docket {id: $docketnumber})
            WITH d
            MATCH (c:Case {id: $id})
            MERGE (c)-[:HAS_DOCKET]->(d)
            """
            await session.run(docket_query, {
                'id': str(row['id']),
                'docketnumber': str(row['docketnumber'])
            })

async def load_data_to_neo4j(df: pd.DataFrame):
    url = userdata.get('NEO4J_URL')
    username = "neo4j"
    password = userdata.get('NEO4J_PASSWORD')

    loader = AsyncNeo4jLoader(url, username, password)
    try:
        await loader.test_connection()
        await loader.clear_database()
        await loader.create_constraints()
        await loader.load_data(df)
        logger.info("Neo4j database updated successfully!")
    except Exception as e:
        logger.error(f"Error updating Neo4j database: {str(e)}")
    finally:
        await loader.close()

async def load_batch(batch, vector_store, splitter):
    loaded_documents = [Document(text=doc) for doc in batch]
    nodes = splitter.get_nodes_from_documents(loaded_documents)
    vector_index = VectorStoreIndex(nodes, storage_context=StorageContext.from_defaults(vector_store=vector_store))
    vector_index.storage_context.persist()

async def load_data_to_qdrant(documents: List, batch_size=1000, max_workers=4):
        qdrant_client = QdrantClient(url=userdata.get("QDRANT_URL"), api_key=userdata.get("QDRANT_API_KEY"))
        vector_store = QdrantVectorStore(client=qdrant_client, collection_name="law_docs")
        splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=100)

        batches = [documents[i:i+batch_size] for i in range(0, len(documents), batch_size)]

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            await asyncio.gather(*[load_batch(batch, vector_store, splitter) for batch in batches])

        logging.info("Qdrant database update completed")

async def main():
    embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
    Settings.embed_model = embed_model
    Settings.chunk_size = 1024
    await load_data_to_neo4j(dataset_nodes)
    await load_data_to_qdrant(aggregated_df['case_opinions'].to_list())

# Use this for Google Colab
if __name__ == "__main__":
    from IPython import get_ipython
    ipython = get_ipython()
    if ipython is not None:
        ipython.run_line_magic('matplotlib', 'inline')
        ipython.run_line_magic('autoawait', 'on')
        await main()
    else:
        asyncio.run(main())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]