In [7]:
'''
Resource of Code
https://github.com/langchain-ai/langchain/blob/master/cookbook/RAPTOR.ipynb
'''

'\nResource of Code\nhttps://github.com/langchain-ai/langchain/blob/master/cookbook/RAPTOR.ipynb\n'

In [8]:
'''
!pip install umap-learn
!pip install torch
!pip install transformers
!pip install sentence-transformers
!pip install scikit-learn
!pip install urllib3
!pip install tenacity
!pip install faiss-cpu
!pip install langchain_community
!pip install langchain_openai
!pip install matplotlib
!pip install rank_bm25
!pip install fitz
!pip install frontend
!pip install ipywidgets
'''

'\n!pip install umap-learn\n!pip install torch\n!pip install transformers\n!pip install sentence-transformers\n!pip install scikit-learn\n!pip install urllib3\n!pip install tenacity\n!pip install faiss-cpu\n!pip install langchain_community\n!pip install langchain_openai\n!pip install matplotlib\n!pip install rank_bm25\n!pip install fitz\n!pip install frontend\n!pip install ipywidgets\n'

In [9]:
# pip install -U langchain umap-learn scikit-learn langchain_community tiktoken langchain-openai langchainhub langchain-chroma langchain-anthropic

In [10]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import json
import tiktoken
folder_path = '../statute_json_files/'

In [11]:
openai_api_key = <your_key>

In [12]:
from langchain_openai import OpenAIEmbeddings
embd = OpenAIEmbeddings(openai_api_key=openai_api_key, model = "text-embedding-3-small", disallowed_special = ())

In [13]:
from langchain_openai import ChatOpenAI
model = ChatOpenAI(model_name="gpt-4o-mini", openai_api_key=openai_api_key, temperature=0.0)

In [34]:
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Optional
import umap
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from langchain.prompts import ChatPromptTemplate

def simple_output_parser(output):
    """
    A simple function to parse the output text from AIMessage objects.
    """
    if hasattr(output, "content"):  # Check if the object has the 'content' attribute
        return output.content.strip()  # Access the content and strip it
    raise AttributeError(f"Unexpected output type: {type(output)}")

RANDOM_SEED = 523  # Set your desired random seed here

# Global clustering functions
def global_cluster_embeddings(
    embeddings: np.ndarray,
    dim: int,
    n_neighbors: Optional[int] = None,
    metric: str = "cosine",
) -> np.ndarray:
    """
    Perform global dimensionality reduction on the embeddings using UMAP.

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - dim: The target dimensionality for the reduced space.
    - n_neighbors: Optional; the number of neighbors to consider for each point.
    - metric: The distance metric to use for UMAP.

    Returns:
    - A numpy array of the embeddings reduced to the specified dimensionality.
    """
    if n_neighbors is None:
        n_neighbors = int((len(embeddings) - 1) ** 0.5)
    return umap.UMAP(
        n_neighbors=n_neighbors, n_components=dim, metric=metric
    ).fit_transform(embeddings)


def local_cluster_embeddings(
    embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine"
) -> np.ndarray:
    """
    Perform local dimensionality reduction on the embeddings using UMAP.

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - dim: The target dimensionality for the reduced space.
    - num_neighbors: The number of neighbors to consider for each point.
    - metric: The distance metric to use for UMAP.

    Returns:
    - A numpy array of the embeddings reduced to the specified dimensionality.
    """
    return umap.UMAP(
        n_neighbors=num_neighbors, n_components=dim, metric=metric
    ).fit_transform(embeddings)


def get_optimal_clusters(
    embeddings: np.ndarray, max_clusters: int = 50, random_state: int = RANDOM_SEED
) -> int:
    """
    Determine the optimal number of clusters using the Bayesian Information Criterion (BIC).

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - max_clusters: The maximum number of clusters to consider.
    - random_state: Seed for reproducibility.

    Returns:
    - An integer representing the optimal number of clusters found.
    """
    max_clusters = min(max_clusters, len(embeddings))
    n_clusters = np.arange(1, max_clusters)
    bics = []
    for n in n_clusters:
        gm = GaussianMixture(n_components=n, random_state=random_state)
        gm.fit(embeddings)
        bics.append(gm.bic(embeddings))
    return n_clusters[np.argmin(bics)]


def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 42):
    """
    Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold.

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - threshold: The probability threshold for assigning an embedding to a cluster.
    - random_state: Seed for reproducibility.

    Returns:
    - A tuple containing the cluster labels and the number of clusters determined.
    """
    n_clusters = get_optimal_clusters(embeddings)
    gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
    gm.fit(embeddings)
    probs = gm.predict_proba(embeddings)
    labels = [np.where(prob > threshold)[0] for prob in probs]
    return labels, n_clusters


def perform_clustering(
    embeddings: np.ndarray,
    dim: int,
    threshold: float,
) -> List[np.ndarray]:
    """
    Perform clustering on the embeddings by first reducing their dimensionality globally,
    then clustering using a Gaussian Mixture Model, and finally performing local clustering.

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - dim: The target dimensionality for UMAP reduction.
    - threshold: The probability threshold for assigning an embedding to a cluster.

    Returns:
    - A list of numpy arrays, where each array contains the cluster IDs for each embedding.
    """
    if len(embeddings) <= dim + 1:
        # Return direct assignment for insufficient data
        return [np.array([0]) for _ in range(len(embeddings))]

    # Global dimensionality reduction
    reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)
    
    # Global clustering
    global_clusters, n_global_clusters = GMM_cluster(reduced_embeddings_global, threshold)

    all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
    total_clusters = 0

    for i in range(n_global_clusters):
        # Extract embeddings belonging to the current global cluster
        cluster_mask = np.array([i in gc for gc in global_clusters])
        global_cluster_embeddings_ = embeddings[cluster_mask]

        if len(global_cluster_embeddings_) == 0:
            continue
        if len(global_cluster_embeddings_) <= dim + 1:
            # Direct assignment for small clusters
            local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
            n_local_clusters = 1
        else:
            # Local dimensionality reduction and clustering
            reduced_embeddings_local = local_cluster_embeddings(global_cluster_embeddings_, dim)
            local_clusters, n_local_clusters = GMM_cluster(reduced_embeddings_local, threshold)

        for j in range(n_local_clusters):
            cluster_mask = np.array([j in lc for lc in local_clusters])
            local_cluster_embeddings_ = global_cluster_embeddings_[cluster_mask]
            indices = np.where((embeddings == local_cluster_embeddings_[:, None]).all(-1))[1]

            for idx in indices:
                all_local_clusters[idx] = np.append(all_local_clusters[idx], j + total_clusters)

        total_clusters += n_local_clusters

    return all_local_clusters


# Embedding and clustering utility functions
def embed(texts):
    """
    Generate embeddings for a list of text documents.

    Parameters:
    - texts: List[str], a list of text documents to be embedded.

    Returns:
    - numpy.ndarray: An array of embeddings for the given text documents.
    """
    text_embeddings = embd.embed_documents(texts)
    return np.array(text_embeddings)


def embed_cluster_texts(texts):
    """
    Embeds a list of texts and clusters them, returning a DataFrame with texts, embeddings, and cluster labels.

    Parameters:
    - texts: List[str], a list of text documents to be processed.

    Returns:
    - pandas.DataFrame: DataFrame containing the original texts, embeddings, and cluster labels.
    """
    text_embeddings_np = embed(texts)
    cluster_labels = perform_clustering(text_embeddings_np, 10, 0.1)
    df = pd.DataFrame({
        "text": texts,
        "embd": list(text_embeddings_np),
        "cluster": cluster_labels
    })
    return df


def fmt_txt(df: pd.DataFrame) -> str:
    """
    Formats the text documents in a DataFrame into a single string.

    Parameters:
    - df: DataFrame containing the 'text' column with text documents to format.

    Returns:
    - A single string where all text documents are joined by a specific delimiter.
    """
    return "--- --- \n --- --- ".join(df["text"].tolist())


def embed_cluster_summarize_texts(
    texts: List[str], level: int
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Embeds, clusters, and summarizes a list of texts.

    Parameters:
    - texts: A list of text documents to be processed.
    - level: An integer parameter defining the depth or detail of processing.

    Returns:
    - Tuple containing two DataFrames:
      1. `df_clusters`: contains the original texts, embeddings, and cluster assignments.
      2. `df_summary`: contains summaries for each cluster and their identifiers.
    """
    df_clusters = embed_cluster_texts(texts)

    expanded_list = []
    for _, row in df_clusters.iterrows():
        for cluster in row["cluster"]:
            expanded_list.append({"text": row["text"], "embd": row["embd"], "cluster": cluster})

    expanded_df = pd.DataFrame(expanded_list)
    all_clusters = expanded_df["cluster"].unique()

    print(f"--Generated {len(all_clusters)} clusters--")

    # Summarization chain setup
    template = """
    Write a summary of the following, including as many key details as possible: {context}
    Max words: 150.
    Documentation:
    {context}
    """
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model | simple_output_parser

    summaries = []
    for i in all_clusters:
        df_cluster = expanded_df[expanded_df["cluster"] == i]
        formatted_txt = fmt_txt(df_cluster)
        summaries.append(chain.invoke({"context": formatted_txt}))

    df_summary = pd.DataFrame({
        "summaries": summaries,
        "level": [level] * len(summaries),
        "cluster": list(all_clusters),
    })

    return df_clusters, df_summary


def recursive_embed_cluster_summarize(
    texts: List[str], level: int = 1, n_levels: int = 3
) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Recursively embeds, clusters, and summarizes text documents over multiple levels.

    Parameters:
    - texts: List[str], the list of text documents.
    - level: Integer indicating the current depth of recursion (default 1).
    - n_levels: Integer indicating the number of recursion levels (default 3).

    Returns:
    - Dictionary where keys are the recursion levels, and values are tuples containing 
      DataFrames for clusters and summaries at that level.
    """
    dfs = {}
    df_clusters, df_summary = embed_cluster_summarize_texts(texts, level)

    if level <= n_levels:
        next_texts = df_summary["summaries"].tolist()
        dfs[level] = (df_clusters, df_summary)
        dfs.update(recursive_embed_cluster_summarize(next_texts, level + 1, n_levels))

    return dfs

In [51]:
'''
1 - find resources 
'''
texts = [
    # Cluster 1: Education News
    "The field of education is evolving rapidly with the integration of digital tools and online learning platforms, making education more accessible to people around the world.",
    "Adaptive learning technology is personalizing the educational experience, allowing students to learn at their own pace and according to their unique needs.",
    "Virtual reality (VR) is being used to create immersive learning environments, helping students gain practical knowledge in fields such as medicine, engineering, and history.",
    "Educational institutions are increasingly adopting AI-driven tools to automate administrative tasks and enhance the learning experience through smart content delivery.",
    "The shift towards remote and hybrid learning models has changed the traditional classroom dynamic, leading to innovations in teaching strategies and digital collaboration.",
    
    # Cluster 2: Criminal News
    "Chicago man charged in fatal shooting of suburban ...8 hours ago — Oak Park Police Detective Allan Reddins was the first Oak Park officer killed in the line of duty since 193.",
    "Violent crime rates have been rising in urban centers, with many communities facing challenges related to gang violence, robberies, and assaults. Law enforcement agencies are working to implement strategies for public safety and crime prevention.",
    "Recent studies indicate that cities with higher levels of economic disparity often experience more violent crime. Addressing issues such as poverty, education, and job opportunities is seen as a long-term solution to reduce violent incidents.",
    "The advent of new technologies has brought both opportunities and challenges to crime prevention. Surveillance systems, facial recognition, and data analytics are being used to combat violent crime but raise privacy concerns.",
    "Police departments are increasingly turning to community-based strategies to foster trust and cooperation with the public. Such initiatives aim to reduce violent crime rates through proactive measures and enhanced communication.",
    "Violent crime prevention programs that focus on youth engagement and education have shown positive results in reducing incidents. Programs that mentor young people and provide safe recreational activities are crucial in building safer communities.",
    "Homicides and aggravated assaults continue to challenge law enforcement agencies across the globe. Effective response strategies and public cooperation are key to reducing these crimes.",
    "Urban violence often correlates with socioeconomic issues. Solutions that aim to reduce violence include improved housing, better access to education, and job creation to alleviate the root causes of violent crime.",
    "There's a shooting incident.",
    
    # Cluster 3: Financial News
    "The financial market has been experiencing fluctuations due to global economic uncertainties, including geopolitical tensions and inflation concerns.",
    "Investment in sustainable and green technologies is on the rise as investors seek to support environmentally friendly initiatives while diversifying their portfolios.",
    "Cryptocurrency continues to attract attention from both investors and regulators, with debates about its potential for mainstream adoption and regulatory frameworks.",
    "Central banks around the world are adjusting interest rates in response to economic conditions, aiming to balance growth and inflation control.",
    "The stock market has seen significant movements in the tech sector, driven by innovations in artificial intelligence, cloud computing, and next-generation technologies."
]


# Perform recursive embedding, clustering, and summarization over 3 levels
results = recursive_embed_cluster_summarize(texts, level=1, n_levels=2)

--Generated 4 clusters--
--Generated 1 clusters--
--Generated 1 clusters--


In [52]:
# Display the output at the first level
df_clusters_level1, df_summary_level1 = results[1]
print("\nSummarized clusters (Level 1):")
print(df_summary_level1)


Summarized clusters (Level 1):
                                           summaries  level  cluster
0  The education sector is rapidly evolving due t...      1      1.0
1  A Chicago man has been charged in the fatal sh...      1      3.0
2  Violent crime rates are increasing in urban ar...      1      0.0
3  The financial market is currently experiencing...      1      2.0


In [53]:
df_summary_level1['summaries'].iloc[0] # cluster 2

'The education sector is rapidly evolving due to the integration of digital tools and online platforms, enhancing global accessibility. Adaptive learning technology personalizes education, enabling students to learn at their own pace based on individual needs. Virtual reality (VR) is being utilized to create immersive environments, providing practical knowledge in fields like medicine, engineering, and history. Additionally, educational institutions are increasingly implementing AI-driven tools to automate administrative tasks and improve learning experiences through smart content delivery. The transition to remote and hybrid learning models has transformed traditional classroom dynamics, fostering innovations in teaching strategies and digital collaboration.'

In [54]:
df_summary_level1['summaries'].iloc[1] # cluster 3

'A Chicago man has been charged in the fatal shooting of Oak Park Police Detective Allan Reddins, marking the first time an Oak Park officer has been killed in the line of duty since 193. This incident highlights ongoing challenges in crime prevention, as law enforcement grapples with rising homicide and aggravated assault rates. New technologies, such as surveillance systems, facial recognition, and data analytics, are being employed to combat violent crime, though they raise significant privacy concerns. In response, police departments are increasingly adopting community-based strategies to build trust and cooperation with the public, aiming to reduce crime through proactive measures and improved communication. Effective response strategies and public collaboration remain crucial in addressing these persistent issues in law enforcement.'

In [55]:
df_summary_level1['summaries'].iloc[2]

'Violent crime rates are increasing in urban areas, with communities grappling with gang violence, robberies, and assaults. Law enforcement is actively developing strategies for public safety and crime prevention. Research shows that cities with significant economic disparity tend to have higher violent crime rates, highlighting the need to address poverty, education, and job opportunities as long-term solutions. Effective violent crime prevention programs focusing on youth engagement and education have demonstrated success in reducing incidents. These initiatives, which include mentoring and providing safe recreational activities, are essential for fostering safer communities. Overall, urban violence is closely linked to socioeconomic issues, and addressing root causes through improved housing, education access, and job creation is vital for reducing violent crime.'

In [58]:
df_summary_level1['summaries'].iloc[3]

'The financial market is currently experiencing fluctuations due to global economic uncertainties, including geopolitical tensions and inflation concerns. Amidst this volatility, investment in sustainable and green technologies is increasing as investors aim to support environmentally friendly initiatives while diversifying their portfolios. Cryptocurrency remains a focal point for both investors and regulators, sparking discussions about its potential for mainstream adoption and the need for regulatory frameworks. Central banks worldwide are adjusting interest rates to balance economic growth and inflation control. Additionally, the stock market has seen notable movements in the tech sector, propelled by innovations in artificial intelligence, cloud computing, and next-generation technologies.'

In [56]:
# Display the output at the first level
df_clusters_level2, df_summary_level2 = results[2]
print("\nSummarized clusters (Level 2):")
print(df_summary_level2)


Summarized clusters (Level 2):
                                           summaries  level  cluster
0  The education sector is rapidly evolving with ...      2        0


In [57]:
df_summary_level2['summaries'].iloc[0]

'The education sector is rapidly evolving with digital tools and online platforms enhancing global accessibility. Adaptive learning technology allows personalized education, while virtual reality (VR) creates immersive learning environments in fields like medicine and engineering. AI-driven tools are being implemented to automate administrative tasks and improve learning experiences. The shift to remote and hybrid learning models has transformed classroom dynamics, fostering innovative teaching strategies.\n\nIn law enforcement, a Chicago man has been charged with the fatal shooting of Oak Park Police Detective Allan Reddins, the first officer killed in the line of duty since 193. This incident underscores challenges in crime prevention amid rising homicide rates. Police are adopting new technologies and community-based strategies to build public trust and reduce crime.\n\nUrban areas are facing increasing violent crime rates linked to socioeconomic disparities. Effective prevention pr

In [70]:
a = ["abc"]
b = "def"

In [74]:
a = ["abc"] # this is level 0 stuff
b = ["def"] # this is RAPTOR summerized stuff
a.extend(b) # add element
print(a)  

['abc', 'def']


In [75]:
type(all_texts)

list

In [42]:
from langchain_chroma import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain import hub

# Initialize all_texts with your preprocessed text
all_texts = texts.copy()

# Iterate through the results to extract summaries and add them to all_texts
for level in sorted(results.keys()):
    summaries = results[level][1]["summaries"].tolist()
    print("curr summaries", summaries)
    all_texts.extend(summaries) # -> extend ?????
    print("One level is done")

curr summaries ['The education sector is rapidly evolving due to the integration of digital tools and online platforms, enhancing global accessibility. Adaptive learning technology personalizes education, enabling students to learn at their own pace based on individual needs. Virtual reality (VR) is being utilized to create immersive environments, providing practical knowledge in fields like medicine, engineering, and history. Additionally, educational institutions are increasingly implementing AI-driven tools to automate administrative tasks and improve learning experiences through smart content delivery. The transition to remote and hybrid learning models has transformed traditional classroom dynamics, fostering innovations in teaching strategies and digital collaboration.', 'A Chicago man has been charged in the fatal shooting of Oak Park Police Detective Allan Reddins, marking the first time an Oak Park officer has been killed in the line of duty since 193. This incident highlights

In [None]:
# Iterate through the results to extract summaries and add them to all_texts
for level in sorted(results.keys()):
    summaries = results[level][2]["summaries"].tolist()
    print("curr summaries", summaries)
    all_texts.extend(summaries)
    print("One level is done")

In [None]:
# This is your tree
# all_texts