In [1]:
!pip install pyarrow
!pip install fastparquet

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd

# Lire le fichier Parquet
df = pd.read_parquet("./Top20_Centroide_HDBSCAN.parquet")

# Afficher les dimensions et les premières lignes
print(f"Dimensions du DataFrame : {df.shape}")
print(df.head())


Dimensions du DataFrame : (6560, 13)
                          file  \
0  meyers-a/deleted_items/351.   
1  meyers-a/deleted_items/355.   
2  meyers-a/deleted_items/354.   
3  meyers-a/deleted_items/436.   
4  meyers-a/deleted_items/343.   

                                             message  \
0  Message-ID: <15489782.1075841297698.JavaMail.e...   
1  Message-ID: <27677080.1075841297796.JavaMail.e...   
2  Message-ID: <27053622.1075841297772.JavaMail.e...   
3  Message-ID: <19044266.1075841299880.JavaMail.e...   
4  Message-ID: <19500157.1075841297502.JavaMail.e...   

                                        parsed_email  \
0  {'Message-ID': '<15489782.1075841297698.JavaMa...   
1  {'Message-ID': '<27677080.1075841297796.JavaMa...   
2  {'Message-ID': '<27053622.1075841297772.JavaMa...   
3  {'Message-ID': '<19044266.1075841299880.JavaMa...   
4  {'Message-ID': '<19500157.1075841297502.JavaMa...   

                                      Message-ID               X-FileName  \
0  <154

In [3]:
df.shape

(6560, 13)

In [7]:
import pandas as pd
import json
from langchain_openai import ChatOpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configuration de l'API
url = "https://aristote-dispatcher.mydocker-run-vd.centralesupelec.fr/v1"
llm = ChatOpenAI(
    base_url=url,
    model="casperhansen/llama-3-70b-instruct-awq",
    api_key="nothing"  # Remplacez "nothing" par votre clé API si nécessaire
)

# Prompt système
system_prompt = """
You are an expert in text classification and analysis. Your task is to analyze the provided emails and determine:
1. Whether the cluster of emails is of legal type (i.e., related to law, legal contracts, compliance, litigation, regulatory issues, or intellectual property).

Follow these rules to classify a cluster of emails as "legal" or "not legal":
- Consider a cluster as "legal" if it discusses topics like:
  - Contracts or agreements (e.g., "Please review the attached contract," "We need to sign the NDA").
  - Regulatory compliance (e.g., "This is required to comply with GDPR").
  - Legal disputes or litigation (e.g., "Our legal team is handling the case").
  - Intellectual property (e.g., patents, copyrights, trademarks).
  - Requests for legal advice (e.g., "Can you confirm if this complies with the law?").
  - Policies or guidelines that have legal implications (e.g., "New workplace harassment policy").
- Consider a cluster as "not legal" if it discusses operational, technical, personal, or non-legal matters, even if it uses formal language.

Include additional context if necessary to explain your decision.

Return the results in the following JSON format:
{
    "hdbscan_cluster": <hdbscan_cluster>,  
    "is_legal": <true/false>,  
    "justification": "<brief explanation of why the cluster was classified as legal or not legal>"  
}


Make sure your response is strictly valid JSON and that the classification is accurate based on the above instructions.
"""

# Fonction pour traiter un cluster d'e-mails
def process_cluster(row):
    hdbscan_cluster = row["hdbscan_cluster"]
    emails = row["final_body"]

    batch_text = "\n\n".join([f"Email {i+1}: {email}" for i, email in enumerate(emails)])
    batch_prompt = f"The following emails belong to cluster {hdbscan_cluster}:\n\n{batch_text}"

    messages = [
        ("system", system_prompt),
        ("human", batch_prompt),
    ]

    try:
        response = llm.invoke(messages)
        response_content = response.content if hasattr(response, "content") else None

        if response_content:
            start_idx = response_content.find("{")
            end_idx = response_content.rfind("}")
            if start_idx != -1 and end_idx != -1:
                json_part = response_content[start_idx:end_idx + 1]
                try:
                    result = json.loads(json_part)
                    result["hdbscan_cluster"] = hdbscan_cluster
                except json.JSONDecodeError:
                    result = {
                        "hdbscan_cluster": hdbscan_cluster,
                        "is_legal": None,
                        "justification": "Invalid JSON response from the model."
                    }
            else:
                result = {
                    "hdbscan_cluster": hdbscan_cluster,
                    "is_legal": None
                }
        else:
            result = {
                "hdbscan_cluster": hdbscan_cluster,
                "is_legal": None
            }

    except Exception as e:
        result = {
            "hdbscan_cluster": hdbscan_cluster,
            "is_legal": None
        }

    return result

# Grouper les e-mails par cluster
grouped_emails = df.groupby("hdbscan_cluster").agg({
    "final_body": list
}).reset_index()

# Liste pour stocker les résultats
results = []

# Utilisation de ThreadPoolExecutor pour le multithreading
with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_cluster = {executor.submit(process_cluster, row): row for _, row in grouped_emails.iterrows()}
    for future in tqdm(as_completed(future_to_cluster), total=len(grouped_emails), desc="Processing clusters"):
        try:
            result = future.result()
            results.append(result)
        except Exception as e:
            print(f"Error processing cluster: {e}")

# Convertir les résultats en DataFrame
results_df = pd.DataFrame(results)

# Ajouter la colonne "is_legal" au DataFrame initial
df = pd.merge(df, results_df[["hdbscan_cluster", "is_legal"]], on="hdbscan_cluster", how="left")

# Sauvegarder les résultats dans un fichier CSV
output_csv_file = "Legal_clusters_classification_hdbscan.csv"
df.to_csv(output_csv_file, index=False)

print(f"Analysis results have been saved to {output_csv_file}.")


Processing clusters: 100%|██████████| 328/328 [22:36<00:00,  4.14s/it]


Analysis results have been saved to Legal_clusters_classification_hdbscan.csv.


In [8]:
df.to_excel("Legal_clusters_classification_hdbscan.xlsx", index= False)

In [9]:
df.shape

(6560, 15)

In [10]:
df.columns

Index(['file', 'message', 'parsed_email', 'Message-ID', 'X-FileName', 'Body',
       'final_body', 'embedding', 'spam', 'From', 'KMeans_Labels',
       'hdbscan_cluster', 'Distance_to_Centroid', 'is_legal_x', 'is_legal_y'],
      dtype='object')