In [57]:
import openai
import pandas as pd
import json
import time
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv
import os

# Load environment variables (python-dotenv) before reading the API key.
load_dotenv()

client = openai.OpenAI(api_key=os.getenv("API_KEY"))

# Load the dataset.
file_path = "results/generalized_normalized_ner_with_embeddings.xlsx"  # Update with your actual file path
try:
    df = pd.read_excel(file_path)
except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")
    # Stop with a non-zero status: a bare exit() reports success (status 0)
    # even though nothing could be loaded.
    raise SystemExit(1)
else:
    print(f"‚úÖ Loaded dataset with {len(df)} rows.")

# Load the sentence-embedding model used to map relationship types onto
# the standard categories below.
print("üîÑ Loading sentence embedding model for category mapping...")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
print("‚úÖ Model loaded successfully.")

# Predefined relationship categories: category name -> known variations.
# New categories are added to this dict at runtime when no match is found.
standard_relationships = {
    "legal_violation": ["legal", "lawsuit", "court_case", "regulatory_violation"],
    "corporate_action": ["business_decision", "merger", "acquisition"],
    "government_regulation": ["regulation", "policy", "law_enforcement"],
    "geographical_association": ["location", "place", "geographical"],
    "crime_involvement": ["criminal", "crime", "fraud", "arrest", "indictment"],
    "financial_transaction": ["finance", "funding", "money_transfer", "seizure", "investment"],
    "scientific_discovery": ["science", "discovery", "innovation", "technology_breakthrough"],
    "historical_event": ["history", "historical_event", "war", "political_event"]
}

# ‚úÖ FUNCTION TO GROUP SIMILAR RELATIONSHIP TYPES
def map_to_standard_relationship(relationship_type):
    """Map a free-form relationship type onto a standard category.

    The type is lower-cased, underscores become spaces, and its embedding is
    compared (cosine similarity) with every category's variations plus the
    category name itself. The category with the highest similarity above 0.7
    wins. If none qualifies, the type is registered as a new category in
    ``standard_relationships``.

    Args:
        relationship_type: Relationship type string; a falsy value
            (``None``, ``""``) is reported as ``"unknown"``.

    Returns:
        The name of the matched (or newly created) category.
    """
    if not relationship_type:
        return "unknown"

    relationship_type = relationship_type.lower().replace("_", " ")

    # The query does not change between categories, so encode it once.
    query_embedding = embedding_model.encode(relationship_type, convert_to_tensor=True)

    best_match = None
    best_score = 0.7  # minimum similarity required to accept a category

    for standard, variations in standard_relationships.items():
        # Build a fresh candidate list instead of appending to `variations`:
        # appending would grow the shared dictionary on every call.
        candidates = [*variations, standard]
        candidate_embeddings = embedding_model.encode(candidates, convert_to_tensor=True)

        # reshape(-1) always yields a 1-D tensor, so tolist() returns a list
        # even when there is a single candidate.
        similarities = util.pytorch_cos_sim(query_embedding, candidate_embeddings).reshape(-1).tolist()
        max_similarity = max(similarities)

        if max_similarity > best_score:
            best_match = standard
            best_score = max_similarity

    if best_match is None:
        # Nothing was close enough: register the type as its own category.
        best_match = relationship_type.replace(" ", "_")
        standard_relationships[best_match] = [relationship_type]
        print(f"üîÑ New relationship type detected and added: {best_match}")

    return best_match

# ‚úÖ Function to format the prompt for GPT-4o
def generate_prompt(text, entities):
    """Build the relationship-extraction prompt for GPT-4o.

    Args:
        text: The excerpt to analyse.
        entities: The named entities found in the excerpt; interpolated
            into the prompt as-is.

    Returns:
        The prompt string, containing the instructions, the expected JSON
        shape, the text and the entities.
    """
    # The text and entities must be part of the prompt; without them the
    # model has nothing to extract relationships from.
    return f"""
You are an expert in relationship extraction. Your task is to extract and classify all possible relationships between named entities in the given text.

1. Extract ALL possible and unique relationships between ALL entity pairs
   - Identify explicit relationships (clearly stated in the text).
   - Identify implicit relationships (that can be inferred based on real-world knowledge).
   - If no meaningful relationship exists, set "relationship_type": "null"

2. Format the Output as JSON (STRICTLY JSON):
   ```json
   {{
     "relationships": [
       {{
         "head": "<Entity 1>",
         "tail": "<Entity 2>",
         "relationship": "<Description of relationship>",
         "relationship_type": "<Categorized relationship type>"
       }},
       ...
     ]
   }}
   ```

Text to Analyze: {text}

Entities: {entities}
"""

def extract_relationships_gpt(text, entities):
    """Ask GPT-4o for the relationships between the entities of one excerpt.

    Args:
        text: The excerpt to analyse.
        entities: The entities found in the excerpt (passed to the prompt).

    Returns:
        A dict with a ``"relationships"`` list. The same shape is returned
        when the request or the JSON parsing fails (with an empty list), so
        callers can always read ``result["relationships"]``.
    """
    prompt = generate_prompt(text, entities)

    try:
        print("üì§ Sending request to GPT-4o...")
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "system", "content": prompt}],
            temperature=0.3,
            response_format={"type": "json_object"},
        )
        response_content = response.choices[0].message.content
    except Exception as e:
        # Boundary with the remote API: report and degrade to "no relationships".
        print(f"‚ùå API request failed: {e}")
        return {"relationships": []}

    print("‚úÖ GPT-4o response received.")

    try:
        extracted_relationships = json.loads(response_content)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a response whose content is None.
        print(response_content)
        print(f"‚ùå Error parsing GPT-4o response as JSON: {e}")
        return {"relationships": []}

    # Guard against valid JSON that does not have the requested structure.
    if not isinstance(extracted_relationships, dict):
        print("‚ö†Ô∏è Unexpected GPT-4o response structure; no relationships extracted.")
        return {"relationships": []}
    if not isinstance(extracted_relationships.get("relationships"), list):
        extracted_relationships["relationships"] = []

    return extracted_relationships

results = []
# Only the first five rows are processed in this run.
for index, row in df[0:5].iterrows():
    text = row["Text"]
    entities = row["Normalized_NER_Entities"]
    # Empty spreadsheet cells are read as NaN, which is truthy, so a plain
    # `not text` check lets them through; require non-blank strings instead.
    if (
        not isinstance(text, str)
        or not isinstance(entities, str)
        or not text.strip()
        or not entities.strip()
    ):
        print(f"‚ö†Ô∏è Skipping row {index + 1} due to missing data.")
        continue

    print(f"üîé Processing excerpt {index + 1}/{len(df)}...")

    relationships = extract_relationships_gpt(text, entities)
    # A failed request may yield something other than the expected mapping;
    # normalise it so the loop below and the saved output keep one shape.
    if not isinstance(relationships, dict):
        relationships = {"relationships": []}

    for rel in relationships.get("relationships") or []:
        if not isinstance(rel, dict):
            continue
        # A missing type is handled by the mapper (falsy -> "unknown").
        original_type = rel.get("relationship_type")
        rel["relationship_type"] = map_to_standard_relationship(original_type)
        if original_type != rel["relationship_type"]:
            print(f"üîÑ Grouped '{original_type}' under '{rel['relationship_type']}'")

    results.append({
        "excerpt": f"Excerpt {index + 1}",
        "relationships": relationships
    })

    # Pause between API requests.
    time.sleep(2)


output_file = "results/extracted_relationships_jcc.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print(f"‚úÖ Relationship extraction complete! Results saved to {output_file}")



‚úÖ Loaded dataset with 1509 rows.
üîÑ Loading sentence embedding model for category mapping...
‚úÖ Model loaded successfully.
üîé Processing excerpt 1/1509...
üì§ Sending request to GPT-4o...
‚úÖ GPT-4o response received.
üîÑ New relationship type detected and added: family
üîÑ New relationship type detected and added: residence
üîÑ New relationship type detected and added: employment
üîÑ Grouped 'location' under 'geographical_association'
üîé Processing excerpt 2/1509...
üì§ Sending request to GPT-4o...
‚úÖ GPT-4o response received.
üîÑ Grouped 'location' under 'geographical_association'
üîé Processing excerpt 3/1509...
üì§ Sending request to GPT-4o...
‚úÖ GPT-4o response received.
üîÑ Grouped 'location' under 'geographical_association'
üîé Processing excerpt 4/1509...
üì§ Sending request to GPT-4o...
‚úÖ GPT-4o response received.
üîÑ New relationship type detected and added: founder
üîÑ New relationship type detected and added: leadership
üîÑ New relationship type 

In [73]:
import openai
import pandas as pd
import json
import time
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv
import os
import re
import ast
import itertools

# Load environment variables (python-dotenv) before reading the API key.
load_dotenv()

client = openai.OpenAI(api_key=os.getenv("API_KEY"))

# Load the dataset.
file_path = "results/generalized_normalized_ner_with_embeddings.xlsx"  # Update with your actual file path
try:
    df = pd.read_excel(file_path)
except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")
    # Stop with a non-zero status: a bare exit() reports success (status 0)
    # even though nothing could be loaded.
    raise SystemExit(1)
else:
    print(f"‚úÖ Loaded dataset with {len(df)} rows.")

# Load the sentence-embedding model used for category mapping.
print("üîÑ Loading sentence embedding model for category mapping...")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
print("‚úÖ Model loaded successfully.")

# ‚úÖ Predefined Relationship Categories
# standard_relationships = {
#     "legal_violation": ["legal", "lawsuit", "court_case", "regulatory_violation"],
#     "corporate_action": ["business_decision", "merger", "acquisition"],
#     "government_regulation": ["regulation", "policy", "law_enforcement"],
#     "geographical_association": ["location", "place", "geographical"],
#     "crime_involvement": ["criminal", "crime", "fraud", "arrest", "indictment"],
#     "financial_transaction": ["finance", "funding", "money_transfer", "seizure", "investment"],
#     "scientific_discovery": ["science", "discovery", "innovation", "technology_breakthrough"],
#     "historical_event": ["history", "historical_event", "war", "political_event"]
# }

def load_standard_relationships(relations_file="standard_relationships.json"):
    """Load the relationship dictionary from JSON, or return the defaults.

    Args:
        relations_file: Path of the JSON file holding the dictionary.

    Returns:
        The dictionary read from ``relations_file`` when the file exists, is
        non-empty and contains a JSON object; otherwise a fresh copy of the
        built-in default categories (category name -> list of variations).
    """
    if os.path.exists(relations_file) and os.path.getsize(relations_file) > 0:
        try:
            with open(relations_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except json.JSONDecodeError:
            print("‚ö†Ô∏è Relationship dictionary corrupted. Resetting to default.")
        else:
            if isinstance(loaded, dict):
                return loaded
            # Valid JSON but not a mapping: unusable as a category dictionary.
            print("‚ö†Ô∏è Relationship dictionary corrupted. Resetting to default.")

    return {
        "legal_violation": ["legal", "lawsuit", "court_case", "regulatory_violation"],
        "corporate_action": ["business_decision", "merger", "acquisition"],
        "government_regulation": ["regulation", "policy", "law_enforcement"],
        "geographical_association": ["location", "place", "geographical"],
        "crime_involvement": ["criminal", "crime", "fraud", "arrest", "indictment"],
        "financial_transaction": ["finance", "funding", "money_transfer", "seizure", "investment"],
        "scientific_discovery": ["science", "discovery", "innovation", "technology_breakthrough"],
        "historical_event": ["history", "historical_event", "war", "political_event"],
    }

# Load the relationship categories (from the JSON file if usable, otherwise the defaults).
standard_relationships = load_standard_relationships()


# ‚úÖ Function to Save Updated Relationship Dictionary
def save_standard_relationships(relations_file="standard_relationships.json"):
    """Write the module-level ``standard_relationships`` dict to a JSON file.

    Each category's variations are de-duplicated in place before saving.

    Args:
        relations_file: Path of the JSON file to write.
    """
    for key, variations in standard_relationships.items():
        if isinstance(variations, (set, frozenset)):
            # Sets have no stable order; sort so the saved file is reproducible.
            standard_relationships[key] = sorted(variations)
        else:
            # dict.fromkeys drops duplicates but keeps first-seen order, so
            # the file does not get reshuffled on every save.
            standard_relationships[key] = list(dict.fromkeys(variations))

    with open(relations_file, "w", encoding="utf-8") as f:
        json.dump(standard_relationships, f, indent=4, ensure_ascii=False)

    print(f"‚úÖ Updated relationship dictionary saved to {relations_file}")


# ‚úÖ Function to Group Similar Relationship Types
def map_to_standard_relationship(relationship_type):
    """Map a relationship type to a standard category using semantic similarity.

    The type is lower-cased, underscores become spaces, and its embedding is
    compared (cosine similarity) with each category's variations. The
    category with the highest similarity above 0.7 wins. If none qualifies,
    the type is registered as a new category in ``standard_relationships``
    and the dictionary is saved.

    Args:
        relationship_type: Relationship type; non-string values are
            converted with ``str()``.

    Returns:
        The name of the matched (or newly created) category.
    """
    if not isinstance(relationship_type, str):
        print(f"‚ö†Ô∏è Warning: Converting {relationship_type} to string.")
        relationship_type = str(relationship_type)

    relationship_type = relationship_type.lower().replace("_", " ")

    # The query does not change between categories, so encode it once.
    query_embedding = embedding_model.encode([relationship_type], convert_to_tensor=True)

    best_match = None
    best_score = 0.7  # minimum similarity required to accept a category

    for standard, variations in standard_relationships.items():
        candidates = list(dict.fromkeys(variations))  # unique, order kept
        if not candidates:
            # Nothing to compare against; max() on an empty list would raise.
            continue

        candidate_embeddings = embedding_model.encode(candidates, convert_to_tensor=True)

        # reshape(-1) always yields a 1-D tensor, so tolist() returns a list
        # even when the category has a single variation.
        similarities = util.pytorch_cos_sim(query_embedding, candidate_embeddings).reshape(-1).tolist()
        max_similarity = max(similarities)

        if max_similarity > best_score:
            best_match = standard
            best_score = max_similarity

    if best_match is None:
        best_match = relationship_type.replace(" ", "_")
        # The key may already exist holding a list (e.g. loaded from JSON),
        # so extend a list copy rather than calling set.add on it.
        known_variations = list(standard_relationships.get(best_match, []))
        if relationship_type not in known_variations:
            known_variations.append(relationship_type)
        standard_relationships[best_match] = known_variations
        print(f"üîÑ New relationship type detected and added: {best_match}")
        save_standard_relationships()

    # Keep the matched category's variations as a unique list.
    standard_relationships[best_match] = list(dict.fromkeys(standard_relationships[best_match]))

    return best_match

def split_entities(entities, batch_size=5):
    """Build all unique entity-name pairs and split them into batches.

    Args:
        entities: Sequence of entity records whose first element is the
            entity name (e.g. ``["Alice", "PERSON"]``).
        batch_size: Maximum number of pairs per batch; must be >= 1.

    Returns:
        A list of batches, each a list of ``(name_a, name_b)`` tuples.
        Empty when fewer than two distinct names are given.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently produce no batches at all.
        raise ValueError("batch_size must be a positive integer")

    # De-duplicate names (first-seen order) so that a repeated entity does
    # not produce self-pairs or the same pair several times.
    entity_names = list(dict.fromkeys(e[0] for e in entities))
    all_combinations = list(itertools.combinations(entity_names, 2))
    return [all_combinations[i:i + batch_size] for i in range(0, len(all_combinations), batch_size)]

# ‚úÖ Function to Generate All Possible Entity Pairs
def get_entity_pairs(entities):
    """Return every 2-combination of entity names, in input order.

    The name is taken from the first element of each entity record.
    """
    names = (entity[0] for entity in entities)
    pairs = itertools.combinations(names, 2)
    return list(pairs)

def extract_relationships_gpt(text, entity_batch, max_retries=3, timeout=15):
    """Request the relationships for one batch of entity pairs from GPT-4o.

    The response is parsed as JSON (unwrapping a markdown code fence if
    present), and every ``relationship_type`` is mapped onto a standard
    category. Failed attempts are retried.

    Args:
        text: The excerpt to analyse.
        entity_batch: The entity pairs to cover in this request.
        max_retries: Maximum number of attempts.
        timeout: Timeout passed to the API call (presumably seconds).

    Returns:
        A list of dicts, each expected to hold ``"head"``, ``"tail"`` and a
        ``"relationships"`` list; ``[]`` when every attempt failed.
    """
    prompt = generate_prompt(text, entity_batch)

    for attempt in range(max_retries):
        try:
            print(f"üì§ Sending request to GPT-4o (Attempt {attempt+1})...")
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "system", "content": prompt}],
                temperature=0.3,
                timeout=timeout
            )
            response_content = response.choices[0].message.content
            print("‚úÖ GPT-4o response received.")

            # Debugging aid: show the raw response.
            print(f"üìù Raw GPT-4o Response:\n{response_content}")

            # Unwrap a markdown code fence if present; tolerate a missing
            # "json" tag and any whitespace/newline style around the payload.
            match = re.search(r"```(?:json)?\s*(.*?)\s*```", response_content, re.DOTALL)
            if match:
                response_content = match.group(1)

            extracted_relationships = json.loads(response_content)

            # Always work with a list, even if a single object came back.
            if isinstance(extracted_relationships, dict):
                extracted_relationships = [extracted_relationships]

            for rel in extracted_relationships:
                for relationship_entry in rel["relationships"]:
                    if not isinstance(relationship_entry["relationship_type"], str):
                        print(f"‚ö†Ô∏è Warning: Converting {relationship_entry['relationship_type']} to string.")
                        relationship_entry["relationship_type"] = str(relationship_entry["relationship_type"])

                    relationship_entry["relationship_type"] = map_to_standard_relationship(
                        relationship_entry["relationship_type"]
                    )

            return extracted_relationships
        except json.JSONDecodeError as e:
            print(f"‚ùå Error parsing GPT-4o response as JSON (Attempt {attempt+1}): {e}")
        except (KeyError, TypeError, AttributeError) as e:
            # Valid JSON (or an empty response) without the expected
            # structure: report it as such instead of as an API failure.
            print(f"‚ùå Unexpected GPT-4o response structure (Attempt {attempt+1}): {e!r}")
        except Exception as e:
            print(f"‚ùå API request failed (Attempt {attempt+1}): {e}")

        # Back off before the next attempt, but not after the last one.
        if attempt < max_retries - 1:
            time.sleep(2)

    return []

# ‚úÖ Function to Generate Prompt for GPT-4o
def generate_prompt(text, entity_pairs, entities=None):
    """Generate the GPT-4o prompt for a subset of entity pairs.

    Args:
        text: The excerpt to analyse.
        entity_pairs: Iterable of ``(head, tail)`` name pairs to cover.
        entities: Optional description of the entities to show in the
            prompt. Defaults to the distinct names found in
            ``entity_pairs``, joined with commas.

    Returns:
        The prompt string.
    """
    entity_pairs = list(entity_pairs)
    entity_pairs_text = "\n".join([f"- {pair[0]} ‚Üî {pair[1]}" for pair in entity_pairs])

    if entities is None:
        # Derive the entity list from the pairs so the prompt does not depend
        # on a module-level variable that happens to be named `entities`.
        unique_names = dict.fromkeys(name for pair in entity_pairs for name in pair)
        entities = ", ".join(str(name) for name in unique_names)

    return f"""
You are an expert in **relationship extraction**. Your task is to extract and classify **all possible relationships** between named entities in the given text.

---

### **üîπ Instructions:**
1. **Extract relationships for these specific entity pairs:**  
{entity_pairs_text}

2. **Extract ALL possible and unique relationships between ALL entity pairs given above**  
   - **Identify direct relationships** (clearly stated in the text).  
   - **Identify indirect relationships** (that can be inferred based on real-world knowledge or logical reasoning).  
   - **Consider connections through third entities** (e.g., A relates to C because of B).  
   - **Capture contextual relationships** (even if the entities are not explicitly linked in a single sentence).  
   - **If no relationship exists, return `"relationship_type": "null"`, but only if there is absolutely NO logical connection.**  

3. **Ensure the output is ALWAYS a JSON ARRAY `[]`, even if there is only one relationship.**  

4. **Format the Output as JSON ONLY:**  
   Each entity pair should be structured as follows:  
   - `"head"`: The first entity in the relationship.  
   - `"tail"`: The second entity in the relationship.  
   - `"relationships"`: A **list** containing **all relationships** between the two entities.  

---

### **üìå Example Output Format (Strict JSON Array):**
```json
[
  {{
    "head": "Entity 1",
    "tail": "Entity 2",
    "relationships": [
      {{
        "relationship": "First relationship description",
        "relationship_type": "First relationship category"
      }},
      {{
        "relationship": "Second relationship description",
        "relationship_type": "Second relationship category"
      }}
    ]
  }},
  {{
    "head": "Entity 3",
    "tail": "Entity 4",
    "relationships": [
      {{
        "relationship": "Third relationship description",
        "relationship_type": "Third relationship category"
      }}
    ]
  }}
]
```

Return ONLY a JSON array ([]), NOT a JSON object ({{}}).

--- 

Text to Analyze:{text}

Entities: {entities}


"""

def convert_entities_string(entities_str):
    """Parse the string form of an entity list into a Python object.

    The string is evaluated with ``ast.literal_eval``. A list made up only
    of tuples is returned as a list of lists; any other parsed value is
    returned unchanged. Unparseable input is reported and yields ``[]``.
    """
    try:
        parsed = ast.literal_eval(entities_str)
    except (SyntaxError, ValueError) as e:
        print(f"‚ùå Error parsing entities string: {e}")
        return []

    only_tuples = isinstance(parsed, list) and all(isinstance(item, tuple) for item in parsed)
    if only_tuples:
        # Normalise tuple records to lists.
        return [list(item) for item in parsed]
    return parsed

def load_existing_results(output_file):
    """Read previously saved results so an interrupted run can resume.

    Returns the parsed JSON content of ``output_file``, or ``[]`` when the
    file is missing, empty, or not valid JSON.
    """
    has_content = os.path.exists(output_file) and os.path.getsize(output_file) > 0
    if not has_content:
        return []

    try:
        with open(output_file, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except json.JSONDecodeError:
        print("‚ö†Ô∏è JSON file is corrupted or empty. Starting fresh...")
        return []

# ‚úÖ Function to save results incrementally (appends new data)
def save_results_incrementally(output_file, new_entry):
    """Append one excerpt's relationships to the JSON results file.

    The existing results are reloaded, the entry is skipped if its excerpt
    is already present, and otherwise the full list is written back.

    Args:
        output_file: Path of the JSON results file.
        new_entry: Dict with at least an ``"excerpt"`` key.
    """
    existing_results = load_existing_results(output_file)

    # Skip excerpts that were already saved.
    processed_excerpts = {entry["excerpt"] for entry in existing_results}
    if new_entry["excerpt"] in processed_excerpts:
        print(f"‚úÖ Skipping already processed: {new_entry['excerpt']}")
        return

    existing_results.append(new_entry)

    # Write to a temporary file and swap it in with os.replace, so that an
    # interruption mid-write cannot leave the results file truncated (which
    # would make the next run discard all progress as "corrupted").
    temp_file = f"{output_file}.tmp"
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(existing_results, f, indent=4, ensure_ascii=False)
    os.replace(temp_file, output_file)

    print(f"‚úÖ Progress saved to {output_file}")

# ‚úÖ Output file path
output_file = "results/extracted_relationships_multiple.json"

# Load existing results to resume from where the previous run stopped.
existing_results = load_existing_results(output_file)

# Excerpts that are already saved are not processed again.
processed_excerpts = {entry["excerpt"] for entry in existing_results}

results = existing_results  # Continue appending to existing results

# Main loop: one excerpt per dataset row.
for index, row in df.iterrows():
    raw_text = row.get("Text", "")
    raw_entities = row.get("Normalized_NER_Entities", "[]")

    # Empty spreadsheet cells are read as NaN (a float), which has no
    # .strip(); treat any non-string cell as missing data.
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    entities = raw_entities.strip() if isinstance(raw_entities, str) else ""

    if not text or not entities:
        print(f"‚ö†Ô∏è Skipping row {index + 1} due to missing data.")
        continue

    excerpt_name = f"Excerpt {index + 1}"

    # Skip if already processed.
    if excerpt_name in processed_excerpts:
        print(f"‚úÖ Skipping already processed: {excerpt_name}")
        continue

    print(f"üîé Processing {excerpt_name} ({index + 1}/{len(df)})...")

    # One API request per batch of entity pairs.
    entity_batches = split_entities(convert_entities_string(entities), batch_size=5)

    all_relationships = []
    for batch in entity_batches:
        print(f"üîÑ Processing batch: {batch}")
        relationships = extract_relationships_gpt(text, batch)
        all_relationships.extend(relationships)

    new_entry = {
        "excerpt": excerpt_name,
        "relationships": all_relationships
    }

    # Persist after every excerpt so an interrupted run can resume.
    save_results_incrementally(output_file, new_entry)

    # Pause between excerpts.
    time.sleep(2)

print(f"‚úÖ Relationship extraction complete! Results saved to {output_file}")



‚úÖ Loaded dataset with 1509 rows.
üîÑ Loading sentence embedding model for category mapping...
‚úÖ Model loaded successfully.
‚úÖ Skipping already processed: Excerpt 1
‚úÖ Skipping already processed: Excerpt 2
‚úÖ Skipping already processed: Excerpt 3
‚úÖ Skipping already processed: Excerpt 4
‚úÖ Skipping already processed: Excerpt 5
‚úÖ Skipping already processed: Excerpt 6
‚úÖ Skipping already processed: Excerpt 7
‚úÖ Skipping already processed: Excerpt 8
‚úÖ Skipping already processed: Excerpt 9
‚úÖ Skipping already processed: Excerpt 10
‚úÖ Skipping already processed: Excerpt 11
‚úÖ Skipping already processed: Excerpt 12
‚úÖ Skipping already processed: Excerpt 13
‚úÖ Skipping already processed: Excerpt 14
‚úÖ Skipping already processed: Excerpt 15
‚úÖ Skipping already processed: Excerpt 16
‚úÖ Skipping already processed: Excerpt 17
‚úÖ Skipping already processed: Excerpt 18
‚úÖ Skipping already processed: Excerpt 19
‚úÖ Skipping already processed: Excerpt 20
‚úÖ Skipping already 

KeyboardInterrupt: 