In [None]:
import pandas as pd

In [None]:
# Load the source table. Later cells read its 'text', 'video_id', 'start_time'
# and 'end_time' columns.
# NOTE(review): the path is hard-coded under /content and has no file extension;
# presumably a CSV export of one video's transcript segments — confirm.
df=pd.read_csv('/content/ZenerDiodeBasicsSymbolCharacteristicsApplicationsProsConsExplained')
# Bare last expression so the notebook renders the loaded frame.
df



In [None]:
# Remove the 'Unnamed: 0' column (presumably an index column written by an earlier
# export — confirm). Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
!pip install transformers sentencepiece --quiet

In [None]:
!pip install nltk spacy transformers
!python -m nltk.downloader punkt stopwords
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re
from tqdm import tqdm
import nltk
# NLTK resources needed by preprocess_text(): tokenizer data and the stop-word list.
# Both 'punkt' and 'punkt_tab' are requested, as in the original cell
# (presumably to cover different NLTK releases — confirm).
for nltk_resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Decide once whether a CUDA device is visible; both models follow this choice.
use_cuda = torch.cuda.is_available()

# REBEL: seq2seq relation-extraction model used by process_dataframe().
rebel_model_name = "Babelscape/rebel-large"
rebel_tokenizer = AutoTokenizer.from_pretrained(rebel_model_name)
rebel_model = AutoModelForSeq2SeqLM.from_pretrained(rebel_model_name).to("cuda" if use_cuda else "cpu")

# NER pipeline used to attach entity types to triple heads/tails.
# aggregation_strategy="simple" is the supported spelling of the deprecated
# grouped_entities=True flag: word pieces are merged into whole entities that
# carry 'word' and 'entity_group' keys.
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
    device=0 if use_cuda else -1,
)

# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# once per kernel instead of once per processed row.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words_for(language):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them on first use."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Normalise a text segment before it is tokenised for REBEL.

    Steps, in order: newlines become spaces, straight/curly quotes are removed,
    whitespace runs are collapsed, the text is lower-cased, every remaining
    character that is neither a word character nor whitespace is dropped, the
    result is tokenised with NLTK's ``word_tokenize`` and English stop words
    are filtered out.

    Args:
        text: The raw segment text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = re.sub(r'\n+', ' ', text)

    text = re.sub(r'["“”‘’]', '', text)
    text = re.sub(r'\s+', ' ', text)

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    stop_words = _stop_words_for("english")
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

    return " ".join(kept_tokens)

def extract_rebel_triplets(decoded_text):
    """Parse REBEL's linearised output into ``(head, relation, tail)`` tuples.

    REBEL writes each fact as ``<triplet> head <subj> tail <obj> relation``:
    the text after ``<subj>`` is the tail entity and the text after ``<obj>``
    is the relation label. One head may be followed by several
    ``<subj> tail <obj> relation`` groups; every group yields its own triple.

    Args:
        decoded_text: Model output decoded with special tokens kept.

    Returns:
        A list of ``(head, relation, tail)`` tuples with leftover tags such as
        ``<s>``, ``</s>`` and ``<pad>`` removed. Incomplete groups and groups
        with an empty field are skipped. A non-string input is reported and
        yields an empty list.
    """
    if not isinstance(decoded_text, str):
        # Best effort, as before: report the problem and keep the run going.
        print(f"REBEL parsing error: expected str, got {type(decoded_text).__name__}")
        return []

    def _clean(fragment):
        # Drop any remaining <tag> / </tag> markers and surrounding whitespace.
        return re.sub(r'</?[a-z]+>', '', fragment).strip()

    triples = []
    for segment in decoded_text.split('<triplet>'):
        head_part, *object_groups = segment.split('<subj>')
        head = _clean(head_part)
        if not head:
            continue

        for group in object_groups:
            pieces = group.split('<obj>')
            if len(pieces) < 2:
                # A tail without a relation: the generation was cut short.
                continue
            tail = _clean(pieces[0])
            relation = _clean(pieces[1])
            if tail and relation:
                triples.append((head, relation, tail))
    return triples
def get_entity_type(entity_text, ner_results):
    """Look up the NER label for an entity string.

    Scans ``ner_results`` in order and returns the ``'entity_group'`` of the
    first entry whose ``'word'`` contains ``entity_text`` as a case-insensitive
    substring. Returns ``"UNKNOWN"`` when no entry matches.
    """
    matching_labels = (
        entity['entity_group']
        for entity in ner_results
        if entity_text.lower() in entity['word'].lower()
    )
    return next(matching_labels, "UNKNOWN")

def process_dataframe(df):
    """Extract typed relation triples for every row and store them on ``df``.

    For each row the ``'text'`` value is normalised with ``preprocess_text``,
    passed through REBEL (input truncated to 512 tokens, beam search with 5
    beams) and parsed with ``extract_rebel_triplets``. NER runs on the
    original, un-normalised text, and ``get_entity_type`` labels each head and
    tail. Every triple becomes a dict that also carries the row's
    ``'video_id'``, ``'start_time'``, ``'end_time'`` and original text.

    Rows whose text is missing (e.g. NaN) or blank get an empty list instead
    of aborting the whole run.

    Args:
        df: DataFrame with ``'text'``, ``'video_id'``, ``'start_time'`` and
            ``'end_time'`` columns.

    Returns:
        The same ``df`` object, modified in place with a new
        ``'enriched_triples'`` column holding one list of dicts per row.
    """
    all_enriched_triples = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        original_text = row['text']

        # Missing or blank text cannot yield triples; keep row alignment and move on.
        if not isinstance(original_text, str) or not original_text.strip():
            all_enriched_triples.append([])
            continue

        clean_text = preprocess_text(original_text)

        inputs = rebel_tokenizer(clean_text, return_tensors="pt", truncation=True, max_length=512).to(rebel_model.device)
        outputs = rebel_model.generate(**inputs, max_new_tokens=512, num_beams=5)
        # Special tokens are kept because the triple markers are needed by the parser.
        decoded_output = rebel_tokenizer.decode(outputs[0], skip_special_tokens=False)

        triples = extract_rebel_triplets(decoded_output)

        # NER sees the original casing and punctuation, not the normalised text.
        ner_results = ner_pipeline(original_text)

        enriched_triples = [
            {
                'video_id': row['video_id'],
                'start_time': row['start_time'],
                'end_time': row['end_time'],
                'head': head,
                'head_type': get_entity_type(head, ner_results),
                'relation': relation,
                'tail': tail,
                'tail_type': get_entity_type(tail, ner_results),
                'text': original_text,
            }
            for head, relation, tail in triples
        ]

        all_enriched_triples.append(enriched_triples)

    df['enriched_triples'] = all_enriched_triples
    return df



In [None]:
# Run triple extraction over every row. process_dataframe() adds the
# 'enriched_triples' column to the frame it is given and returns that same
# object, so final_df and df refer to one DataFrame from here on.
final_df = process_dataframe(df)
from pprint import pprint
# Spot-check the triples extracted for the first row.
pprint(final_df['enriched_triples'].iloc[0])

In [None]:
import re

def fix_noisy_triples(triples):
    """Clean a list of triple dicts and drop the ones that look like noise.

    For every triple the head, relation and tail are stripped and any
    ``</s>`` marker is removed. A triple is dropped when

    * any of the three fields is empty,
    * head and tail are equal ignoring case,
    * the relation equals the head or the tail ignoring case, or
    * both entity types are ``'UNKNOWN'`` and the relation has more than
      four words.

    If the tail starts with a relation-like phrase ("part of", "type of", ...)
    the relation and tail are swapped. Runs of the same word in the relation
    ("is is is used") are collapsed to a single occurrence.

    NOTE(review): ``head_type``/``tail_type`` are copied through unchanged, so
    after a swap ``tail_type`` still describes the previous tail string.

    Args:
        triples: List of dicts with ``'head'``, ``'relation'`` and ``'tail'``
            keys and optional ``'head_type'``/``'tail_type'`` keys.

    Returns:
        A new list of new dicts; the input dicts are not modified.
    """
    # Phrases that mark a "tail" as actually being the relation label.
    tail_as_relation_patterns = ('part of', 'type of', 'used for', 'used by', 'based on', 'known as')
    fixed_triples = []

    for triple in triples:
        head = triple['head'].strip().replace('</s>', '').strip()
        relation = triple['relation'].strip().replace('</s>', '').strip()
        tail = triple['tail'].strip().replace('</s>', '').strip()
        head_type = triple.get('head_type', 'UNKNOWN')
        tail_type = triple.get('tail_type', 'UNKNOWN')

        if not head or not tail or not relation:
            continue
        if head.lower() == tail.lower():
            continue
        if relation.lower() in [head.lower(), tail.lower()]:
            continue

        if tail.lower().startswith(tail_as_relation_patterns):
            relation, tail = tail, relation

        if head_type == 'UNKNOWN' and tail_type == 'UNKNOWN' and len(relation.split()) > 4:
            continue

        # Collapse any run of a repeated word; the trailing "+" also handles
        # three or more repeats, which a single pairwise match would leave behind.
        relation = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', relation, flags=re.IGNORECASE)

        fixed_triple = triple.copy()
        fixed_triple.update({
            'head': head,
            'relation': relation,
            'tail': tail
        })
        fixed_triples.append(fixed_triple)

    return fixed_triples


In [None]:
# Clean every row's triple list in place of the raw REBEL output: the
# 'enriched_triples' column is overwritten with the result of fix_noisy_triples().
df['enriched_triples'] = df['enriched_triples'].apply(fix_noisy_triples)

In [None]:
import spacy

# spaCy pipeline used by validate_triple_pos() for part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

def validate_triple_pos(triple):
    """
    Part-of-speech sanity check for a triple dict, using the spaCy pipeline ``nlp``.

    The triple passes only when all three hold:
    - the head contains at least one NOUN or PROPN token
    - the tail contains at least one NOUN or PROPN token
    - the relation contains at least one VERB, ADP or ADJ token

    Returns True when the triple passes, False otherwise.
    """
    entity_tags = ('NOUN', 'PROPN')
    relation_tags = ('VERB', 'ADP', 'ADJ')

    def contains_tag(doc, allowed_tags):
        # True as soon as one token carries an allowed coarse POS tag.
        for token in doc:
            if token.pos_ in allowed_tags:
                return True
        return False

    # All three fields are parsed before any check is evaluated.
    parsed_head = nlp(triple['head'])
    parsed_tail = nlp(triple['tail'])
    parsed_relation = nlp(triple['relation'])

    checks = [
        contains_tag(parsed_head, entity_tags),
        contains_tag(parsed_tail, entity_tags),
        contains_tag(parsed_relation, relation_tags),
    ]
    return all(checks)


In [None]:
# Keep only the triples that pass the POS check, in a new 'validated_triples'
# column; 'enriched_triples' itself is left unchanged.
# NOTE(review): the Neo4j upload loop in this notebook reads 'enriched_triples',
# not 'validated_triples' — confirm which column is meant to be uploaded.
df['validated_triples'] = df['enriched_triples'].apply(
    lambda triples: [triple for triple in triples if validate_triple_pos(triple)]
)


In [None]:
!pip install sentence-transformers


In [None]:
# Inspect the available columns before the embedding column is added.
final_df.columns

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by embed_segments().
model = SentenceTransformer("all-MiniLM-L6-v2"  )  # swap in another checkpoint name here to change the embedding model/size

def embed_segments(df):
    """Add a ``'segment_embedding'`` column with one embedding per row of ``df``.

    All values of the ``'text'`` column are encoded in a single
    ``model.encode`` call with ``normalize_embeddings=True``, so the model can
    batch the work instead of being invoked once per row. Each embedding is
    stored as a plain Python list of floats.

    Args:
        df: DataFrame with a ``'text'`` column of strings.

    Returns:
        The same ``df`` object, modified in place.
    """
    texts = df["text"].tolist()
    # Skip the model entirely for an empty frame.
    vectors = model.encode(texts, normalize_embeddings=True) if texts else []
    # An explicit object Series keeps one list per cell and matches df's index exactly.
    df["segment_embedding"] = pd.Series(
        [vector.tolist() for vector in vectors], index=df.index, dtype=object
    )
    return df

# Embed every segment's text; adds the 'segment_embedding' column to final_df.
final_df = embed_segments(final_df)

In [None]:
# Inspect the per-row triple lists.
final_df["enriched_triples"]

In [None]:
# Preview the first rows with all derived columns.
final_df.head()

In [None]:
!pip install Neo4j

In [None]:
import os

# Neo4j connection settings. Each value is taken from the environment when the
# variable is set, so real credentials never have to be saved in the notebook;
# otherwise the original placeholder is kept and must be edited before connecting.
URI = os.environ.get("NEO4J_URI", "Your URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "Password")

In [None]:
from neo4j import GraphDatabase
import ast

# Authenticate with the configured username instead of a hard-coded "neo4j", so
# the NEO4J_USERNAME setting defined next to URI and NEO4J_PASSWORD takes effect.
driver = GraphDatabase.driver(URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

def push_to_neo4j(tx, video_id, start_time, end_time, text, embedding, triples):
    """Write one text segment and its triples to Neo4j through transaction ``tx``.

    Statements are issued in this order:
      1. MERGE the ``Video`` node for ``video_id``.
      2. MERGE the ``Section`` node (id built from the video id and both
         timestamps formatted with two decimals) and set its properties.
      3. MERGE ``(Video)-[:HAS_SECTION]->(Section)``.
      4. For every triple: MERGE the head and tail ``Entity`` nodes (type set
         only on creation), MERGE the ``RELATION`` edge between them, and
         MERGE the ``MENTIONS`` / ``MENTIONS_RELATION`` edges from the section
         to both entities.

    ``triples`` is an iterable of dicts with ``'head'``, ``'tail'`` and
    ``'relation'`` keys; ``'head_type'`` and ``'tail_type'`` default to
    ``'UNKNOWN'`` when absent.
    """
    section_key = "{}_{:.2f}_{:.2f}".format(video_id, float(start_time), float(end_time))
    # Parameters shared by the section-level statements.
    section_ref = {"video_id": video_id, "section_id": section_key}

    # Video node first; MERGE keeps it unique across sections.
    tx.run("""
        MERGE (v:Video {video_id: $video_id})
        ON CREATE SET v.name = $video_id
    """, video_id=video_id)

    # Section node with its attributes.
    tx.run("""
        MERGE (s:Section {id: $section_id})
        SET s.video_id = $video_id,
            s.start_time = $start_time,
            s.end_time = $end_time,
            s.text = $text,
            s.embedding = $embedding
    """, start_time=start_time, end_time=end_time,
         text=text, embedding=embedding, **section_ref)

    # Attach the section to its video.
    tx.run("""
        MATCH (v:Video {video_id: $video_id})
        MATCH (s:Section {id: $section_id})
        MERGE (v)-[:HAS_SECTION]->(s)
    """, **section_ref)

    for triple in triples:
        head_name = triple['head']
        tail_name = triple['tail']
        relation_name = triple['relation']
        head_label = triple.get('head_type', 'UNKNOWN')
        tail_label = triple.get('tail_type', 'UNKNOWN')
        # Parameters shared by the edge-level statements.
        edge_params = {"head": head_name, "tail": tail_name, "relation": relation_name}

        tx.run("""
            MERGE (h:Entity {name: $head})
            ON CREATE SET h.type = $head_type
        """, head=head_name, head_type=head_label)

        tx.run("""
            MERGE (t:Entity {name: $tail})
            ON CREATE SET t.type = $tail_type
        """, tail=tail_name, tail_type=tail_label)

        tx.run("""
            MATCH (h:Entity {name: $head})
            MATCH (t:Entity {name: $tail})
            MERGE (h)-[r:RELATION {type: $relation}]->(t)
        """, **edge_params)

        tx.run("""
            MATCH (s:Section {id: $section_id})
            MATCH (h:Entity {name: $head})
            MATCH (t:Entity {name: $tail})
            MERGE (s)-[:MENTIONS]->(h)
            MERGE (s)-[:MENTIONS]->(t)
            MERGE (s)-[:MENTIONS_RELATION {type: $relation}]->(h)
            MERGE (s)-[:MENTIONS_RELATION {type: $relation}]->(t)
        """, section_id=section_key, **edge_params)

# Upload every row of final_df: one managed write transaction per segment.
# NOTE(review): this reads 'enriched_triples'; the POS-filtered
# 'validated_triples' column is not used here — confirm which is intended.
try:
    with driver.session() as session:
        # Session.write_transaction is deprecated in favour of execute_write
        # (same call signature); prefer the new name and fall back on older drivers.
        run_write = getattr(session, "execute_write", None) or session.write_transaction

        for idx, row in final_df.iterrows():
            video_id = row['video_id']
            start_time = row['start_time']
            end_time = row['end_time']
            text = row['text']
            embedding = row['segment_embedding']

            triples = row['enriched_triples']
            # A string here is parsed back into a list of dicts (presumably
            # for frames reloaded from disk — confirm).
            if isinstance(triples, str):
                triples = ast.literal_eval(triples)

            run_write(push_to_neo4j, video_id, start_time, end_time, text, embedding, triples)
finally:
    # Release the driver's connections even when an upload fails part-way.
    driver.close()
