In [None]:
#import libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModel, pipeline
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any), then read the Hugging Face
# Hub token from the environment; the value is None when it is not defined.
load_dotenv()
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [None]:
# Load the CSV data.
# The file location can be overridden with the LLL_DATA_CSV environment
# variable (for example from the .env file) so the notebook also runs on
# machines that do not share the original directory layout. When the variable
# is not set, the original path is used unchanged.
csv_path = os.getenv(
    "LLL_DATA_CSV",
    r'C:\Users\Jade Ana-Maria\peckham1\NLP_for_Creatives\NLP_for_Creatives\final_project\data\preprocessed_lll_data.csv',
)
df = pd.read_csv(csv_path)

# Fail early, with a readable message, if the file lacks a column that the
# retrieval code below indexes by name.
required_columns = ['paragraph', 'source', 'title']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{csv_path} is missing required column(s): {missing_columns}")

# Three parallel lists: position i in each list describes the same CSV row.
paragraphs = df['paragraph'].tolist()
paragraph_sources = df['source'].tolist()
post_titles = df['title'].tolist()

Initialise the models

In [None]:
# 1. Sentence-BERT encoder: maps text (paragraphs and user queries) to
#    embedding vectors that can be compared with cosine similarity.
embedding_model = SentenceTransformer(
    'all-MiniLM-L6-v2',
)

In [None]:
# 2. Extractive question-answering pipeline: given a question and a context
#    string, it returns an answer span taken from that context.
#    The same checkpoint supplies both the model and its tokenizer.
qa_model = pipeline(
    task="question-answering",
    tokenizer="deepset/roberta-base-squad2",
    model="deepset/roberta-base-squad2",
)

In [None]:
# 3. Summarization pipeline for condensing long answers.
#    max_length / min_length are set at construction time, so they apply to
#    every call made through this pipeline object.
#    The Hub credential is passed as `token`; `use_auth_token` is deprecated
#    in transformers in favour of `token`.
#    NOTE(review): needs a transformers release whose pipeline() accepts
#    `token` -- confirm against the installed version.
summarizer_model = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    max_length=150,
    min_length=40,
    token=hf_token  # This will be None if no token is set
)


In [None]:
# 4. Zero-shot classification pipeline used to label the type of a user query.
#    The Hub credential is passed as `token`; `use_auth_token` is deprecated
#    in transformers in favour of `token`.
#    NOTE(review): needs a transformers release whose pipeline() accepts
#    `token` -- confirm against the installed version.
intent_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    token=hf_token,  # None when no token is configured
)

In [None]:
# Encode every paragraph once, up front; find_relevant_content() compares each
# query embedding against this pre-computed result.
paragraph_embeddings = embedding_model.encode(
    paragraphs,
)

In [None]:
# Semantic search over the pre-computed paragraph embeddings
def find_relevant_content(query, top_k=3):
    """Return the paragraphs most similar to ``query``, best match first.

    The query is embedded with ``embedding_model`` and compared against
    ``paragraph_embeddings`` using cosine similarity.

    Args:
        query: Text to search for.
        top_k: Maximum number of paragraphs to return. Values of zero or
            below yield an empty list.

    Returns:
        A list of at most ``top_k`` dicts, ordered by decreasing similarity,
        each with the keys ``'paragraph'``, ``'source'``, ``'title'`` and
        ``'score'`` (the cosine similarity to the query).
    """
    # Guard before slicing: with top_k == 0 the slice [-0:] below would select
    # *every* paragraph, and a negative top_k would select nearly all of them.
    if top_k <= 0:
        return []

    # Encode the query
    query_embedding = embedding_model.encode([query])[0]

    # One similarity score per stored paragraph
    similarities = cosine_similarity(
        [query_embedding],
        paragraph_embeddings
    )[0]

    # argsort is ascending: keep the last top_k indices and reverse them so
    # the most similar paragraph comes first.
    top_indices = similarities.argsort()[-top_k:][::-1]

    return [
        {
            'paragraph': paragraphs[idx],
            'source': paragraph_sources[idx],
            'title': post_titles[idx],
            'score': similarities[idx]
        }
        for idx in top_indices
    ]


In [None]:
def _item_covering_offset(relevant_content, char_offset):
    """Return the retrieved item whose paragraph contains ``char_offset``.

    ``char_offset`` is a character position in the context built by joining
    the paragraphs of ``relevant_content`` with single spaces. The first item
    is returned when the offset is missing or falls outside the context.
    """
    if char_offset is None:
        return relevant_content[0]

    span_start = 0
    for item in relevant_content:
        span_end = span_start + len(item['paragraph'])
        # <= span_end: the joining space is attributed to the paragraph before it
        if span_start <= char_offset <= span_end:
            return item
        span_start = span_end + 1  # skip the single-space separator
    return relevant_content[0]


def answer_question(query):
    """Answer ``query`` from the three most relevant paragraphs.

    The paragraphs are joined into one context and passed to ``qa_model``.
    When the model's score is below 0.1, or the model call fails, the most
    relevant paragraph is returned as the answer instead.

    Args:
        query: The user's question.

    Returns:
        A dict with the keys ``'answer'``, ``'source'``, ``'title'`` and
        ``'context'``. ``'source'`` and ``'title'`` identify the paragraph the
        answer was taken from. When nothing relevant is found, ``'answer'`` is
        an apology message and the other three values are ``None``.
    """
    # First, find relevant content
    relevant_content = find_relevant_content(query, top_k=3)

    if not relevant_content:
        return {
            'answer': "I'm sorry, I couldn't find information about that in the La Leche League resources.",
            'source': None,
            # Same key set as the success case, so callers can read 'title'
            'title': None,
            'context': None
        }

    # Combine the most relevant paragraphs into a context
    context = " ".join([item['paragraph'] for item in relevant_content])
    best_match = relevant_content[0]

    # Use question-answering to extract a specific answer
    try:
        qa_result = qa_model(
            question=query,
            context=context
        )

        answer = qa_result['answer']
        if qa_result['score'] < 0.1:
            # If confidence is low, use the whole top paragraph
            answer = best_match['paragraph']
            cited = best_match
        else:
            # The span can come from any of the joined paragraphs; cite the one
            # that contains it, not always the top-ranked paragraph.
            cited = _item_covering_offset(relevant_content, qa_result.get('start'))
    except Exception:
        # Best-effort fallback to the most relevant paragraph
        answer = best_match['paragraph']
        cited = best_match

    return {
        'answer': answer,
        'source': cited['source'],
        'title': cited['title'],
        'context': context
    }


In [None]:
def detect_intent(query):
    """Return the intent label the zero-shot classifier ranks first for ``query``."""
    # Candidate intents the query is scored against
    candidate_intents = ["question", "instruction", "personal story", "help request"]
    classification = intent_classifier(query, candidate_labels=candidate_intents)

    # The first entry of 'labels' is the classifier's top choice
    ranked_labels = classification['labels']
    return ranked_labels[0]

In [None]:
def summarize_content(text):
    """Summarize ``text`` with ``summarizer_model`` when it is long enough.

    Args:
        text: The text to condense.

    Returns:
        ``text`` unchanged when it has fewer than 50 whitespace-separated
        words or when summarization fails; otherwise the generated summary.
    """
    # Only summarize if text is long enough
    if len(text.split()) < 50:
        return text

    try:
        # truncation=True clips inputs that exceed the model's maximum input
        # length, so they are summarized instead of raising and falling
        # through to the unsummarized fallback below.
        summary = summarizer_model(text, truncation=True)
        return summary[0]['summary_text']
    except Exception:
        # Best-effort: fall back to the original text if summarization fails
        return text

In [None]:
def get_response(user_input):
    """Build the chatbot's reply to ``user_input``.

    The reply contains the answer (summarized when longer than 100 words), a
    pointer to the article it came from and, when available, related articles
    with their sources.

    Intent detection is not run here: its result was never used in the reply,
    and each call costs a zero-shot model inference.

    Args:
        user_input: The user's question.

    Returns:
        The formatted reply. When no article could be found, only the answer
        text is returned, without any citation.
    """
    result = answer_question(user_input)

    answer_text = result['answer']
    if len(answer_text.split()) > 100:
        answer_text = summarize_content(answer_text)

    # .get(): a not-found result may carry no 'title' key
    title = result.get('title')
    source = result.get('source')

    # Nothing was found: do not cite an article called 'None'
    if title is None and source is None:
        return answer_text

    response = f"{answer_text}\n\n"
    response += f"For more information please visit the La Leche League article: '{title}'\n"
    response += f"Source: {source}"

    # Add related topic suggestions based on the source article
    related_content = find_relevant_content(title, top_k=2)

    # Keep each suggested title next to its own source, in ranking order and
    # without duplicates, so the printed source matches the titles listed.
    related_titles = []
    related_sources = []
    for content in related_content:
        if content['title'] == title or content['title'] in related_titles:
            continue
        related_titles.append(content['title'])
        related_sources.append(content['source'])

    if related_titles:
        response += f"\n\nYou may also be interested in the following related topics: {', '.join(related_titles)} source: {', '.join(related_sources)}"

    return response

In [None]:
# Interactive loop: read questions until the user types 'exit'
print("Breastfeeding Information Assistant (type 'exit' to quit)")
print("Please ask your question regarding breastfeeding:")

while True:
    try:
        # Surrounding whitespace is dropped so ' exit ' still quits
        user_input = input("> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the kernel was interrupted at the prompt:
        # leave the loop quietly instead of showing a traceback.
        break

    if not user_input:
        # Blank line: there is nothing to answer, so do not call the models
        continue
    if user_input.lower() == 'exit':
        break

    response = get_response(user_input)
    print("\n" + response + "\n")