In [None]:
#import libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModel, pipeline
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
from dotenv import load_dotenv
import os

# Presumably reads a local .env file into the process environment so the
# Hugging Face token does not have to be hard-coded in the notebook — confirm
# a .env file is expected next to this notebook.
load_dotenv()
# Hugging Face API token taken from the environment (None when the variable is unset).
# NOTE: no trailing comma here — a trailing comma would wrap the value in a
# one-element tuple instead of yielding the token string.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [6]:
# Load the CSV data.
# The location defaults to the original local path but can be overridden with
# the LLL_DATA_CSV environment variable, so the notebook is not tied to a
# single machine's directory layout.
DEFAULT_CSV_PATH = r'C:\Users\Jade Ana-Maria\peckham1\NLP_for_Creatives\NLP_for_Creatives\final_project\data\preprocessed_lll_data.csv'
csv_path = os.getenv("LLL_DATA_CSV", DEFAULT_CSV_PATH)
df = pd.read_csv(csv_path)

# Rows without paragraph text cannot be embedded or joined into a context
# string, so they are excluded. All three lists are built from the same
# filtered frame so that index i refers to the same row in each of them.
rows_with_text = df.dropna(subset=['paragraph'])
paragraphs = rows_with_text['paragraph'].tolist()
paragraph_sources = rows_with_text['source'].tolist()
post_titles = rows_with_text['title'].tolist()

Initialise the models

In [7]:
# 1. Sentence-BERT encoder that turns text into embedding vectors.
#    The checkpoint name is kept in a constant so it is easy to spot and swap.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [8]:
# 2. Question-answering pipeline used to extract a specific answer span.
#    Model and tokenizer are loaded from the same checkpoint.
QA_CHECKPOINT = "deepset/roberta-base-squad2"
qa_model = pipeline("question-answering", model=QA_CHECKPOINT, tokenizer=QA_CHECKPOINT)

Device set to use cpu


In [9]:
# Create embeddings for each paragraph.
# This is computed once up front; find_relevant_content() compares every
# incoming query against this stored result, so entry i here must correspond
# to paragraphs[i].
paragraph_embeddings = embedding_model.encode(paragraphs)

In [12]:
# Define the find_relevant_content function
def find_relevant_content(query, top_k=3):
    """Return the stored paragraphs most similar to ``query``, best match first.

    Args:
        query: The user's question or search text.
        top_k: Maximum number of paragraphs to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of dicts ordered by descending cosine similarity. Each dict has
        the keys 'paragraph', 'source', 'title' and 'score' (a plain float).
    """
    # Guard first: with top_k == 0 a slice such as [-0:] selects *every*
    # element rather than none, so non-positive requests are answered here.
    if top_k <= 0:
        return []

    # Encode the query
    query_embedding = embedding_model.encode([query])[0]

    # Similarity of the query against every stored paragraph embedding
    similarities = cosine_similarity([query_embedding], paragraph_embeddings)[0]

    # Ascending argsort reversed gives best-first order; keep the first top_k
    top_indices = similarities.argsort()[::-1][:top_k]

    return [
        {
            'paragraph': paragraphs[idx],
            'source': paragraph_sources[idx],
            'title': post_titles[idx],
            # Plain float so the result can be serialised or printed cleanly
            'score': float(similarities[idx]),
        }
        for idx in top_indices
    ]


In [14]:
def _item_containing_offset(relevant_content, char_offset):
    """Return the entry whose paragraph covers ``char_offset`` in the space-joined context.

    Falls back to the first (top-ranked) entry when the offset is missing or
    does not land inside any paragraph.
    """
    if char_offset is not None:
        paragraph_start = 0
        for item in relevant_content:
            paragraph_end = paragraph_start + len(item['paragraph'])
            if paragraph_start <= char_offset < paragraph_end:
                return item
            # +1 skips the single space inserted between joined paragraphs
            paragraph_start = paragraph_end + 1
    return relevant_content[0]


def answer_question(query, min_confidence=0.1):
    """Answer ``query`` from the most relevant paragraphs.

    The three most similar paragraphs are joined into one context and passed
    to the question-answering model. If the model's confidence is below
    ``min_confidence``, it returns an empty answer, or it raises, the whole
    top-ranked paragraph is returned as the answer instead.

    Args:
        query: The user's question.
        min_confidence: Minimum QA score required to use the extracted span.

    Returns:
        A dict with the keys 'answer', 'source', 'title' and 'context'.
        'source' and 'title' describe the paragraph the answer was taken
        from. All three of 'source', 'title' and 'context' are None when no
        relevant content exists.
    """
    import logging

    # First, find relevant content
    relevant_content = find_relevant_content(query, top_k=3)

    if not relevant_content:
        return {
            'answer': "I'm sorry, I couldn't find information about that in the La Leche League resources.",
            'source': None,
            # Same keys as the success path so callers can read 'title' safely
            'title': None,
            'context': None
        }

    # Combine the most relevant paragraphs into a context
    context = " ".join(item['paragraph'] for item in relevant_content)

    # Default outcome: the whole top-ranked paragraph, attributed to itself
    best_match = relevant_content[0]
    answer = best_match['paragraph']
    cited = best_match

    # Use question-answering to extract a specific answer
    try:
        qa_result = qa_model(
            question=query,
            context=context
        )

        extracted = qa_result['answer']
        if qa_result['score'] < min_confidence or not extracted.strip():
            # Low confidence or empty span: keep the whole-paragraph default
            pass
        else:
            # Attribute the answer to the paragraph the span actually starts
            # in, which is not necessarily the top-ranked one.
            cited = _item_containing_offset(relevant_content, qa_result.get('start'))
            answer = extracted

    except Exception as exc:
        # Best-effort fallback to the most relevant paragraph, but leave a
        # trace of the failure instead of swallowing it silently.
        logging.getLogger(__name__).warning(
            "Question answering failed (%s); falling back to the most relevant paragraph.",
            exc,
        )
        answer = best_match['paragraph']
        cited = best_match

    return {
        'answer': answer,
        'source': cited['source'],
        'title': cited['title'],
        'context': context
    }


In [15]:
def get_response(user_input):
    """Main method to interact with the chatbot.

    Args:
        user_input: The question typed by the user.

    Returns:
        The answer text, followed by the article title and source link when
        they are available. If no article was found, only the answer text is
        returned rather than a reference to 'None'.
    """
    result = answer_question(user_input)

    # .get() tolerates a result dict that carries no 'title' / 'source' entry
    title = result.get('title')
    source = result.get('source')

    response = f"{result['answer']}"
    if title:
        response += f"\n\nFor more information please visit the La Leche League article: '{title}'"
    if source:
        # Keep a blank line after the answer when the title line is absent
        separator = "\n" if title else "\n\n"
        response += f"{separator}Source: {source}"

    return response

In [16]:
# Interactive loop: read questions until the user types 'exit'
print("Breastfeeding Information Assistant (type 'exit' to quit)")
print("Please ask your question regarding breastfeeding:")

while True:
    try:
        # Strip so that inputs like "exit " or " exit" are still recognised
        user_input = input("> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: leave quietly
        break

    if not user_input:
        # Nothing to answer; prompt again instead of querying the models
        continue
    if user_input.lower() == 'exit':
        break

    response = get_response(user_input)
    print("\n" + response + "\n")

Breastfeeding Information Assistant (type 'exit' to quit)
Please ask your question regarding breastfeeding:

Information aboutStoring your Milk is here.

For more information please visit the La Leche League article: 'Storing Your Milk'
Source: https://laleche.org.uk/storing-your-milk/

