<a href="https://colab.research.google.com/github/hlcubiot/Hannah-work/blob/main/Capstone_MS_RAG_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Enable the ipywidgets notebook extension (shell command run through IPython's `!` escape).
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Paths used for configuration of notebook: 
    	
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json


In [None]:
# forgot that you'll need this in the colab version
!pip install sentence_transformers
!pip install transformers
!pip install sacrebleu
!pip install rouge_score
!pip install bert_score

In [None]:
# Consolidated imports to keep things much cleaner
import pandas as pd
import re
import numpy as np
import torch
import concurrent.futures
from functools import lru_cache
from google.colab import auth
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer
import sacrebleu
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from tqdm import tqdm



In [None]:
# Parameters / Settings
# Google Drive file IDs of the CSV files this notebook downloads.
FILE_IDS = {
    'train.csv': '1RqzS2vTKepA6ZKnPnSRhLeMezaBPpQF4',
    'test.csv': '1i1_K-qxOZSsOEu4m5J22yZKP4npcC-gu',
}

# Model names and generation settings read by the helper functions.
MODEL_CONFIG = dict(
    sentence_model='all-MiniLM-L6-v2',
    language_model='gpt2',
    max_seq_length=256,  # tokenizer truncation length applied to prompts
    beam_size=3,         # NOTE(review): the generation helpers hard-code num_beams; this key looks unused
    temperature=0.9,     # sampling temperature for batched generation
)

# Matches the fixed sentence that introduces a conversation, so it can be stripped.
CONVERSATION_REGEX = re.compile(r"The conversation between human and AI assistant\.")

In [None]:
def initialize_drive():
    """Sign the Colab user in and build a PyDrive client.

    Returns:
        GoogleDrive: client whose credentials are the application-default
        credentials obtained after the Colab sign-in.
    """
    # Colab sign-in runs first; the application-default credentials are read afterwards.
    auth.authenticate_user()

    drive_auth = GoogleAuth()
    drive_auth.credentials = GoogleCredentials.get_application_default()

    client = GoogleDrive(drive_auth)
    return client

def initialize_models():
    """Load the tokenizer, language model and sentence encoder onto one device.

    The device is CUDA when available, otherwise CPU. Model names are read
    from MODEL_CONFIG.

    Returns:
        dict: with the keys 'sentence_model', 'language_model', 'tokenizer',
        'vectorizer' (a fresh, unfitted TfidfVectorizer) and 'device'.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    lm_name = MODEL_CONFIG['language_model']
    tokenizer = AutoTokenizer.from_pretrained(lm_name)
    # Pad with the EOS token and pad on the left, so that in a batch every
    # prompt ends at the same position.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    language_model = AutoModelForCausalLM.from_pretrained(lm_name).to(device)
    sentence_model = SentenceTransformer(MODEL_CONFIG['sentence_model']).to(device)

    return {
        'sentence_model': sentence_model,
        'language_model': language_model,
        'tokenizer': tokenizer,
        'vectorizer': TfidfVectorizer(),
        'device': device,
    }

def load_data(drive, file_id):
    """Download a CSV file from Google Drive and load it into a DataFrame.

    The file is written to a uniquely named temporary file (not a fixed
    'temp.csv'), so two downloads can never overwrite each other, and the
    temporary file is removed again once it has been read.

    Args:
        drive: Drive client exposing ``CreateFile({'id': ...})``; the returned
            object must provide ``GetContentFile(path)``.
        file_id: Google Drive ID of the CSV file.

    Returns:
        pandas.DataFrame: the CSV content with every row that contains a
        missing value dropped.
    """
    import os
    import tempfile

    fd, tmp_path = tempfile.mkstemp(suffix='.csv')
    os.close(fd)  # only the path is needed; the Drive client writes the file itself
    try:
        downloaded = drive.CreateFile({'id': file_id})
        downloaded.GetContentFile(tmp_path)
        return pd.read_csv(tmp_path).dropna()
    finally:
        # Clean up even when the download or the parse fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def clean_data(df):
    """Flatten newlines to spaces, then drop duplicate rows.

    Args:
        df: input DataFrame; it is not modified.

    Returns:
        pandas.DataFrame: a new frame in which every newline character is
        replaced by a single space and exact duplicate rows are removed.
    """
    flattened = df.replace(to_replace=r'\n', value=' ', regex=True)
    deduplicated = flattened.drop_duplicates()
    return deduplicated


def split_conversation(text):
    """Separate a conversation transcript into its human and AI text.

    The introductory sentence matched by CONVERSATION_REGEX is removed, then
    every ``[|Human|]`` / ``[|AI|]`` turn is collected. Turns of the same
    speaker are stripped and joined with newlines.

    Args:
        text: raw conversation string.

    Returns:
        tuple[str, str]: ``(human_text, ai_text)``. On any exception the error
        is printed and ``("", "")`` is returned.
    """
    turn_pattern = r'\[\|Human\|\](.*?)(?=\[\|AI\|\]|$)|\[\|AI\|\](.*?)(?=\[\|Human\|\]|$)'
    try:
        body = CONVERSATION_REGEX.sub('', text).strip()

        human_parts = []
        ai_parts = []
        # Each match is a (human_group, ai_group) pair; only one group is non-empty.
        for human_part, ai_part in re.findall(turn_pattern, body, flags=re.DOTALL):
            if human_part:
                human_parts.append(human_part.strip())
            elif ai_part:
                ai_parts.append(ai_part.strip())

        return "\n".join(human_parts).strip(), "\n".join(ai_parts).strip()

    except Exception as e:
        print(f"Error in split_conversation: {e}")
        return ("", "")

def process_dataset(df):
    """Split each row's 'Conversation' text into human and AI columns.

    split_conversation is applied row by row (sequentially) to the
    'Conversation' column.

    Args:
        df: DataFrame with a 'Conversation' column.

    Returns:
        pandas.DataFrame: columns 'Human_Message' and 'AI_Message', one row
        per input conversation, with surrounding whitespace stripped.
    """
    human_messages = []
    ai_messages = []
    for human_text, ai_text in df['Conversation'].apply(split_conversation).tolist():
        human_messages.append(human_text.strip())
        ai_messages.append(ai_text.strip())

    return pd.DataFrame({
        'Human_Message': human_messages,
        'AI_Message': ai_messages,
    })


def chunk_text(text, chunk_size=256):
    """Cut text into consecutive pieces of at most ``chunk_size`` characters.

    Chunking is purely character based, so a chunk boundary can fall in the
    middle of a word.

    Args:
        text: string to split.
        chunk_size: maximum number of characters per chunk.

    Returns:
        list[str]: the chunks in order. Empty or whitespace-only text yields
        ``[""]`` so that callers always receive at least one chunk.
    """
    if len(text.strip()) == 0:
        return [""]  # placeholder chunk for empty input

    chunks = []
    for start in range(0, len(text), chunk_size):
        chunks.append(text[start:start + chunk_size])
    return chunks


def process_chunks(df):
    """Build a long-format table with one row per text chunk.

    For every row of ``df`` the human message is chunked first, then the AI
    message, using chunk_text with its default chunk size.

    Args:
        df: DataFrame with 'Human_Message' and 'AI_Message' columns; a missing
            or falsy value is treated as an empty string.

    Returns:
        pandas.DataFrame: columns 'conversation_id' (the row's index label),
        'chunk_id' (position of the chunk within its message), 'chunk_text'
        and 'source' ('Human' or 'AI'). An empty frame with these columns is
        returned when there is nothing to chunk.
    """
    columns = ['conversation_id', 'chunk_id', 'chunk_text', 'source']
    records = []

    for conversation_id, row in df.iterrows():
        messages = (
            ('Human', row.get('Human_Message', "") or ""),
            ('AI', row.get('AI_Message', "") or ""),
        )
        for source, message in messages:
            for chunk_id, chunk in enumerate(chunk_text(message)):
                records.append({
                    'conversation_id': conversation_id,
                    'chunk_id': chunk_id,
                    'chunk_text': chunk,
                    'source': source,
                })

    # Keep the expected columns even when no record was produced.
    if len(records) == 0:
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(records)


def generate_responses_batched(model, tokenizer, messages, batch_size=4):
    """Generate one sampled response per message, processing them in batches.

    Only the newly generated tokens are decoded. A causal language model
    returns the prompt tokens followed by the continuation, so decoding the
    full output would prefix every response with the input message.

    Args:
        model: causal language model exposing ``generate`` and ``device``.
        tokenizer: matching tokenizer. It is expected to pad on the left so
            that every prompt in a batch ends at the same position.
        messages: list of prompt strings.
        batch_size: number of prompts generated per forward pass.

    Returns:
        list[str]: generated continuations, in the order of ``messages``.

    Raises:
        ValueError: if ``messages`` is empty or contains only blank strings.
    """
    if not messages or all(msg.strip() == "" for msg in messages):
        raise ValueError("No valid messages provided for response generation.")

    responses = []
    for i in tqdm(range(0, len(messages), batch_size), desc="Generating responses"):
        batch = messages[i:i+batch_size]
        inputs = tokenizer(
            batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MODEL_CONFIG['max_seq_length']
        ).to(model.device)

        # Width of the (padded) prompt; everything after it is new text.
        prompt_length = inputs['input_ids'].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                num_beams=1,  # Disable beam search
                do_sample=True,
                temperature=MODEL_CONFIG['temperature'],
                pad_token_id=tokenizer.eos_token_id  # Suppress warning
            )

        # Drop the echoed prompt so only the model's answer is returned.
        completions = outputs[:, prompt_length:]
        responses.extend(tokenizer.batch_decode(completions, skip_special_tokens=True))

    return responses


def calculate_metrics(references, candidates):
    """Score generated texts against reference texts.

    Args:
        references: iterable of reference strings.
        candidates: iterable of generated strings, aligned with ``references``.

    Returns:
        dict: ``'bleu'`` (corpus BLEU ``.score`` from sacrebleu), ``'rougeL'``
        (mean ROUGE-L F-measure over all pairs) and ``'bert_score'`` (mean of
        the third tensor returned by bert_score, i.e. F1).

    Raises:
        ValueError: if the two inputs differ in length. Without this check
            ``zip`` would silently truncate the ROUGE computation to the
            shorter input.
    """
    # Materialise once so generators / Series work and lengths can be compared.
    references = list(references)
    candidates = list(candidates)
    if len(references) != len(candidates):
        raise ValueError(
            "references and candidates must have the same length "
            f"(got {len(references)} and {len(candidates)})."
        )

    # ROUGE-L F1, averaged over all reference/candidate pairs
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
    rougeL_f1 = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    # Corpus-level BLEU with a single reference stream
    bleu_score = sacrebleu.corpus_bleu(candidates, [references]).score

    # BERTScore returns (precision, recall, F1); index 2 is F1
    bert_scores = bert_score(candidates, references, lang='en')

    return {
        'bleu': bleu_score,
        'rougeL': rougeL_f1,
        'bert_score': bert_scores[2].mean().item()
    }

In [None]:
# 1.) Initialize systems
# Sign in to Google Drive first, then load the tokenizer, language model,
# sentence encoder and vectorizer (returned as a dict in `models`).
drive = initialize_drive()
models = initialize_models()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# 2.) Data loading
# The two CSV files are fetched one after the other. No thread pool is used:
# waiting on each result before submitting the next download never overlapped
# them, and the single shared Drive client is not known to be thread-safe.
raw_train = load_data(drive, FILE_IDS['train.csv'])
raw_test = load_data(drive, FILE_IDS['test.csv'])

In [None]:
# 3.) Data processing
# Clean each raw frame (newlines -> spaces, duplicates removed), then split
# every conversation into 'Human_Message' / 'AI_Message' columns.
train_df = process_dataset(clean_data(raw_train))
test_df = process_dataset(clean_data(raw_test))


In [None]:
# 4.) Chunking
# One row per chunk of the training messages (human and AI), see process_chunks.
chunks_df = process_chunks(train_df)

In [None]:
# Sanity check that the chunks look right, without dumping chunk text
# (which may contain personal details) wholesale into the output.
print("Chunks DataFrame columns:", chunks_df.columns.tolist())
print("First 5 chunks:", chunks_df.head())
print("Number of chunks:", len(chunks_df))
print("Number of unique chunk_text values:", chunks_df['chunk_text'].nunique())

Chunks DataFrame columns: ['conversation_id', 'chunk_id', 'chunk_text', 'source']
First 5 chunks:    conversation_id  chunk_id  \
0                0         0   
1                0         1   
2                0         0   
3                0         1   
4                0         2   

                                          chunk_text source  
0  Hi.My names Ahdieh.I m from a small city in Ir...  Human  
1  is condition?If it is needed I can send you hi...  Human  
2  Hello Addie! Welcome and thank you for asking ...     AI  
3  d for his resuscitation is long and may have c...     AI  
4  rain function. Hope you will find this answer ...     AI  
Number of chunks: 565321
Unique chunk_text values: ['Hi.My names Ahdieh.I m from a small city in Iran.My father had a heart attack on sunday as doctor said cpr condition.after 40 minuts he came back to life. now he isnot conscious.he is in ICU.his doctor said his conscious rate is 5.what do you think about h'
 'is condition?If it is ne

In [None]:
# 5.) Embedding generation
# Encode every chunk with the sentence model in batches of 128, without
# tracking gradients; the result is kept as a tensor (convert_to_tensor=True).
# NOTE(review): `embeddings` is not read by any later cell visible in this
# notebook — confirm whether it is still needed.
with torch.no_grad():
    embeddings = models['sentence_model'].encode(
        chunks_df['chunk_text'].tolist(),
        batch_size=128,
        convert_to_tensor=True,
        show_progress_bar=True
    )

Batches:   0%|          | 0/4417 [00:00<?, ?it/s]

In [None]:
# Sanity check: how many parsed conversations have an empty human or AI side?
print("Number of rows in train_df:", len(train_df))
print("Number of empty Human_Message rows:", (train_df['Human_Message'].str.strip() == "").sum())
print("Number of empty AI_Message rows:", (train_df['AI_Message'].str.strip() == "").sum())


Number of rows in train_df: 106556
Number of empty Human_Message rows: 0
Number of empty AI_Message rows: 0


In [None]:
# 6.) TF-IDF matrix
# Keep all tokens (stop_words=None) and fit on the chunk text, so that
# `tfidf_matrix` really is the document-term matrix rather than an unfitted
# vectorizer object.
models['vectorizer'] = TfidfVectorizer(stop_words=None)
tfidf_matrix = models['vectorizer'].fit_transform(chunks_df['chunk_text'])

In [None]:
# Ask PyTorch to release the GPU memory it holds in its cache, so that more is
# available for the generation step.
# NOTE(review): tensors that are still referenced (e.g. `embeddings`, if it
# lives on the GPU) are presumably not freed by this call — confirm.
torch.cuda.empty_cache()

In [None]:
# 7.) Generate responses
# Filter out empty or whitespace-only messages
test_df = test_df[test_df['Human_Message'].str.strip() != ""]
if test_df.empty:
    raise ValueError("No valid Human_Message entries available for response generation.")

# Evaluate on a random sample of at most 100 rows; min() keeps the cell
# working when fewer than 100 rows are available.
test_df = test_df.sample(n=min(100, len(test_df)), random_state=42)

# Generation uses sampling, so seed torch to make the responses (and the
# metrics computed from them) repeatable from run to run.
torch.manual_seed(42)

# Generate responses with reduced batch size and progress tracking
responses = generate_responses_batched(
    models['language_model'],
    models['tokenizer'],
    test_df['Human_Message'].tolist(),
    batch_size=4  # Reduce batch size
)

Generating responses: 100%|██████████| 25/25 [00:42<00:00,  1.71s/it]


In [None]:
# 8.) Calculate metrics
# Both inputs must consist of plain strings before they are scored.
if any(not isinstance(ref, str) for ref in test_df['AI_Message'].tolist()):
    raise ValueError("References contain non-string values.")
if any(not isinstance(cand, str) for cand in responses):
    raise ValueError("Candidates contain non-string values.")

# Score the generated responses against the reference AI messages.
metrics = calculate_metrics(test_df['AI_Message'].tolist(), responses)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 9.) Print results
# NOTE(review): the BLEU value is sacrebleu's `.score`, presumably on a 0–100
# scale, while ROUGE-L and BERTScore are F1 values in the 0–1 range — confirm
# before comparing the three numbers directly.
print(f"BLEU Score: {metrics['bleu']:.2f}")
print(f"ROUGE-L F1: {metrics['rougeL']:.2f}")
print(f"BERTScore: {metrics['bert_score']:.2f}")

BLEU Score: 0.88
ROUGE-L F1: 0.12
BERTScore: 0.82


# Analysis

* BLEU Score: 0.88
  * BLEU is a precision-based metric that measures how many n-grams in the generated responses match the reference responses.
  * sacreBLEU reports BLEU on a 0–100 scale, so a score of 0.88 is extremely low, indicating that the generated responses have very little overlap with the reference responses.
* ROUGE-L F1: 0.12
  * ROUGE-L measures the longest common subsequence (LCS) between the generated and reference responses.
  * A score of 0.12 is also very low, suggesting that the generated responses are not structurally similar to the reference responses.
* BERTScore: 0.82
  * BERTScore uses embeddings from a pre-trained language model (e.g., RoBERTa) to measure semantic similarity between the generated and reference responses.
  * A score of 0.82 is relatively high, indicating that the generated responses are semantically similar to the reference responses, even if they don’t match exactly.

# What to do about it

* Paraphrasing:
  * GPT-2 often paraphrases the input rather than copying exact phrases. BLEU and ROUGE-L rely on exact n-gram matches, so paraphrased responses will result in low scores. (Take note of this because it means those scores are not reliable measures)
* Length Mismatch:
  * If the generated responses are significantly shorter or longer than the reference responses, BLEU and ROUGE-L scores will drop. (Again, we know for sure that the lengths are not all the same, which is why we are using padding.)
* Dataset Quality:
  * If the reference responses in the dataset are noisy, incomplete, or inconsistent, it will negatively impact BLEU and ROUGE-L scores.
* Beam Search and Sampling:
  * The evaluation run samples its output (do_sample=True, temperature=0.9, num_beams=1), which introduces randomness into the generation process and can lead to responses that deviate from the reference responses. (You can play with these settings, for example by enabling beam search with num_beams=3, to possibly increase or decrease those scores, but it will take longer to run.)

In [None]:
# prompt: total word count in dataset

# Add up the number of whitespace-separated tokens of every chunk in chunks_df['chunk_text'].
total_word_count = sum(map(lambda text: len(text.split()), chunks_df['chunk_text']))

print(f"Total word count in the dataset: {total_word_count}")


Total word count in the dataset: 20873276


In [None]:
# prompt: generate top word frequency percentage within dataset as a whole greater than 5 letters long and remove stopwords

from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Count the qualifying words chunk by chunk. This avoids building one huge
# string of the whole corpus plus a list of every word in it.
word_counts = Counter()
for chunk in chunks_df['chunk_text']:
    word_counts.update(
        word
        for word in re.findall(r'\b\w+\b', chunk.lower())  # This finds all words
        if word not in stop_words and len(word) > 5
    )

# Number of words that survived the filter; denominator of the percentages.
total_words = sum(word_counts.values())

# Print top 10 words and their share of all counted words
for word, count in word_counts.most_common(10):
    print(f"{word}: {(count / total_words) * 100:.2f}%")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


doctor: 2.64%
thanks: 0.89%
treatment: 0.82%
please: 0.74%
symptoms: 0.70%
infection: 0.68%
normal: 0.58%
problem: 0.57%
question: 0.54%
consult: 0.51%


In [None]:
def generate_response(model, tokenizer, message):
    """Generate a single response for a given input message.

    Only the newly generated tokens are decoded. A causal language model
    returns the prompt followed by its continuation, so decoding the whole
    sequence would repeat the user's message at the start of every answer.

    Args:
        model: causal language model exposing ``generate`` and ``device``.
        tokenizer: matching tokenizer.
        message: the prompt string.

    Returns:
        str: the generated continuation, without the prompt.
    """
    # Tokenize the input message
    inputs = tokenizer(
        [message],
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=MODEL_CONFIG['max_seq_length']
    ).to(model.device)

    # Number of prompt tokens; everything after this position is new text.
    prompt_length = inputs['input_ids'].shape[1]

    # Generate the response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,  # Limit response length
            num_beams=3,        # Use beam search
            do_sample=True,     # Enable sampling
            temperature=0.7,    # Adjust temperature
            top_k=50,           # Use top-k sampling
            top_p=0.9,          # Use nucleus sampling
            repetition_penalty=1.2,  # Penalize repetition
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the generated part, not the echoed prompt
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response

def chat_with_model(model, tokenizer):
    """Interactive chat interface with the model.

    Reads user messages until the user types 'exit' (case-insensitive,
    surrounding whitespace ignored) or interrupts the prompt with Ctrl-C or
    end-of-input. Blank messages are skipped instead of being sent to the
    model.

    Args:
        model: language model passed on to generate_response.
        tokenizer: tokenizer passed on to generate_response.
    """
    print("Chat with the AI! Type 'exit' to end the conversation.")
    while True:
        # Get user input; leave the loop cleanly on an interrupt, not with a traceback
        try:
            human_message = input("You: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat. Goodbye!")
            break

        if human_message.lower() == "exit":
            print("Exiting chat. Goodbye!")
            break

        # Nothing to answer for an empty line
        if not human_message:
            continue

        # Generate a response
        response = generate_response(model, tokenizer, human_message)
        print(f"AI: {response}")


In [None]:
# Start the interactive chat with the loaded language model; type 'exit' to stop.
chat_with_model(models['language_model'], models['tokenizer'])

Chat with the AI! Type 'exit' to end the conversation.
You: what are common symptoms of the flu
AI: what are common symptoms of the flu?

The flu can cause a variety of symptoms, including fever, headache, muscle aches, fatigue, nausea, vomiting, and diarrhea. The most common symptoms of the flu include fever, headache, muscle aches, fatigue, nausea, vomiting, and diarrhea. The most common symptoms of the flu include fever
You: how do I know if I'm having a panic attack
AI: how do I know if I'm having a panic attack or not?"

"I don't know," she said.

"I'm not sure," he said.

"I'm not sure," she said.

"I'm not sure," he said.

"I'm not sure," she said.

"I'm not
You: how do I treat asthma?
AI: how do I treat asthma?

It's important to understand that there are two different types of asthma. The first type is called bronchodilator asthma, and the second type is bronchodilator bronchodilator asthma.

Bronchodilator bronchodilator asthma

Bronchodil


KeyboardInterrupt: Interrupted by user