# NewsQA Loop
11/2/2023 \
Lixiao Yang

This notebook loops over different chunk sizes and overlap percentages using GPT4All (Falcon model) and LangChain. A small sample of stories is selected from the NewsQA dataset.

Because of limited computing resources, the results shown here are limited. To run the full-fledged experiment, follow these steps:
1. Set `file_path` to 'combined-newsqa-data-v1.json' (80.2 MB) - to compile the JSON file, follow the option 1 log and the [Docker method from NewsQA](https://github.com/Maluuba/newsqa#recommended-docker-set-up)
2. Update chunk_size and overlap_percentage
3. Revise calculate_em() function

In [1]:
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.embeddings import GPT4AllEmbeddings
from langchain.embeddings.gpt4all import GPT4AllEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import GPT4All
from langchain.chains import RetrievalQA
from collections import Counter
import numpy as np
from collections import defaultdict
import json
from pathlib import Path
from langchain.prompts import PromptTemplate

In [2]:
# file_path='./combined-newsqa-data-v1-small.json'
# NOTE: this is a machine-specific absolute path; point it at your local copy
# of the combined NewsQA JSON file before running.
file_path='/Users/wk77/Library/CloudStorage/OneDrive-DrexelUniversity/Documents/data/combined-newsqa-data-v1.json'
# Decode explicitly as UTF-8: read_text() otherwise falls back to the platform
# locale encoding, which can fail or mis-decode the JSON on non-UTF-8 systems.
data = json.loads(Path(file_path).read_text(encoding='utf-8'))

In [None]:
# data

In [None]:
# for story in data['data']:
#     print(story['text'])
#     print("\n--- End of story ---\n")

In [3]:
# Exact Match (EM) helper
def calculate_em(predicted, actual):
    """Return 1 if `predicted` equals `actual` exactly, otherwise 0.

    The comparison is a plain `==`: no case folding, whitespace stripping or
    punctuation removal is applied to either argument.
    """
    is_exact = predicted == actual
    return int(is_exact)

# Function to calculate the token-wise F1 score for text answers
def calculate_token_f1(predicted, actual):
    """Compute the token-level F1 score between two answer strings.

    Both strings are split on whitespace and compared as bags of tokens
    (duplicates count), with no case or punctuation normalisation.

    Args:
        predicted: Answer text produced by the model.
        actual: Reference answer text.

    Returns:
        A float in [0.0, 1.0]. When either side has no tokens the score is
        1.0 if both are empty and 0.0 otherwise, so that two empty answers
        agree with `calculate_em` instead of scoring 0.
    """
    predicted_tokens = predicted.split()
    actual_tokens = actual.split()

    # Overlap-based F1 is undefined with an empty side; treat "both empty" as
    # a perfect match and "one empty" as a complete miss.
    if not predicted_tokens or not actual_tokens:
        return float(predicted_tokens == actual_tokens)

    # Multiset intersection keeps the minimum count of every shared token.
    common_tokens = Counter(predicted_tokens) & Counter(actual_tokens)
    num_same = sum(common_tokens.values())
    if num_same == 0:
        return 0.0

    precision = num_same / len(predicted_tokens)
    recall = num_same / len(actual_tokens)
    return (2 * precision * recall) / (precision + recall)

# Pull the answer span out of a question's consensus record
def extract_ranges(consensus):
    """Return the consensus answer span as a one-element list of (s, e).

    If `consensus` lacks either the 's' or the 'e' key, an empty list is
    returned instead.
    """
    has_span = 's' in consensus and 'e' in consensus
    if not has_span:
        return []
    start, end = consensus['s'], consensus['e']
    return [(start, end)]


# Prompt text for question answering. It has two placeholders, {context} and
# {question}, and is turned into a PromptTemplate below.
# NOTE(review): in the visible cells QA_CHAIN_PROMPT is not passed to
# RetrievalQA.from_chain_type, so the chain appears to run with its default
# prompt - confirm whether that is intended.
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use three sentences maximum and keep the answer as concise as possible. 
Also provide me the source for your answer. Explain how to get the answer step by step.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [4]:
import time
import logging
# logging.basicConfig(level=logging.WARNING)  # This will show only warnings and errors
logging.basicConfig(level=logging.ERROR)
# basicConfig() does nothing when the root logger already has handlers, and the
# recorded output of this cell still shows "INFO:backoff:..." lines, so lower
# the verbosity of that logger explicitly.
logging.getLogger("backoff").setLevel(logging.ERROR)

# Parameters
max_stories = 100
# Define chunk sizes and overlap percentages
# chunk_sizes = [200,400,800,1600]
# overlap_percentages = [0,0.1,0.2,0.3]  # Expressed as percentages (0.1 = 10%)
chunk_sizes = [25,50,75,100]
overlap_percentages = [0,0.2,0.4]  # Expressed as percentages (0.1 = 10%)

# Results storage: (chunk_size, overlap_percentage) -> list of per-question scores
f1_results = defaultdict(list)
em_results = defaultdict(list)
text_results = []

# Initialize the language model and the QA chain
# llm = GPT4All(model="C:/Users/24075/AppData/Local/nomic.ai/GPT4All/ggml-model-gpt4all-falcon-q4_0.bin", max_tokens=2048)
# NOTE: machine-specific absolute path; point it at your local model file.
llm = GPT4All(model="/Users/wk77/Library/CloudStorage/OneDrive-DrexelUniversity/Documents/data/gpt4all/models/gpt4all-falcon-q4_0.gguf", max_tokens=2048)

# The following code iterates over the stories and questions to calculate the scores
start_time = time.time()
print(f"{start_time} Started.")

output_file_path = "results/scores_20231115.txt"
# open(..., 'w') does not create missing parent directories.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

with open(output_file_path, 'w') as file:
    # instantiate embedding ONCE
    word_embed = GPT4AllEmbeddings()
    for chunk_size in chunk_sizes:
        print(f"\n{time.time()-start_time} Processing chunk size {chunk_size}:")
        last_time = time.time()
        for overlap_percentage in overlap_percentages:
            # The splitter expects the overlap as an absolute size, not a fraction.
            actual_overlap = int(chunk_size * overlap_percentage)
            print(f"\n{time.time()-start_time}\t{time.time()-last_time}\tOverlap [{overlap_percentage}] {actual_overlap}")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=actual_overlap)

            for i, story in enumerate(data['data'], start=1):
                if i > max_stories:
                    break
                now_time = time.time()
                print(f"\n{now_time-start_time}\t{now_time-last_time}\t\tstory {i}: ",end='')
                last_time = now_time

                # Build a per-story vector store and a QA chain on top of it.
                all_splits = text_splitter.split_text(story['text'])
                vectorstore = Chroma.from_texts(texts=all_splits, embedding=word_embed)
                qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), return_source_documents=True)

                for j, question_data in enumerate(story['questions'], start=1):
                    print('.', end='')

                    # TODO (carried over): retrieve with a precomputed query vector via
                    # similarity_search_by_vector(). The previous per-question
                    # embed_query()/similarity_search_by_vector() calls were removed
                    # because their result was never used; the chain's own retriever
                    # performs the retrieval.
                    question = question_data['q']
                    answer_ranges = extract_ranges(question_data['consensus'])

                    # Get the prediction from the model
                    result = qa_chain({"query": question})

                    # Check if the predicted answer is in the expected format (string)
                    predicted_answer = result['result']
                    if isinstance(predicted_answer, dict):
                        # Assuming 'answer' is the key for the answer string
                        predicted_answer = predicted_answer.get('answer', '')
                    elif not isinstance(predicted_answer, str):
                        print(f"Unexpected format for predicted answer: {predicted_answer}")
                        continue  # Skip to the next question

                    # Reference answer: slice of the story text given by the consensus
                    # range, or "" when the question has no consensus range.
                    if answer_ranges:
                        span_start, span_end = answer_ranges[0]
                        actual_answer = story['text'][span_start:span_end]
                    else:
                        actual_answer = ""

                    # Calculate the scores
                    em_score = calculate_em(predicted_answer, actual_answer)
                    f1_score_value = calculate_token_f1(predicted_answer, actual_answer)
                    file.write(f"{chunk_size}\t{overlap_percentage}\t{i}\t{j}\t{f1_score_value:.4f}\t{em_score:.2f}\n")

                    # Store the scores
                    em_results[(chunk_size, overlap_percentage)].append(em_score)
                    f1_results[(chunk_size, overlap_percentage)].append(f1_score_value)

                # Persist the scores written so far; the run takes hours and an
                # interruption should not lose buffered results.
                file.flush()

                # Chroma.from_texts() uses a default collection name. Depending on the
                # chromadb version, that collection can outlive the Python object, so
                # chunks from earlier stories would be retrievable for later ones.
                # Drop it explicitly before releasing the references.
                vectorstore.delete_collection()

                # delete object for memory
                del qa_chain
                del vectorstore
                del all_splits
            # delete splitter instance
            del text_splitter

1700064374.579493 Started.

0.5504767894744873 Processing chunk size 25:bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


0.5505990982055664	5.0067901611328125e-06	Overlap [0] 0

0.5506200790405273	2.3126602172851562e-05		story 1: .........
12.919222831726074	12.368602752685547		story 2: .........
26.76193118095398	13.842708349227905		story 3: .........
40.541045904159546	13.779114723205566		story 4: ........
49.56605100631714	9.025005102157593		story 5: .........
60.3953959941864	10.829344987869263		story 6: .........
76.67629599571228	16.28090000152588		story 7: .........
88.29758095741272	11.62128496170044		story 8: ............
103.42063403129578	15.123053073883057		story 9: ............
121.18940997123718	17

INFO:backoff:Backing off send_request(...) for 0.7s (requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='app.posthog.com', port=443): Read timed out. (read timeout=15))


......
13245.383018016815	11.688549995422363		story 69: ........
13255.675198078156	10.292180061340332		story 70: ........
13266.850817918777	11.175619840621948		story 71: ...

INFO:backoff:Backing off send_request(...) for 0.6s (requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='app.posthog.com', port=443): Read timed out. (read timeout=15))


......
13279.21254491806	12.361726999282837		story 72: ......

INFO:backoff:Backing off send_request(...) for 1.9s (requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='app.posthog.com', port=443): Read timed out. (read timeout=15))


...
13290.331710100174	11.119165182113647		story 73: .........
13303.196202039719	12.864491939544678		story 74: .........
13315.012745141983	11.816543102264404		story 75: .........
13327.156107902527	12.143362760543823		story 76: ...........
13343.483712911606	16.32760500907898		story 77: ...........
13357.897468090057	14.413755178451538		story 78: ........
13374.855798959732	16.958330869674683		story 79: .........
13386.916049003601	12.060250043869019		story 80: .........
13399.230302095413	12.314253091812134		story 81: .........
13411.470612049103	12.240309953689575		story 82: .........
13423.543743133545	12.073131084442139		story 83: .........
13435.780396938324	12.236653804779053		story 84: ............
13451.53980088234	15.759403944015503		story 85: .........
13465.296121835709	13.75632095336914		story 86: .........
13476.50038099289	11.204259157180786		story 87: .........
13488.528657913208	12.028276920318604		story 88: ............
13507.944694757462	19.41603684425354		story 89:

INFO:backoff:Backing off send_request(...) for 0.9s (requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')))


........
16345.586253881454	11.381211996078491		story 16: .......
16354.737873077393	9.15161919593811		story 17: .........
16366.16434788704	11.426474809646606		story 18: .........
16377.444277048111	11.279929161071777		story 19: .........
17410.3903131485	1032.9460361003876		story 20: .

INFO:backoff:Backing off send_request(...) for 0.8s (requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')))


........
17421.232653856277	10.84234070777893		story 21: ........
17432.021327018738	10.788673162460327		story 22: .........
17443.105816841125	11.084489822387695		story 23: .........
18439.346754789352	996.2409379482269		story 24: .

INFO:backoff:Backing off send_request(...) for 0.4s (requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')))


........
18450.948733091354	11.601978302001953		story 25: ...........
18467.451011896133	16.502278804779053		story 26: .........
18628.246944904327	160.79593300819397		story 27: .........
18639.127797842026	10.880852937698364		story 28: .........
18658.90769481659	19.7798969745636		story 29: ...........................
18697.216455936432	38.30876111984253		story 30: .........
18712.265285015106	15.048829078674316		story 31: .........
18724.9029610157	12.637676000595093		story 32: ...........
18739.412917137146	14.509956121444702		story 33: ............
18755.135938882828	15.723021745681763		story 34: ........
18765.68890786171	10.552968978881836		story 35: .........
18778.327459812164	12.638551950454712		story 36: .........
18794.540359020233	16.212899208068848		story 37: ............................
18833.38228702545	38.841928005218506		story 38: .........
18845.242481946945	11.86019492149353		story 39: .........
18859.37577676773	14.133294820785522		story 40: .........
18870.41414904

In [5]:
# Calculate the average F1 and EM scores for each configuration.
# Each key is a (chunk_size, overlap_fraction) tuple. The per-question score
# lists are replaced in place by their means, so f1_results / em_results hold
# one average per configuration after this cell.
for config, scores in list(f1_results.items()):
    avg_f1 = np.mean(scores)
    avg_em = np.mean(em_results[config])
    f1_results[config] = avg_f1
    em_results[config] = avg_em
    # The overlap is stored as a fraction (0.2 means 20%), so format it as a
    # percentage instead of appending a literal '%' to the raw fraction.
    print(f"Chunk size {config[0]} with overlap {config[1]:.0%} - Average F1: {avg_f1:.3f}, EM: {avg_em:.2f}")

# Output the results
print(f1_results)
print(em_results)

Chunk size 25 with overlap 0% - Average F1: 0.032, EM: 0.00
Chunk size 25 with overlap 0.2% - Average F1: 0.030, EM: 0.00
Chunk size 25 with overlap 0.4% - Average F1: 0.030, EM: 0.00
Chunk size 50 with overlap 0% - Average F1: 0.037, EM: 0.00
Chunk size 50 with overlap 0.2% - Average F1: 0.038, EM: 0.00
Chunk size 50 with overlap 0.4% - Average F1: 0.041, EM: 0.00
Chunk size 75 with overlap 0% - Average F1: 0.041, EM: 0.00
Chunk size 75 with overlap 0.2% - Average F1: 0.044, EM: 0.00
Chunk size 75 with overlap 0.4% - Average F1: 0.046, EM: 0.00
Chunk size 100 with overlap 0% - Average F1: 0.046, EM: 0.00
Chunk size 100 with overlap 0.2% - Average F1: 0.046, EM: 0.00
Chunk size 100 with overlap 0.4% - Average F1: 0.045, EM: 0.00
defaultdict(<class 'list'>, {(25, 0): 0.031851029147303415, (25, 0.2): 0.02965547712973723, (25, 0.4): 0.02989194802162396, (50, 0): 0.037108305744231414, (50, 0.2): 0.038022922229045276, (50, 0.4): 0.0405890450733727, (75, 0): 0.0413641224528871, (75, 0.2): 0.

In [None]:
# # Print out the F1 and EM Results
# print("\nF1 and EM Results:")
# for config in f1_results:
#     chunk_size, overlap_percentage = config
#     print(f"Chunk Size: {chunk_size}, Overlap Percentage: {overlap_percentage}%, F1: {f1_results[config]}, EM: {em_results[config]}")