In [6]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer
import torch
import warnings
from transformers import BertForQuestionAnswering, BertTokenizer, logging

# Suppress warnings globally
warnings.filterwarnings("ignore")

# Suppress Hugging Face Transformers logging
logging.set_verbosity_error()

# Load BERT model and tokenizer
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Function to get an answer from a short paragraph
def answer_question(question, answer_text):
    input_ids = tokenizer.encode(question, answer_text)

    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    assert len(segment_ids) == len(input_ids)

    outputs = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    answer = tokens[answer_start]
    for i in range(answer_start + 1, answer_end + 1):
        if tokens[i].startswith('##'):
            answer += tokens[i][2:]
        else:
            answer += ' ' + tokens[i]

    return answer

# Function to compute probability for selecting the best chunk
def answer_question_probability(question, answer_text):
    input_ids = tokenizer.encode(question, answer_text)
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    outputs = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    probability = max(start_scores.tolist()[0]) + max(end_scores.tolist()[0])
    return probability

# Function to determine best section using probability
def answer_question_longfile_probability(question, answer_text, max_num_sentences, overlap_num_sentences):
    content = answer_text.splitlines()
    len_sentence = len(content)
    piece_num = - (- (len_sentence - max_num_sentences) // (max_num_sentences - overlap_num_sentences)) + 1
    prob = [0] * piece_num

    for jj in range(piece_num):
        start_idx = (max_num_sentences - overlap_num_sentences) * jj
        end_idx = min(start_idx + max_num_sentences, len_sentence)
        content_temp = " ".join(content[start_idx:end_idx])

        prob[jj] = answer_question_probability(question, content_temp)

    max_prob = max(prob)
    max_index = prob.index(max_prob)
    return max_index

# Function for handling long paragraphs
def answer_question_longfile(question, answer_text, max_num_sentences=12, overlap_num_sentences=5):
    input_ids = tokenizer.encode(question, answer_text)

    if len(input_ids) <= 512:
        return answer_question(question, answer_text)

    max_index = answer_question_longfile_probability(question, answer_text, max_num_sentences, overlap_num_sentences)
    content = answer_text.splitlines()
    len_sentence = len(content)
    piece_num = - (- (len_sentence - max_num_sentences) // (max_num_sentences - overlap_num_sentences)) + 1

    if max_index == 0:
        content_temp = " ".join(content[:max_num_sentences])
    elif max_index == piece_num - 1:
        content_temp = " ".join(content[(max_num_sentences - overlap_num_sentences) * (piece_num - 1):])
    else:
        start_idx = (max_num_sentences - overlap_num_sentences) * max_index
        end_idx = min(start_idx + max_num_sentences, len_sentence)
        content_temp = " ".join(content[start_idx:end_idx])

    return answer_question(question, content_temp)


In [8]:
dataset_path = "/Users/kamalnadhkundla/Desktop/codebert/dataset5/"
import os
import pandas as pd

# Define the question
question = "What is the initial fee?"

# Store results in a DataFrame
results = []

# Process each text file in the dataset5 folder
for filename in os.listdir(dataset_path):
    if filename.endswith(".txt"):  # Ensure it's a text file
        file_path = os.path.join(dataset_path, filename)
        
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        
        # Generate answer
        answer = answer_question_longfile(question, text)
        
        # Store result
        results.append({"File Name": filename, "Generated Answer": answer})

# Convert results to a DataFrame
df = pd.DataFrame(results)


In [9]:
df

Unnamed: 0,File Name,Generated Answer
0,15 Keller Williams.pdf-all_items_item_5.txt,"$ 35 , 000"
1,3 KFC.pdf-all_items_item_5.txt,"$ 45 , 000"
2,24 Applebees.pdf-all_items_item_5.txt,the same for all franchisees subject to this o...
3,45 Zaxbys.pdf-all_items_item_5.txt,"$ 35 , 000"
4,42 Jimmy John's.pdf-all_items_item_5.txt,"$ 5 , 000 , if you are a franchisee in good st..."
5,32 Servpro.pdf-all_items_item_5.txt,"$ 90 , 000"
6,20 Sonic Drive-In.pdf-all_items_item_5.txt,"up to $ 30 , 000"
7,46 Carl's Jr..pdf-all_items_item_5.txt,"$ 25 , 000"
8,47 Baskin Robbins.pdf-all_items_item_5.txt,"$ 12 , 500"
9,1 McDonald's.pdf-all_items_item_5.txt,"$ 45 , 000 lump sum initial franchise fee on t..."


In [34]:
dataset_path = "/Users/kamalnadhkundla/Desktop/codebert/dataset5/"
import os
import pandas as pd

# Define the question
question = "How much is the initial franchise fee?"

# Store results in a DataFrame
results = []

# Process each text file in the dataset5 folder
for filename in os.listdir(dataset_path):
    if filename.endswith(".txt"):  # Ensure it's a text file
        file_path = os.path.join(dataset_path, filename)
        
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        
        # Generate answer
        answer = answer_question_longfile(question, text)
        
        # Store result
        results.append({"File Name": filename, "Generated Answer": answer})

# Convert results to a DataFrame
df = pd.DataFrame(results)
df

Unnamed: 0,File Name,Generated Answer
0,15 Keller Williams.pdf-all_items_item_5.txt,"$ 35 , 000"
1,3 KFC.pdf-all_items_item_5.txt,"$ 45 , 000"
2,24 Applebees.pdf-all_items_item_5.txt,"$ 35 , 000"
3,45 Zaxbys.pdf-all_items_item_5.txt,"$ 35 , 000"
4,42 Jimmy John's.pdf-all_items_item_5.txt,"$ 35 , 000"
5,32 Servpro.pdf-all_items_item_5.txt,"$ 90 , 000"
6,20 Sonic Drive-In.pdf-all_items_item_5.txt,"up to $ 30 , 000"
7,46 Carl's Jr..pdf-all_items_item_5.txt,"$ 25 , 000"
8,47 Baskin Robbins.pdf-all_items_item_5.txt,"$ 12 , 500"
9,1 McDonald's.pdf-all_items_item_5.txt,"$ 45 , 000"


In [35]:
pd1=pd.read_csv("orginaldata5.csv")
print("Columns in pd1:", pd1.columns)
print("Columns in df:", df.columns)


Columns in pd1: Index(['File Name ', 'True Initial value', 'factor 1', 'factor 2', 'factor 3',
       'factor 4', ' factor 5 ', 'factor 6'],
      dtype='object')
Columns in df: Index(['File Name', 'Generated Answer'], dtype='object')


In [28]:
pd1.columns = pd1.columns.str.strip().str.lower()
df.columns = df.columns.str.strip().str.lower()

# Rename columns to match correctly
pd1.rename(columns={"file name": "filename"}, inplace=True)
df.rename(columns={"file name": "filename"}, inplace=True)

# 🔍 Debugging: Print column names again to verify
print("Columns in pd1 after renaming:", pd1.columns)
print("Columns in df after renaming:", df.columns)

Columns in pd1 after renaming: Index(['filename', 'true initial value', 'factor 1', 'factor 2', 'factor 3',
       'factor 4', 'factor 5', 'factor 6'],
      dtype='object')
Columns in df after renaming: Index(['filename', 'generated answer'], dtype='object')


In [36]:
pd1.columns = pd1.columns.str.strip().str.lower()
df.columns = df.columns.str.strip().str.lower()

# Rename columns to ensure consistency
pd1.rename(columns={"file name": "filename"}, inplace=True)
df.rename(columns={"file name": "filename"}, inplace=True)

# Perform a **clean** left merge on "filename"
pd1 = pd1.merge(df, on="filename", how="left")

pd1


Unnamed: 0,filename,true initial value,factor 1,factor 2,factor 3,factor 4,factor 5,factor 6,generated answer
0,1 McDonald's.pdf-all_items_item_5,"$45,000",,,,,,,
1,3 KFC.pdf-all_items_item_5.txt,"$45,000",,,,,,,"$ 45 , 000"
2,4 Burger King.pdf-all_items_item_5.txt,"$50,000","$25,000 (conditional FSS)","$15,000 (shorter period)",,,,,"$ 25 , 000"
3,5 Ace Hardware.pdf-all_items_item_5.txt,"$10,000","$5,000 (stock Fee )","$5,000(waivable adminfee",,,,,"$ 5 , 000"
4,6 Chick-fil-A.pdf-all_items_item_5.txt,$0,,,,,,,we
5,7 Subway.pdf-all_items_item_5.txt,"$15,000","$7,500(conditions)",$5000 (satellite franchise fee),$1000 (short term satellite fee),,,,"$ 1 , 000"
6,8 Domino's.pdf-all_items_item_5.txt,"$10,000",$1500 (exisitng store transfer),,,,,,"$ 0 to $ 10 , 000"
7,9 Circle K.pdf-all_items_item_5.txt,"$25,000","$15,000 per store(2-5 stores)","$10,000 per store(6-9store)","$7,500(10-19 stores)","$5,000 (20 stores)",,,"$ 25 , 000"
8,10 Taco Bell.pdf-all_items_item_5.txt,"$22,500",,,,,,,"$ 22 , 500"
9,11 Wendy's.pdf-all_items_item_5.txt,"$50,000.",,,,,,,"$ 20 , 000"


In [37]:
column_order = ["filename", "generated answer"] + [col for col in pd1.columns if col not in ["filename", "generated answer"]]
pd1 = pd1[column_order]  # Apply new order


In [39]:
pd1
pd1.to_csv("5_final_dataframe.csv", index=False)

In [40]:
pd1

Unnamed: 0,filename,generated answer,true initial value,factor 1,factor 2,factor 3,factor 4,factor 5,factor 6
0,1 McDonald's.pdf-all_items_item_5,,"$45,000",,,,,,
1,3 KFC.pdf-all_items_item_5.txt,"$ 45 , 000","$45,000",,,,,,
2,4 Burger King.pdf-all_items_item_5.txt,"$ 25 , 000","$50,000","$25,000 (conditional FSS)","$15,000 (shorter period)",,,,
3,5 Ace Hardware.pdf-all_items_item_5.txt,"$ 5 , 000","$10,000","$5,000 (stock Fee )","$5,000(waivable adminfee",,,,
4,6 Chick-fil-A.pdf-all_items_item_5.txt,we,$0,,,,,,
5,7 Subway.pdf-all_items_item_5.txt,"$ 1 , 000","$15,000","$7,500(conditions)",$5000 (satellite franchise fee),$1000 (short term satellite fee),,,
6,8 Domino's.pdf-all_items_item_5.txt,"$ 0 to $ 10 , 000","$10,000",$1500 (exisitng store transfer),,,,,
7,9 Circle K.pdf-all_items_item_5.txt,"$ 25 , 000","$25,000","$15,000 per store(2-5 stores)","$10,000 per store(6-9store)","$7,500(10-19 stores)","$5,000 (20 stores)",,
8,10 Taco Bell.pdf-all_items_item_5.txt,"$ 22 , 500","$22,500",,,,,,
9,11 Wendy's.pdf-all_items_item_5.txt,"$ 20 , 000","$50,000.",,,,,,
