### -1. Import necessary packages.

In [None]:
from langchain_upstage import UpstageLayoutAnalysisLoader
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_upstage import UpstageEmbeddings
import pandas as pd
import pickle

import os

# Read the Upstage API key from the environment instead of committing a secret
# to the notebook. Set it before launching Jupyter, e.g.
#   export UPSTAGE_API_KEY="..."
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated.
UPSTAGE_API_KEY = os.environ.get("UPSTAGE_API_KEY", "")
if not UPSTAGE_API_KEY:
    # Warn early; the Upstage API calls below cannot authenticate without a key.
    print("Warning: the UPSTAGE_API_KEY environment variable is not set.")

# 1. Build the Business DB (perform chunking)

### - Load the "Introduction to Marketing.pdf" and perform chunking (chunks0).

In [None]:
# Parse "Introduction To Marketing.pdf" with Upstage Layout Analysis,
# requesting plain-text output.
layzer = UpstageLayoutAnalysisLoader(
    api_key=UPSTAGE_API_KEY,
    file_path="/Users/hongjiyoung/NLP/Term_Project/final/db_files/Introduction To Marketing.pdf",
    output_type="text",
)

# Load the parsed document(s); layzer.lazy_load() could be used instead.
docs0 = layzer.load()

In [None]:
# Approximately 140,000 tokens

# Chunking parameters for this document.
chunk_size = 5000
chunk_overlap = 500

# The loader for docs0 is created with output_type="text", so the content is
# plain text, not HTML. The HTML-tag separators of Language.HTML would not
# normally occur in that text, leaving the splitter to cut chunks at arbitrary
# characters. The default separators (paragraph, line, word) keep chunk
# boundaries on natural text breaks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunks0 = [doc.page_content for doc in text_splitter.split_documents(docs0)]

In [None]:
# Show how many chunks were produced from "Introduction To Marketing.pdf".
len(chunks0)

### - Load the "Principles of Management-OP.pdf" and perform chunking (chunks1)

In [None]:
# Parse "PrinciplesofManagement-OP 2.pdf" with Upstage Layout Analysis,
# requesting plain-text output.
layzer = UpstageLayoutAnalysisLoader(
    api_key=UPSTAGE_API_KEY,
    file_path="/Users/hongjiyoung/NLP/Term_Project/baseline/mmlu/business/PrinciplesofManagement-OP 2.pdf",
    output_type="text",
)

# Load the parsed document(s); layzer.lazy_load() could be used instead.
docs1 = layzer.load()

In [None]:
# Approximately 1.8 million tokens

# Chunking parameters for this document.
chunk_size = 9000
chunk_overlap = 900

# The loader for docs1 is created with output_type="text", so the content is
# plain text, not HTML. The HTML-tag separators of Language.HTML would not
# normally occur in that text, leaving the splitter to cut chunks at arbitrary
# characters. The default separators (paragraph, line, word) keep chunk
# boundaries on natural text breaks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunks1 = [doc.page_content for doc in text_splitter.split_documents(docs1)]

In [None]:
# Show how many chunks were produced from "PrinciplesofManagement-OP 2.pdf".
len(chunks1)

### - Load the "Corporate Finance.pdf" and perform chunking (chunks2)

In [None]:
# Parse "Corporate Finance 2.pdf" with Upstage Layout Analysis,
# requesting plain-text output.
layzer = UpstageLayoutAnalysisLoader(
    api_key=UPSTAGE_API_KEY,
    file_path="/Users/hongjiyoung/NLP/Term_Project/final/db_files/Corporate Finance 2.pdf",
    output_type="text",
)

# Load the parsed document(s); layzer.lazy_load() could be used instead.
docs2 = layzer.load()

In [None]:
# Approximately 3.4 million tokens

# Chunking parameters for this document.
chunk_size = 5000
chunk_overlap = 500

# The loader for docs2 is created with output_type="text", so the content is
# plain text, not HTML. The HTML-tag separators of Language.HTML would not
# normally occur in that text, leaving the splitter to cut chunks at arbitrary
# characters. The default separators (paragraph, line, word) keep chunk
# boundaries on natural text breaks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunks2 = [doc.page_content for doc in text_splitter.split_documents(docs2)]

In [None]:
# Show how many chunks were produced from "Corporate Finance 2.pdf".
len(chunks2)

### - Load the "Business Ethics - Concepts and Cases.pdf" and perform chunking (chunks3).

In [None]:
# Parse "Ethics- Concepts and Cases.pdf" with Upstage Layout Analysis,
# requesting plain-text output.
layzer = UpstageLayoutAnalysisLoader(
    api_key=UPSTAGE_API_KEY,
    file_path="/Users/hongjiyoung/NLP/Term_Project/final/db_files/Ethics- Concepts and Cases.pdf",
    output_type="text",
)

# Load the parsed document(s); layzer.lazy_load() could be used instead.
docs3 = layzer.load()

In [None]:
# Approximately 1.8 million tokens

# Chunking parameters for this document.
chunk_size = 7000
chunk_overlap = 700

# The loader for docs3 is created with output_type="text", so the content is
# plain text, not HTML. The HTML-tag separators of Language.HTML would not
# normally occur in that text, leaving the splitter to cut chunks at arbitrary
# characters. The default separators (paragraph, line, word) keep chunk
# boundaries on natural text breaks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunks3 = [doc.page_content for doc in text_splitter.split_documents(docs3)]

In [None]:
# Show how many chunks were produced from "Ethics- Concepts and Cases.pdf".
len(chunks3)

### - Load the "Marketing Management.pdf" and perform chunking (chunks4)

In [None]:
# Parse the "Marketing Management" PDF with Upstage Layout Analysis,
# requesting plain-text output.
layzer = UpstageLayoutAnalysisLoader(
    api_key=UPSTAGE_API_KEY,
    file_path="/Users/hongjiyoung/NLP/Term_Project/final/db_files/Marketing Management 15th Edition by Philip Kotler ( PDFDrive )-2.pdf",
    output_type="text",
)

# Load the parsed document(s); layzer.lazy_load() could be used instead.
docs4 = layzer.load()

In [None]:
# Approximately 3.6 million tokens

# Chunking parameters for this document.
chunk_size = 7000
chunk_overlap = 700

# The loader for docs4 is created with output_type="text", so the content is
# plain text, not HTML. The HTML-tag separators of Language.HTML would not
# normally occur in that text, leaving the splitter to cut chunks at arbitrary
# characters. The default separators (paragraph, line, word) keep chunk
# boundaries on natural text breaks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunks4 = [doc.page_content for doc in text_splitter.split_documents(docs4)]

In [None]:
# Show how many chunks were produced from the "Marketing Management" PDF.
len(chunks4)

# 2. Create a DataFrame for the chunks and save it

In [None]:
# Pick the chunk list to store in the DataFrame (chunks0, chunks1, chunks2, ...):
#   chunks0 -> Introduction To Marketing.pdf
#   chunks1 -> Principles of Management-OP.pdf
#   chunks2 -> Corporate Finance.pdf
#   chunks3 -> Business Ethics - Concepts and Cases.pdf
#   chunks4 -> Marketing Management.pdf

# One row per chunk, held in a single 'chunks' column.
data = dict(chunks=chunks0)
chunks_df = pd.DataFrame(data)

In [None]:
# Prepend an 'Index' column that numbers the rows from 0, then display the table.
chunks_df.insert(loc=0, column='Index', value=range(len(chunks_df)))
chunks_df

In [None]:
# Sanity check: length of the first stored chunk.
len(chunks_df['chunks'][0])

In [None]:
# Save the DataFrame as a CSV file. The path is relative, so the file is
# written to the current working directory. index=False leaves out the pandas
# row index; the explicit 'Index' column is still written.
chunks_df.to_csv('db_business_MK_cs5000_co500.csv', index=False)

# 3. Perform DB embedding

In [None]:
# Load the saved chunk table to embed; calculate_and_save_embeddings reads its
# 'chunks' and 'Index' columns.
db = pd.read_csv("/Users/hongjiyoung/NLP/Term_Project/final/db_files/db_business_MK_cs5000_co500.csv")

In [None]:
def calculate_and_save_embeddings(df, embedding_model, output_file_name):
    """Embed every chunk in *df* and pickle the results.

    Each value in the 'chunks' column is embedded on its own through
    ``embedding_model.embed_documents``. A chunk whose embedding raises is
    reported on stdout and skipped, so one bad row does not abort the run.

    Args:
        df: DataFrame with a 'chunks' column (texts) and an 'Index' column.
        embedding_model: Object exposing ``embed_documents(list_of_texts)``.
        output_file_name: Path of the pickle file to write.

    The pickle holds a tuple ``(embeddings, indices)`` covering only the
    chunks that were embedded successfully. Nothing is returned.
    """
    texts = df['chunks'].tolist()
    row_ids = df['Index'].tolist()

    embedded_vectors = []
    embedded_ids = []

    for row_id, text in zip(row_ids, texts):
        try:
            # One request per chunk so that a failure affects only this chunk.
            vector = embedding_model.embed_documents([text])[0]
        except Exception as e:
            # Best-effort: report the failing row and carry on with the rest.
            print(f"Error with context at index {row_id}: {e}")
            continue
        embedded_vectors.append(vector)
        embedded_ids.append(row_id)

    # Persist the embeddings together with the indices they belong to.
    with open(output_file_name, "wb") as f:
        pickle.dump((embedded_vectors, embedded_ids), f)
    print(f"Embeddings saved to '{output_file_name}'. {len(embedded_vectors)} items successfully processed.")

In [None]:
# Embedding model used for every chunk in the DB.
embedding_model = UpstageEmbeddings(
    api_key=UPSTAGE_API_KEY,
    model="solar-embedding-1-large",
)
output_file = "bs_embeddings_MK_cs5000_co500.pkl"

# Embed each chunk in `db` and pickle the (embeddings, indices) tuple.
calculate_and_save_embeddings(
    df=db,
    embedding_model=embedding_model,
    output_file_name=output_file,
)

# 4. Merge the DB & Embedding

In [None]:
# Read the per-document chunk CSV files (MK, PM, CF, BE, MM).
df0 = pd.read_csv('/Users/hongjiyoung/NLP/Term_Project/final/db_files/db_business_MK_cs5000_co500.csv')  # Introduction To Marketing
# Fixed: this path previously started with a doubled slash ('//Users/...').
df1 = pd.read_csv('/Users/hongjiyoung/NLP/Term_Project/final/db_files/db_business_PM_cs9000_co900.csv')  # Principles of Management
df2 = pd.read_csv('/Users/hongjiyoung/NLP/Term_Project/final/db_files/db_business_CF_cs5000_co500.csv')  # Corporate Finance
df3 = pd.read_csv('/Users/hongjiyoung/NLP/Term_Project/final/db_files/db_business_BE_cs7000_co700.csv')  # Business Ethics
df4 = pd.read_csv('/Users/hongjiyoung/NLP/Term_Project/final/db_files/db_business_MM_cs7000_co700.csv')  # Marketing Management

# Stack the tables in a fixed order; the merged embeddings must use the same order.
merged_df = pd.concat([df0, df1, df2, df3, df4], ignore_index=True)

# Each source table numbers its rows from 0, so renumber 'Index' across the
# merged table to keep it unique.
merged_df["Index"] = range(len(merged_df))

merged_df

In [None]:
# Save the merged chunk table as a new CSV file in the current working
# directory. index=False leaves out the pandas row index; the 'Index' column
# is still written.
merged_df.to_csv('db_business_merged6.csv', index=False)

In [None]:
# Paths to the five per-document embedding pickles, in the same order as the
# CSV files merged into merged_df (MK, PM, CF, BE, MM).
# Fixed: file2, file3 and file4 all pointed at the BE pickle, so the CF and MM
# embeddings were never merged and the BE embeddings were included three times.
# NOTE(review): the CF and MM file names follow the naming of the matching CSV
# files - confirm these pickles exist under these names.
file0 = "/Users/hongjiyoung/NLP/Term_Project/final/db_files/bs_embeddings_MK_cs5000_co500.pkl"
file1 = "/Users/hongjiyoung/NLP/Term_Project/final/db_files/bs_embeddings_PM_cs9000_co900.pkl"
file2 = "/Users/hongjiyoung/NLP/Term_Project/final/db_files/bs_embeddings_CF_cs5000_co500.pkl"
file3 = "/Users/hongjiyoung/NLP/Term_Project/final/db_files/bs_embeddings_BE_cs7000_co700.pkl"
file4 = "/Users/hongjiyoung/NLP/Term_Project/final/db_files/bs_embeddings_MM_cs7000_co700.pkl"
embedding_files = [file0, file1, file2, file3, file4]

output_file = "bs_embeddings_merged6.pkl"


def _load_embedding_file(path):
    """Return the (embeddings, indices) tuple stored in the pickle at *path*."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this notebook.
    with open(path, "rb") as f:
        return pickle.load(f)


# Load each file (per-file names kept for interactive inspection).
embeddings0, indices0 = _load_embedding_file(file0)
embeddings1, indices1 = _load_embedding_file(file1)
embeddings2, indices2 = _load_embedding_file(file2)
embeddings3, indices3 = _load_embedding_file(file3)
embeddings4, indices4 = _load_embedding_file(file4)

# Concatenate the embedding lists in file order.
merged_embeddings = embeddings0 + embeddings1 + embeddings2 + embeddings3 + embeddings4

# Every file numbers its chunks from 0, so the per-file indices are replaced by
# one running index over the merged list.
# NOTE(review): this matches merged_df's 'Index' only if no chunk was skipped
# during embedding (calculate_and_save_embeddings drops chunks that raise).
merged_indices = list(range(len(merged_embeddings)))

# Save the merged data
with open(output_file, "wb") as f:
    pickle.dump((merged_embeddings, merged_indices), f)

print(f"{len(embedding_files)} pickle files merged and saved as {output_file}")