# READ URLS

In [None]:
import pandas as pd

In [None]:
# Load the whole URL list file into memory as a single string.
with open("url_list.txt", mode='r', encoding='utf-8') as url_file:
    content = url_file.read()

In [None]:
# Break the file contents into a list with one entry per line.
content=content.split("\n")

In [None]:
# Keep only entries longer than one character (this drops blank lines).
content = list(filter(lambda line: len(line) > 1, content))

In [None]:
import hashlib

def generate_hash(url):
    """Return the SHA-256 hex digest of ``url`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(url.encode('utf-8'))
    return digest.hexdigest()

# One hash per URL, in the same order as `content`.
hashKeys = [generate_hash(item) for item in content]

In [None]:
# Pair every URL with its hash and persist the mapping (no index column).
url_hash_pairs = list(zip(content, hashKeys))
df = pd.DataFrame(url_hash_pairs, columns=['url', 'hashkey'])
df.to_csv("mappedHashKeysToUrl.csv", index=None)

# CALL API

In [None]:
import pandas as pd
# Load the URL -> hash mapping from disk (columns written above: 'url', 'hashkey').
df=pd.read_csv("mappedHashKeysToUrl.csv")

In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import concurrent.futures
import os

def get_slm_response(url, timeout=None):
    """POST ``url`` to the SLM endpoint and return the ``content`` field of the reply.

    Despite the parameter name, the value is sent verbatim under the
    ``"content"`` key of the JSON payload; callers in this file also pass
    the text of markdown files.

    Args:
        url: Text to send to the API.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The ``content`` value of the decoded response on HTTP 200. On any
        failure (non-200 status, network error, undecodable body, missing
        key) a string starting with ``"Error: "`` is returned instead of
        raising.
    """
    import ast  # local import: only needed for the literal-parsing fallback

    api_url = "apiURL"
    auth_token = "auth_token"

    headers = {
        "Authorization": f"Basic {auth_token}",
        "Content-Type": "application/json"
    }
    payload = {
        "content": url,
    }

    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        if response.status_code != 200:
            return f"Error: Status code {response.status_code}"

        # SECURITY: the body comes from the network, so it must not be passed
        # to eval(), which would execute arbitrary code. Decode it as JSON;
        # fall back to literal_eval for bodies that are Python literals
        # (which the previous eval() call also accepted).
        try:
            body = response.json()
        except ValueError:
            body = ast.literal_eval(response.text)
        return body['content']
    except Exception as e:
        # Best-effort by design: callers receive the failure as a string.
        return f"Error: {str(e)}"

def generate_md_file(hashkey, content):
    """Write ``content`` to ``md_files/<hashkey>.md``.

    The ``md_files`` directory is created if it does not exist yet.

    Returns:
        ``True`` when the file was written; ``False`` (after printing the
        error) when opening or writing the file raised an exception.
    """
    filename = f"md_files/{hashkey}.md"
    os.makedirs('md_files', exist_ok=True)

    try:
        with open(filename, 'w', encoding='utf-8') as md_file:
            md_file.write(content)
    except Exception as e:
        print(f"Error writing file {filename}: {e}")
        return False
    else:
        return True


In [None]:
# # Main processing loop with progress bar
# for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing URLs"):
#     try:
#         # Get response from API
#         response = get_slm_response(row['url'])
        
#         # Generate markdown file
#         if response:
#             success = generate_md_file(row['hashkey'], response)
#             if not success:
#                 print(f"Failed to generate MD file for URL: {row['url']}")
#         else:
#             print(f"No response for URL: {row['url']}")
            
#         # Optional: Add a small delay to prevent overwhelming the API
#         time.sleep(0.5)
        
#     except Exception as e:
#         print(f"Error processing row {i}: {e}")
#         continue

# print("Processing completed!")

# RETRIEVER

## Preparing Chunk DATA

In [None]:
import pandas as pd
import json

# Path to the JSON file describing the URL structure.
file_path = "url_structure.json"

# Read with an explicit UTF-8 encoding, like every other file in this
# notebook, instead of relying on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    # Parse JSON data into a Python object
    data = json.load(file)

# Keep only the entries of data['links'] whose key contains "folder".
filtered_items_dict = {key: value for key, value in data['links'].items() if "folder" in key}

In [None]:
import pandas as pd
df = pd.read_csv("mappedHashKeysToUrl.csv")
# Rows whose URL is one of the "folder" keys selected above.
folder_urls = list(filtered_items_dict)
df_filtered = df[df['url'].isin(folder_urls)]

In [None]:
# for i, row in tqdm(df_filtered.iterrows(), total=len(df_filtered), desc="Processing URLs"):
#     file_path="md_files/"+row['hashkey']+".md"
#     with open(file_path, 'r', encoding='utf-8') as file:
#         content = file.read()
#     response=get_slm_response(content)
#     df_filtered.at[i,'folder_name']=response

In [None]:
#df_filtered.to_csv("folderNamesMapping.csv",index=None)

In [None]:
# Flatten data['links'] into one (key, item) pair per row: each key is
# recorded as the parent of every item in its value.
link_pairs = [
    (parent_url, child_url)
    for parent_url, children in data['links'].items()
    for child_url in children
]
parent = [pair[0] for pair in link_pairs]
child = [pair[1] for pair in link_pairs]
dfParentChild = pd.DataFrame(link_pairs, columns=['parentUrl', 'childUrl'])
#dfParentChild.to_csv("parentChildURLMapping.csv",index=None)

In [None]:
# Load the saved folder-name mapping; this rebinds df_filtered, replacing the
# filtered frame built earlier. The cell that would write this CSV is commented out above.
df_filtered=pd.read_csv("folderNamesMapping.csv")

In [None]:
# Left-join each URL with the rows where it appears as a child, expose the
# parent URL as 'folder', and drop the helper join columns.
parent_lookup = dfParentChild[['childUrl', 'parentUrl']]
merged = df.merge(parent_lookup, how='left', left_on='url', right_on='childUrl')
df = merged.assign(folder=merged['parentUrl']).drop(columns=['childUrl', 'parentUrl'])

In [None]:
# Category pages: URLs matching the pattern, whose last path segment is
# exactly 11 characters long, and that do not contain 'folders'.
url_column = df['url']
is_category_url = (
    url_column.str.contains("{patternString}")
    & url_column.str.split('/').str[-1].str.len().eq(11)
    & ~url_column.str.contains('folders')
)
# Take an explicit copy: the loop below writes into this frame, and writing
# into the result of filtering df triggers pandas' SettingWithCopyWarning.
dfCategoryUrls = df[is_category_url].copy()

for i, row in tqdm(dfCategoryUrls.iterrows(), total=len(dfCategoryUrls), desc="Processing URLs"):
    file_path = "md_files/" + row['hashkey'] + ".md"
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # NOTE(review): get_slm_response returns an "Error: ..." string on
    # failure, which is stored here as if it were a category name.
    response = get_slm_response(content)
    dfCategoryUrls.at[i, 'CategoryName'] = response

In [None]:
#dfCategoryUrls.to_csv("CategoryURl.csv",index=None)

In [None]:
#df.to_csv("csvWithFolderNames.csv",index=None)

In [None]:
def find_category(url, df_parent_child, category_urls):
    """Return the category URL that ``url`` belongs to, or ``None``.

    Lookup order:
      1. a direct parent of ``url`` that is in ``category_urls``;
      2. otherwise, for each direct parent whose URL contains ``'folder'``,
         a parent of that folder that is in ``category_urls``.

    Args:
        url: Child URL to look up.
        df_parent_child: DataFrame with 'parentUrl' and 'childUrl' columns.
        category_urls: Collection of URLs considered categories.

    Returns:
        The first matching parent URL, or ``None`` when nothing matches.
    """
    def parents_and_category(child_url):
        # All parents of child_url, plus the first one that is a category.
        is_child = df_parent_child['childUrl'] == child_url
        parents = df_parent_child.loc[is_child, 'parentUrl']
        category_hits = parents[parents.isin(category_urls)]
        first_hit = None if category_hits.empty else category_hits.iloc[0]
        return parents, first_hit

    direct_parents, category = parents_and_category(url)
    if direct_parents.empty or category is not None:
        return category

    # No direct category parent: go one level up through folder parents.
    folder_parents = direct_parents[direct_parents.str.contains('folder')]
    for folder_url in folder_parents:
        _, category = parents_and_category(folder_url)
        if category is not None:
            return category

    return None

# # Test with a single URL first
# test_url = df['url'].iloc[2]
# result = find_category(test_url, dfParentChild, set(dfCategoryUrls['url']))


# If the test looks good, then apply to the full DataFrame.
# progress_apply only exists on pandas objects after tqdm registers itself.
tqdm.pandas()
# Build the category-URL set once instead of once per row inside the lambda.
category_url_set = set(dfCategoryUrls['url'])
df['category'] = df['url'].progress_apply(
    lambda x: find_category(x, dfParentChild, category_url_set)
)

In [None]:
# Attach the category display name (text before the first "(") to every row
# whose 'category' URL has one. A single pass builds a URL -> name lookup,
# replacing the nested row-by-row scan; as before, the last entry wins when a
# URL appears more than once. Non-string names (e.g. NaN) are skipped.
category_names = {}
for _, category_row in dfCategoryUrls.iterrows():
    raw_name = category_row.get('CategoryName')
    if isinstance(raw_name, str):
        category_names[category_row['url']] = raw_name.split("(")[0]

mapped_category_names = df['category'].map(category_names)
if "CategoryName" in df.columns:
    # Keep any existing value on rows that have no match.
    mapped_category_names = mapped_category_names.combine_first(df["CategoryName"])
df["CategoryName"] = mapped_category_names

In [None]:
# Attach the folder display name (text before the first "(") to every row
# whose 'folder' URL has one. A single pass builds a URL -> name lookup,
# replacing the nested row-by-row scan; as before, the last entry wins when a
# URL appears more than once. Non-string names (e.g. NaN) are skipped.
folder_names = {}
for _, folder_row in df_filtered.iterrows():
    raw_name = folder_row.get('folder_name')
    if isinstance(raw_name, str):
        folder_names[folder_row['url']] = raw_name.split("(")[0]

mapped_folder_names = df['folder'].map(folder_names)
if "FolderName" in df.columns:
    # Keep any existing value on rows that have no match.
    mapped_folder_names = mapped_folder_names.combine_first(df["FolderName"])
df["FolderName"] = mapped_folder_names

In [None]:
# Persist the URL table, including the folder/category columns added above, without the index.
df.to_csv("FinalMappingChunkData.csv",index=None)

## CHUNKING MODIFIED

In [None]:
# For every row: read the scraped markdown, trim boilerplate around the
# article body, send it to the SLM endpoint and store the reply under
# md_files_modified/<hashkey>.md.
# open(..., 'w') does not create directories, so make sure the target exists.
os.makedirs("md_files_modified", exist_ok=True)

# iterrows() has no length, so pass the total for a meaningful progress bar.
for i, r in tqdm(df.iterrows(), total=len(df)):
    hashkey = r['hashkey']
    file_path = "md_files/" + hashkey + ".md"
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    if len(content) > 100:
        # Drop everything from the feedback footer onwards.
        content = content.split("Was this article helpful?")[0]
        # Keep the section after the first "MARKDOWN CONTENT" marker; leave the
        # text untouched when the marker is absent instead of raising IndexError.
        sections = content.split("MARKDOWN CONTENT")
        if len(sections) > 1:
            content = sections[1]

    # NOTE(review): on failure get_slm_response returns an "Error: ..." string,
    # which is written to the file like a normal reply.
    response = get_slm_response(content)
    filename = f"md_files_modified/{hashkey}.md"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response)

In [None]:
# Bare expression: displays df when evaluated as the last line of a notebook cell.
df

## INDEXING CHUNKS

In [None]:
import pandas as pd
# Load the final URL/folder/category mapping from disk.
df=pd.read_csv("FinalMappingChunkData.csv")

In [None]:
import pickle
from langchain.schema import Document
from tqdm import tqdm
import pandas as pd
#from llama_index.core.node_parser import SentenceWindowNodeParser
#from llama_index.core import SimpleDirectoryReader

import requests

import io
import os
from openai import OpenAI
from time import sleep

In [None]:
# Inspect the column names available in df.
df.columns

In [None]:
# Build two parallel lists, one entry per row of df:
#   metadata - dict with the row's url, category name and folder name
#   chunks   - text of the row's file under md_files_modified
# NOTE(review): the previous version also built a "URL: ... ------ CONTENT ------"
# string per row but never used it; only the raw file text was appended. That
# behavior is kept here - confirm whether the URL header was meant to be
# part of the indexed text.
metadata = []
chunks = []
for _, r in df.iterrows():
    file_path = "md_files_modified//" + r['hashkey'] + ".md"
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    metadata.append({
        'url': r['url'],
        'Category': r['CategoryName'],
        'Folder': r['FolderName'],
    })
    chunks.append(content)

In [None]:
# Wrap every chunk and its metadata dict in a LangChain Document.
# chunks and metadata are parallel lists, so pair them directly rather than
# indexing through range(len(df)); the unused `content = item` assignment
# is dropped.
docs = [
    Document(page_content=chunk_text, metadata=chunk_metadata)
    for chunk_text, chunk_metadata in zip(chunks, metadata)
]

In [None]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Build a FAISS vector store from the documents, embedding them with the
# OpenAI "text-embedding-3-large" model, and expose it as a retriever.
# NOTE(review): nothing in this call selects a GPU.
vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings(model="text-embedding-3-large"))
retriever = vectorstore.as_retriever()

# TODO: initialize the Mistral language model (no model is created in this cell)


In [None]:
# Save the FAISS index to the local 'indexes-updated-chunks-summarized/' directory.
vectorstore.save_local('indexes-updated-chunks-summarized/')

# VALIDATION DATASET

In [None]:
# Load the validation set; the loop further down reads its 'Query' column.
dfValidation=pd.read_csv("DS_Task_ValSet.csv")
def getResponse(query, timeout=None):
    """POST ``query`` to the API endpoint and return the decoded response body.

    Args:
        query: Text sent under the ``"query"`` key of the JSON payload.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely.

    Returns:
        The decoded response body on HTTP 200. On any failure (non-200
        status, network error, undecodable body) a string starting with
        ``"Error: "`` is returned instead of raising, so callers must check
        the type of the result before indexing into it.
    """
    import ast  # local import: only needed for the literal-parsing fallback

    api_url = "apiURL"
    auth_token = "authToken"  # closing quote was missing (SyntaxError)

    headers = {
        "Authorization": f"Basic {auth_token}",
        "Content-Type": "application/json"
    }
    payload = {
        "query": query,
    }

    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        if response.status_code != 200:
            return f"Error: Status code {response.status_code}"

        # SECURITY: the body comes from the network, so it must not be passed
        # to eval(), which would execute arbitrary code. Decode it as JSON;
        # fall back to literal_eval for bodies that are Python literals.
        try:
            return response.json()
        except ValueError:
            return ast.literal_eval(response.text)
    except Exception as e:
        # Best-effort by design: callers receive the failure as a string.
        return f"Error: {str(e)}"

In [None]:
# Query the API for every validation row and store the answer and metadata.
# iterrows() has no length, so pass the total for a meaningful progress bar.
for i, r in tqdm(dfValidation.iterrows(), total=len(dfValidation)):
    query = r['Query']
    response = getResponse(query)

    # getResponse returns an "Error: ..." string on failure; indexing it like
    # a dict would raise TypeError, so report the failure and move on.
    if not isinstance(response, dict):
        print(f"Row {i}: no usable response ({response})")
        continue

    answer = response.get('content')
    if answer:
        dfValidation.at[i, 'answer'] = answer

    # Metadata is optional: report why it was skipped instead of swallowing
    # every exception with a bare `except`.
    try:
        dfValidation.at[i, 'metadata'] = response['metadata']
    except (KeyError, TypeError, ValueError) as exc:
        print(f"Row {i}: metadata not stored ({exc!r})")
    
    
    

In [None]:
#dfValidation.to_csv("ValidationResultsAgentic.csv",index=None)