In [None]:
!pip install beautifulsoup4 requests



## **FAQ's scraping code**

In [1]:
import requests
import json
from bs4 import BeautifulSoup
import time
import re  

def clean_text(text):
    text = re.sub(r'^\d+\.\s*', '', text)
    return ' '.join(text.split())  

def scrape_faq_page(url):
    try:
        response = requests.get(url, verify=False)
        response.raise_for_status()  

        soup = BeautifulSoup(response.content, 'html.parser')

        main_topic = soup.find('h3', class_='form-header').get_text(strip=True)

        faq_data = []
        questions = soup.find_all('h3', class_='uk-accordion-title')
        answers = soup.find_all('div', class_='uk-accordion-content')

        for idx, (question, answer) in enumerate(zip(questions, answers)):
            cleaned_question = clean_text(question.get_text(strip=True))  
            cleaned_answer = clean_text(answer.get_text(strip=True))  

            if cleaned_question and cleaned_answer:  
                faq_data.append({
                    "id": f"faq_{url.split('/')[-1]}_{idx}", 
                    "context": main_topic,  
                    "question": cleaned_question,  
                    "answer": cleaned_answer 
                })

        return faq_data
    except requests.exceptions.RequestException as e:
        print(f"Request error scraping {url}: {e}")
        return []
    except AttributeError as e:
        print(f"Error parsing HTML for {url}: {e}")
        return []

urls = [
    "https://www.xacbank.mn/faqs/view/10",
    "https://www.xacbank.mn/faqs/view/13",
    "https://www.xacbank.mn/faqs/view/13?lang=mn",
    "https://www.xacbank.mn/faqs/view/14",
    "https://www.xacbank.mn/faqs/view/15",
    "https://www.xacbank.mn/faqs/view/15?lang=mn",
    "https://www.xacbank.mn/faqs/view/16",
    "https://www.xacbank.mn/faqs/view/16?lang=mn",
    "https://www.xacbank.mn/faqs/view/17",
    "https://www.xacbank.mn/faqs/view/18",
    "https://www.xacbank.mn/faqs/view/18?lang=mn",
    "https://www.xacbank.mn/faqs/view/19",
    "https://www.xacbank.mn/faqs/view/19?lang=mn",
    "https://www.xacbank.mn/faqs/view/20",
    "https://www.xacbank.mn/faqs/view/21",
    "https://www.xacbank.mn/faqs/view/22",
    "https://www.xacbank.mn/faqs/view/23",
    "https://www.xacbank.mn/faqs/view/25",
    "https://www.xacbank.mn/faqs/view/26",
    "https://www.xacbank.mn/faqs/view/4",
    "https://www.xacbank.mn/faqs/view/5",
    "https://www.xacbank.mn/faqs/view/9"
]

rag_data = []
for url in urls:
    faq_entries = scrape_faq_page(url)
    rag_data.extend(faq_entries)

    time.sleep(2)

with open('rag_faq_data_cleaned.json', 'w', encoding='utf-8') as f:
    json.dump(rag_data, f, ensure_ascii=False, indent=4)

from google.colab import files
files.download('rag_faq_data_cleaned.json')




ModuleNotFoundError: No module named 'google.colab'

## **Product scraping code**


In [70]:
import csv
import requests
from bs4 import BeautifulSoup
import re

urls = [
  "https://www.xacbank.mn/product/1",
  "https://www.xacbank.mn/product/102",
  "https://www.xacbank.mn/product/104",
  "https://www.xacbank.mn/product/109",
  "https://www.xacbank.mn/product/111",
  "https://www.xacbank.mn/product/113",
  "https://www.xacbank.mn/product/1136",
  "https://www.xacbank.mn/product/1137",
  "https://www.xacbank.mn/product/1138",
  "https://www.xacbank.mn/product/1139",
  "https://www.xacbank.mn/product/1140",
  "https://www.xacbank.mn/product/1141",
  "https://www.xacbank.mn/product/1142",
  "https://www.xacbank.mn/product/1143",
  "https://www.xacbank.mn/product/1145",
  "https://www.xacbank.mn/product/1147",
  "https://www.xacbank.mn/product/1148",
  "https://www.xacbank.mn/product/1149",
  "https://www.xacbank.mn/product/1150",
  "https://www.xacbank.mn/product/1151",
  "https://www.xacbank.mn/product/1152",
  "https://www.xacbank.mn/product/1153",
  "https://www.xacbank.mn/product/1154",
  "https://www.xacbank.mn/product/1155",
  "https://www.xacbank.mn/product/116",
  "https://www.xacbank.mn/product/117",
  "https://www.xacbank.mn/product/118",
  "https://www.xacbank.mn/product/119",
  "https://www.xacbank.mn/product/12",
  "https://www.xacbank.mn/product/121",
  "https://www.xacbank.mn/product/122",
  "https://www.xacbank.mn/product/123",
  "https://www.xacbank.mn/product/125",
  "https://www.xacbank.mn/product/129",
  "https://www.xacbank.mn/product/131",
  "https://www.xacbank.mn/product/132",
  "https://www.xacbank.mn/product/136",
  "https://www.xacbank.mn/product/14",
  "https://www.xacbank.mn/product/15",
  "https://www.xacbank.mn/product/19",
  "https://www.xacbank.mn/product/24",
  "https://www.xacbank.mn/product/25",
  "https://www.xacbank.mn/product/26",
  "https://www.xacbank.mn/product/36",
  "https://www.xacbank.mn/product/4",
  "https://www.xacbank.mn/product/40",
  "https://www.xacbank.mn/product/42",
  "https://www.xacbank.mn/product/43",
  "https://www.xacbank.mn/product/44",
  "https://www.xacbank.mn/product/46",
  "https://www.xacbank.mn/product/48",
  "https://www.xacbank.mn/product/5",
  "https://www.xacbank.mn/product/50",
  "https://www.xacbank.mn/product/51",
  "https://www.xacbank.mn/product/52",
  "https://www.xacbank.mn/product/54",
  "https://www.xacbank.mn/product/55",
  "https://www.xacbank.mn/product/56",
  "https://www.xacbank.mn/product/58",
  "https://www.xacbank.mn/product/59",
  "https://www.xacbank.mn/product/6",
  "https://www.xacbank.mn/product/60",
  "https://www.xacbank.mn/product/61",
  "https://www.xacbank.mn/product/63",
  "https://www.xacbank.mn/product/65",
  "https://www.xacbank.mn/product/66",
  "https://www.xacbank.mn/product/67",
  "https://www.xacbank.mn/product/7",
  "https://www.xacbank.mn/product/71",
  "https://www.xacbank.mn/product/72",
  "https://www.xacbank.mn/product/74",
  "https://www.xacbank.mn/product/76",
  "https://www.xacbank.mn/product/77",
  "https://www.xacbank.mn/product/79",
  "https://www.xacbank.mn/product/8",
  "https://www.xacbank.mn/product/80",
  "https://www.xacbank.mn/product/84",
  "https://www.xacbank.mn/product/85",
  "https://www.xacbank.mn/product/86",
  "https://www.xacbank.mn/product/88",
  "https://www.xacbank.mn/product/89",
  "https://www.xacbank.mn/product/9",
  "https://www.xacbank.mn/product/90",
  "https://www.xacbank.mn/product/93",
  "https://www.xacbank.mn/product/94",
  "https://www.xacbank.mn/product/97"
]

def extract_product_info(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')

    product_info = []

    context = soup.find('h4', class_=re.compile(r'.*title uk-clearfix.*'))
    context = context.get_text(strip=True)

    xactitle_tags = soup.find_all(class_=re.compile(r'.*xactitle.*'))
    for tag in xactitle_tags:
        question = tag.get_text(strip=True)

        next_element = tag.find_next(['ul', 'p', 'table'])  
        if next_element:
            if next_element.name == 'ul':
                answer = "\n".join([li.get_text(strip=True).replace(';', '.') + '.' for li in next_element.find_all('li')])
            elif next_element.name == 'p':
                answer = next_element.get_text(strip=True)

                links_in_paragraph = next_element.find_all('a', href=True)
                if links_in_paragraph:
                    links = [link['href'] for link in links_in_paragraph]
                    answer += f"\nLinks: {', '.join(links)}"
                else:
                    answer += f"\nNo link found. Check URL: {url}"
            elif next_element.name == 'table':
                rows = next_element.find_all('tr')
                table_data = []
                for row in rows:
                    cols = row.find_all('td')
                    cols = [col.get_text(strip=True) for col in cols]
                    if cols:
                        table_data.append(cols)
                answer = "\n".join([", ".join(row) for row in table_data])
        else:
            answer = "No answer available"

        product_info.append({'url': url, 'question': question, 'context': context, 'answer': answer})

    return product_info

with open('product_info.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['url', 'question', 'context', 'answer'])
    writer.writeheader()

    for url in urls:
        product_info = extract_product_info(url)
        for item in product_info:
            writer.writerow(item)
        print(url)
print("Data has been written to product_info.csv")




https://www.xacbank.mn/product/1




https://www.xacbank.mn/product/102




https://www.xacbank.mn/product/104




https://www.xacbank.mn/product/109




https://www.xacbank.mn/product/111




https://www.xacbank.mn/product/113




https://www.xacbank.mn/product/1136




https://www.xacbank.mn/product/1137




https://www.xacbank.mn/product/1138




https://www.xacbank.mn/product/1139




https://www.xacbank.mn/product/1140




https://www.xacbank.mn/product/1141




https://www.xacbank.mn/product/1142




https://www.xacbank.mn/product/1143




https://www.xacbank.mn/product/1145




https://www.xacbank.mn/product/1147




https://www.xacbank.mn/product/1148




https://www.xacbank.mn/product/1149




https://www.xacbank.mn/product/1150




https://www.xacbank.mn/product/1151




https://www.xacbank.mn/product/1152




https://www.xacbank.mn/product/1153




https://www.xacbank.mn/product/1154




https://www.xacbank.mn/product/1155




https://www.xacbank.mn/product/116




https://www.xacbank.mn/product/117




https://www.xacbank.mn/product/118




https://www.xacbank.mn/product/119




https://www.xacbank.mn/product/12




https://www.xacbank.mn/product/121




https://www.xacbank.mn/product/122




https://www.xacbank.mn/product/123




https://www.xacbank.mn/product/125




https://www.xacbank.mn/product/129




https://www.xacbank.mn/product/131




https://www.xacbank.mn/product/132




https://www.xacbank.mn/product/136




https://www.xacbank.mn/product/14




https://www.xacbank.mn/product/15




https://www.xacbank.mn/product/19




https://www.xacbank.mn/product/24




https://www.xacbank.mn/product/25




https://www.xacbank.mn/product/26




https://www.xacbank.mn/product/36




https://www.xacbank.mn/product/4




https://www.xacbank.mn/product/40




https://www.xacbank.mn/product/42




https://www.xacbank.mn/product/43




https://www.xacbank.mn/product/44




https://www.xacbank.mn/product/46




https://www.xacbank.mn/product/48




https://www.xacbank.mn/product/5




https://www.xacbank.mn/product/50




https://www.xacbank.mn/product/51




https://www.xacbank.mn/product/52




https://www.xacbank.mn/product/54




https://www.xacbank.mn/product/55




https://www.xacbank.mn/product/56




https://www.xacbank.mn/product/58




https://www.xacbank.mn/product/59




https://www.xacbank.mn/product/6




https://www.xacbank.mn/product/60




https://www.xacbank.mn/product/61




https://www.xacbank.mn/product/63




https://www.xacbank.mn/product/65




https://www.xacbank.mn/product/66




https://www.xacbank.mn/product/67




https://www.xacbank.mn/product/7




https://www.xacbank.mn/product/71




https://www.xacbank.mn/product/72




https://www.xacbank.mn/product/74




https://www.xacbank.mn/product/76




https://www.xacbank.mn/product/77




https://www.xacbank.mn/product/79




https://www.xacbank.mn/product/8




https://www.xacbank.mn/product/80




https://www.xacbank.mn/product/84




https://www.xacbank.mn/product/85




https://www.xacbank.mn/product/86




https://www.xacbank.mn/product/88




https://www.xacbank.mn/product/89




https://www.xacbank.mn/product/9




https://www.xacbank.mn/product/90




https://www.xacbank.mn/product/93




https://www.xacbank.mn/product/94




https://www.xacbank.mn/product/97
Data has been written to product_info.csv
