Project Overview

What is RAG and why are we studying it?

- Let's go back to 2020 and understand the world of AI back then. (Seems like a long time ago!)

- We had Large Language Models with billions (even trillions) of parameters, which serve as their parametric memory.
- These models were able to give good results on most NLU tasks!
- Then a paper came out from Facebook Research introducing RAG: Retrieval-Augmented Generation!
- For language generation tasks, they found that RAG models generate more specific, diverse and factual language than a state-of-the-art parametric-only seq2seq baseline.

# Notebook to Scrape Data from Investopedia

RAG Paper: https://arxiv.org/pdf/2005.11401.pdf

In [3]:
## Load Environment Variables
import os
from pathlib import Path
from dotenv import load_dotenv
# Locate the project's .env by searching upward from the current working
# directory rather than relying on a user-specific absolute path, so the
# notebook can run from any checkout of the project.
# As the last expression of the cell, the call's boolean result is displayed.
from dotenv import find_dotenv
load_dotenv(find_dotenv(usecwd=True))

True

In [4]:
import string
import requests
import fitz
import pandas as pd
from time import perf_counter as timer
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
# Default HTTP request headers: present the scraper as a desktop Chrome browser.
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
headers = {'user-agent': _USER_AGENT}

In [8]:
# Resolve the raw-scrape and preprocessed data folders from environment variables.
scrape_data_path = Path(os.environ["BASE_SCRAPE_DATA_DIR"]) / "investopedia_data"
# Only the scrape folder is created here; the preprocessed folder is just resolved.
scrape_data_path.mkdir(parents=True, exist_ok=True)
preprocessed_data_path = Path(os.environ["PREPROCESSED_DATA_DIR"]) / "investopedia_data"
print(f"[INFO]:\n{scrape_data_path=} \n{preprocessed_data_path=}")

[INFO]:
scrape_data_path=WindowsPath('C:/Users/erdrr/OneDrive/Desktop/Scholastic/NLP/LLM/RAG/FinsightRAG/data/raw/scraped/investopedia_data') 
preprocessed_data_path=WindowsPath('C:/Users/erdrr/OneDrive/Desktop/Scholastic/NLP/LLM/RAG/FinsightRAG/data/preprocessed/investopedia_data')


## Scrape Data

In [9]:
class InvestopediaScrape:
    """
    Scrape Investopedia dictionary terms and store each article as a PDF.

    Typical use: call ``get_all_pagination`` to obtain the entries of the
    home page's terms bar, then call ``scrape`` once per index-page URL.

    Each PDF is written to ``<scrape_data_path>/<section>/<file_name>.pdf``,
    where ``<section>`` is the second-to-last ``/``-separated segment of the
    term URL.

    The class relies on names defined at notebook level: ``headers`` (the
    request headers) and the text helpers ``preprocess_text`` and
    ``preprocess_text_math``.
    """

    # Seconds to wait for a server response. requests applies no timeout by
    # default, so without this a single stalled connection would hang the crawl.
    REQUEST_TIMEOUT = 30

    def __init__(self, scrape_data_path):
        """Remember the root output directory and announce the start of the run."""
        self.scrape_data_path = scrape_data_path
        print("Starting...", flush=True)

    def _fetch(self, url):
        """GET ``url`` with the shared headers and a timeout; raise on HTTP error status."""
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of parsing an error page as content.
        response.raise_for_status()
        return response

    def get_all_pagination(self):
        """
        Return the ``<li>`` tags of the terms bar on the Investopedia home page.

        Raises:
            requests.RequestException: on network failure, timeout or HTTP error status.
            AttributeError: if the ``terms-bar__list`` element is not in the page.
        """
        url = 'https://www.investopedia.com/'
        r = self._fetch(url)
        soup = BeautifulSoup(r.text,'lxml')
        all_lists = soup.find('ul', {'class': 'terms-bar__list'}).find_all('li')
        return all_lists

    def scrape(self, url):
        """
        Download every term linked from the index page ``url`` and save each as a PDF.

        If the index page cannot be fetched or does not contain the expected
        term list, the error is printed and the method returns without
        scraping anything. A term page that cannot be fetched is reported and
        skipped so the remaining terms are still processed.

        NOTE(review): reportlab's ``Paragraph`` interprets its text as markup;
        confirm that the preprocess helpers escape characters such as ``&``
        and ``<``.
        """
        try:
            r = self._fetch(url)
            soup = BeautifulSoup(r.text,'lxml')
            term_urls = soup.find('div', {'class': 'dictionary-top300-list__list-content'}).find_all('a')
        except (requests.RequestException, AttributeError) as e:
            print(f"Error scraping {url}: {e}")
            return

        # The style sheet does not depend on the term, so build it once.
        styles = getSampleStyleSheet()

        for term_url in term_urls:
            # Keep only the part of the link before any '='.
            term_href = term_url['href'].split('=')[0]
            href_parts = term_href.split('/')
            section = href_parts[-2]
            slug = href_parts[-1]

            file_path = os.path.join(self.scrape_data_path, section)
            file_name = "Investopedia_" + section + "_what_is_" + slug.replace(".asp", "").replace("-","_")
            file_name = ''.join(x for x in file_name.title() if not x.isspace())
            pdf_path = os.path.join(file_path, f'{file_name}.pdf')

            try:
                r = self._fetch(term_href)
            except requests.RequestException as e:
                # One unreachable article should not abort the whole crawl.
                print(f"Error scraping {term_href}: {e}")
                continue
            soup = BeautifulSoup(r.content,'lxml')

            # Collecting text data and removing HTML tags by converting to text
            text_elements = soup.find_all('div', {'class': 'article-content'})
            cleaned_text = ' '.join(element.get_text(" ", strip=True) for element in text_elements)
            cleaned_text = preprocess_text(cleaned_text)
            cleaned_text = preprocess_text_math(cleaned_text)

            # Wrap the text in a single Paragraph and build the PDF
            os.makedirs(file_path, exist_ok=True)
            doc = SimpleDocTemplate(pdf_path, pagesize=letter)
            doc.build([Paragraph(cleaned_text, styles["Normal"])])

    def close(self):
        """Print the end-of-run message."""
        print("[INFO]: Scraping finished.")

In [5]:
# Crawl every index page listed in the home page's terms bar and time the whole run.
start_time = timer()
crawler = InvestopediaScrape(scrape_data_path)
all_pagination_links = crawler.get_all_pagination()
for page in all_pagination_links:
    anchor = page.find('a')
    # Keep only the part of the link before any '='.
    page_url = anchor['href'].split('=')[0]
    print(f"[INFO]: Fetching terms for the URL: {page_url}", flush=True)
    crawler.scrape(page_url)
end_time = timer()
elapsed = end_time - start_time
print(f"[INFO]: Total Time: {elapsed:.5f} seconds.")

Starting...
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-num-4769350
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-a-4769351
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-b-4769352
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-c-4769353
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-d-4769354
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-e-4769355
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-f-4769356
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-g-4769357
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-h-4769358
[INFO]: Fetching terms for the URL: https://www.investopedia.com/terms-beginning-with-i-4769359
[INFO]: Fetching terms for

Reference: https://github.com/chankeypathak/investopedia-terms

## Get Data Stats

In [18]:
def get_data_stats(directory):
    """
    Collect page and word counts for every PDF under ``directory``.

    The directory is walked recursively. Files whose name ends in ``.pdf``
    (case-insensitive) are opened with ``fitz``; a file that cannot be read is
    reported with a printed message and left out of the result.

    Args:
        directory: Root folder to search.

    Returns:
        pandas.DataFrame with one row per readable PDF and the columns
        ``file_name``, ``total_pages`` and ``word_count``. The columns are
        present even when no PDF is found, so column lookups on the result
        never fail with ``KeyError``.
    """
    stat_columns = ['file_name', 'total_pages', 'word_count']

    # List to store metadata of all PDFs
    pdf_metadata = []

    # Walk through the directory and its subdirectories
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.lower().endswith('.pdf'):
                continue

            # Full path of the PDF file
            file_path = os.path.join(root, file)

            # Try to open and read the PDF file
            try:
                with fitz.open(file_path) as doc:
                    total_pages = doc.page_count
                    # Words are whitespace-separated tokens of each page's plain text.
                    word_count = sum(len(page.get_text("text").split()) for page in doc)

                    # Append metadata to the list
                    pdf_metadata.append({
                        'file_name': file,
                        'total_pages': total_pages,
                        'word_count': word_count
                    })
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    # Fix the column set explicitly so an empty result still has the expected schema.
    return pd.DataFrame(pdf_metadata, columns=stat_columns)

In [19]:
# Build the per-PDF stats table for everything under the scrape directory.
df = get_data_stats(scrape_data_path)

In [20]:
# Display the stats table (one row per PDF: file name, page count, word count).
df

Unnamed: 0,file_name,total_pages,word_count
0,Investopedia_03_What_Is_071603.pdf,2,896
1,Investopedia_042315_What_Is_How_Do_Prepaid_Deb...,2,1062
2,Investopedia_05_What_Is_Economicmoat.pdf,2,1295
3,Investopedia_063015_What_Is_What_Effective_Int...,2,1514
4,Investopedia_06_What_Is_Putcallratio.pdf,2,1120
...,...,...,...
6248,Investopedia_Z_What_Is_Ztranche.pdf,2,1242
6249,Investopedia_Z_What_Is_Zzzzbest.pdf,2,1119
6250,Investopedia_Z_What_Is_Z_Bond.pdf,1,558
6251,Investopedia_Z_What_Is_Z_Share.pdf,1,500


In [23]:
# Summary statistics of the page and word counts across all PDFs.
df.describe()

Unnamed: 0,total_pages,word_count
count,6253.0,6253.0
mean,1.636654,1020.837998
std,0.746337,541.591345
min,1.0,92.0
25%,1.0,644.0
50%,2.0,873.0
75%,2.0,1237.0
max,11.0,8421.0


In [22]:
# Word counts ordered from the longest document to the shortest.
df['word_count'].sort_values(ascending=False)

4134    8421
590     6860
1444    6855
4184    5167
1453    5020
        ... 
6036     242
2097     225
5915     179
5646     178
3887      92
Name: word_count, Length: 6253, dtype: int64