In [1]:
# Standard libraries
import os
import json
import re
from datetime import datetime
from uuid import uuid4

# Third-party libraries
import requests
import pinecone
from bs4 import BeautifulSoup
from tqdm import tqdm
from tqdm.auto import tqdm

# LangChain libraries
import langchain
from langchain.document_loaders import PyPDFLoader
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from dotenv import load_dotenv


load_dotenv()

# Access the OpenAI API key from environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key not found in environment variables")

# If you need to set it as an environment variable for other modules
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY


model_name = 'text-embedding-ada-002'

embeddings = OpenAIEmbeddings(
    model=model_name,
)

  from tqdm.autonotebook import tqdm


In [9]:
url = "https://portal.vik.bme.hu/kepzes/targyak/?order=s.code&own=&department_id=all&has_datasheet=all&active=1&program=all"
base_url = 'https://portal.vik.bme.hu'

# Send a GET request
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)
    course_links = [link['href'] for link in links if 'kepzes/targyak/' in link['href']]
    print(course_links)
else:
    print("Failed to retrieve webpage")


['https://portal.vik.bme.hu/kepzes/targyak/', '/kepzes/targyak/EOAF-L52/', '/kepzes/targyak/EOAF-L52/', '/kepzes/targyak/EOAF-X03/', '/kepzes/targyak/EOAF-X03/', '/kepzes/targyak/EOAFM351/', '/kepzes/targyak/EOAFM351/', '/kepzes/targyak/EOEMKM02/', '/kepzes/targyak/EOEMKM02/', '/kepzes/targyak/EOFT5096/', '/kepzes/targyak/EOFT5096/', '/kepzes/targyak/EOFTIF03/', '/kepzes/targyak/EOFTIF03/', '/kepzes/targyak/EOFTM361/', '/kepzes/targyak/EOFTM361/', '/kepzes/targyak/EOFTMI01/', '/kepzes/targyak/EOFTMI01/', '/kepzes/targyak/EOMEMM05/', '/kepzes/targyak/EOMEMM05/', '/kepzes/targyak/EOTM1MEC/', '/kepzes/targyak/EOTM1MEC/', '/kepzes/targyak/EOTMMM05/', '/kepzes/targyak/EOTMMM05/', '/kepzes/targyak/EOUV0103/', '/kepzes/targyak/EOUV0103/', '/kepzes/targyak/EOUV5132/', '/kepzes/targyak/EOUV5132/', '/kepzes/targyak/EOVK-X02/', '/kepzes/targyak/EOVK-X02/', '/kepzes/targyak/EOVKAV29/', '/kepzes/targyak/EOVKAV29/', '/kepzes/targyak/EOVKMM01/', '/kepzes/targyak/EOVKMM01/', '/kepzes/targyak/EOVV-X14/

In [48]:
assert '/kepzes/targyak/TE90AX04/' in course_links

In [49]:
def hungarian_date_to_english(date_str):
    hungarian_months = {
        "január": "January",
        "február": "February",
        "március": "March",
        "április": "April",
        "május": "May",
        "június": "June",
        "július": "July",
        "augusztus": "August",
        "szeptember": "September",
        "október": "October",
        "november": "November",
        "december": "December"
    }
    for hu, en in hungarian_months.items():
        date_str = date_str.replace(hu, en)
    return date_str

In [59]:
def get_course_details(url):
    response = requests.get(url)
    response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
    soup = BeautifulSoup(response.text, 'html.parser')

    if "Tantárgy lejárati dátuma:" in soup.text:
        print('Course expired')
        return None

    date_tag = soup.find('p', class_='date')

    if date_tag is None:
        date_text = None
    else:
        date_text = re.search(r'\d{4}.\s?\w+.\s?\d{1,2}.', date_tag.text)
    formatted_date = None
    if date_text:
        date_text = date_text.group()
        # Adjust the format according to your requirements and locale
        date_text_english = hungarian_date_to_english(date_text)

        formatted_date = datetime.strptime(date_text_english, '%Y. %B %d.').strftime('%Y-%m-%d')

    table = soup.find('table', {'align': 'center', 'border': '1', 'cellpadding': '4', 'cellspacing': '2', 'width': '600'})
    if not table:
        return None

    rows = table.find_all('tr')[1:]  # Skipping the header row
    course_data = []
    for row in rows:
        cols = [ele.text.strip() for ele in row.find_all('td')]
        title = soup.find('p', class_='title').text.strip()
        course_data.append({
            'id': cols[0],
            'credit': cols[3],
            'type': cols[2],
            'semester': cols[1],
            'title': title,
            'last_modified': formatted_date
        })
    return course_data

def get_all_text_from_website(url):
    response = requests.get(url)
    response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text(separator='\n', strip=True)

def scrape_courses(course_links, base_url):
    courses = []
    for link in tqdm(course_links):
        full_url = base_url + link
        try:
            course_details = get_course_details(full_url)
            # print(course_details)
            if course_details:
                all_text = get_all_text_from_website(full_url)
                course_details[0]['text'] = all_text
                courses.extend(course_details)
        except requests.HTTPError as e:
            print(f"Failed to retrieve webpage: {e}")
    return courses


courses_data = scrape_courses(course_links[1:], base_url)


  1%|          | 47/5380 [00:29<46:19,  1.92it/s]  

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T2071/


  1%|          | 48/5380 [00:29<49:13,  1.81it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T2071/


  1%|          | 49/5380 [00:30<50:11,  1.77it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T4176/


  1%|          | 50/5380 [00:30<48:42,  1.82it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T4176/


  1%|          | 51/5380 [00:31<45:29,  1.95it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T4A04/


  1%|          | 52/5380 [00:31<44:52,  1.98it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T4A04/


  1%|          | 53/5380 [00:32<43:22,  2.05it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T4A13/


  1%|          | 54/5380 [00:32<48:10,  1.84it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T4A13/


  1%|          | 55/5380 [00:33<49:59,  1.78it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T5120/


  1%|          | 56/5380 [00:33<48:31,  1.83it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81T5120/


  1%|          | 57/5380 [00:34<50:14,  1.77it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81TKM05/


  1%|          | 58/5380 [00:35<48:42,  1.82it/s]

Failed to retrieve webpage: 404 Client Error: Not Found for url: https://portal.vik.bme.hu/kepzes/targyak/GE%C3%81TKM05/


 49%|████▉     | 2653/5380 [41:20<56:06,  1.23s/it]  

Course expired


 49%|████▉     | 2654/5380 [41:21<49:01,  1.08s/it]

Course expired


100%|██████████| 5380/5380 [1:35:51<00:00,  1.07s/it]  


In [60]:
print(len(courses_data))

4256


In [61]:
def parse_date(date_str):
    return datetime.strptime(date_str, '%Y-%m-%d')

def keep_most_recent_courses(course_data):
    courses_by_title = {}
    for course in course_data:
        title = course['title']
        date = parse_date(course['last_modified'])

        if title in courses_by_title:
            existing_date = parse_date(courses_by_title[title]['last_modified'])
            if date > existing_date:
                courses_by_title[title] = course
        else:
            courses_by_title[title] = course

    return list(courses_by_title.values())

# Assuming course_data is your list of courses
updated_course_data = keep_most_recent_courses(courses_data)

# Now updated_course_data contains only the most recent


In [62]:
print(len(updated_course_data))

1367


In [2]:
file_path = '../documents/tad/tad.json'

In [82]:

with open(file_path, 'w') as file:
    json.dump(updated_course_data, file)

file_path

'documents/tad/tad.json'

In [3]:
with open(file_path, 'r') as file:
    updated_course_data = json.load(file)

In [4]:
print(updated_course_data[100]['text'][:500])

BME VIK - A bankinformatika számítástechnikai alapjai
Budapest University of Technology and Economics, Faculty of Electrical Engineering and Informatics
A bankinformatika számítástechnikai alapjai
Belépés
címtáras azonosítással
tantárgyi adatlapok
magyar nyelvű adatlap
vissza a tantárgylistához
nyomtatható verzió
A bankinformatika számítástechnikai alapjai
A tantárgy angol neve: Fundamentals of banking IT systems
Adatlap utolsó módosítása: 2006. július 1.
Budapesti Műszaki és Gazdaságtudományi E


In [5]:
updated_course_data[100]

{'id': 'GT467552',
 'credit': '3',
 'type': '2/0/0/v',
 'semester': '8',
 'title': 'A bankinformatika számítástechnikai alapjai',
 'last_modified': '2006-07-01',
 'text': 'BME VIK - A bankinformatika számítástechnikai alapjai\nBudapest University of Technology and Economics, Faculty of Electrical Engineering and Informatics\nA bankinformatika számítástechnikai alapjai\nBelépés\ncímtáras azonosítással\ntantárgyi adatlapok\nmagyar nyelvű adatlap\nvissza a tantárgylistához\nnyomtatható verzió\nA bankinformatika számítástechnikai alapjai\nA tantárgy angol neve: Fundamentals of banking IT systems\nAdatlap utolsó módosítása: 2006. július 1.\nBudapesti Műszaki és Gazdaságtudományi Egyetem\nVillamosmérnöki és Informatikai Kar\nVillamosmérnök, informatikus,\nKötelezően választható tárgy\nTantárgykód\nSzemeszter\nKövetelmények\nKredit\nTantárgyfélév\nGT467552\n8\n2/0/0/v\n3\n1/1\n4. A tantárgy előadója\nNév:\nBeosztás:\nTanszék, Int.:\nDr. Angyal Zoltán\nTud. munkatárs\nInformáció- és Tudásmened

In [10]:
texts = []
metadatas = []
documents = []

for i, record in enumerate(tqdm(updated_course_data)):
    # first get metadata fields for this record
    metadata = {
        'data_type': 'tad-page',
        'source': base_url +'/kepzes/targyak/' + record['id'],
        'title': "BME TAD oldal",
        'page_content': record['text'],
        'course_text': record['text'],
        'course_title': record['title'],
        'course_id': record['id'],
        'course_credit': record['credit'],
        'course_type': record['type'],
        'course_semester': record['semester'],
        'course_last_modified': record['last_modified']
    }
    metadatas.append(metadata)
    texts.append(record)
    documents.append(Document(
            page_content=metadata['page_content'],
            metadata=metadata
        ))

ids = [str(uuid4()) for _ in range(len(texts))]



100%|██████████| 1367/1367 [00:00<00:00, 56536.15it/s]


In [None]:
with open('../documents/tad/tad-meta.json', 'w', encoding='utf-8') as f:
    json.dump(metadatas, f, ensure_ascii=False, indent=4)

In [None]:
db = FAISS.from_texts([''], embeddings)
db.add_documents(documents=documents )
db.save_local('../faiss_db/tad-db')

In [None]:
texts = []
metadatas = []
documents = []

for i, record in enumerate(tqdm(updated_course_data)):
    # first get metadata fields for this record
    metadata = {
        'data_type': 'tad-page',
        'source': base_url +'/kepzes/targyak/' + record['id'],
        'title': "BME TAD oldal",
        'page_content': record['title'],
        'course_text' : record['text'],
        'course_title': record['title'],
        'course_id': record['id'],
        'course_credit': record['credit'],
        'course_type': record['type'],
        'course_semester': record['semester'],
        'course_last_modified': record['last_modified']
    }
    metadatas.append(metadata)
    texts.append(record)
    documents.append(Document(
            page_content=metadata['page_content'],
            metadata=metadata
        ))

ids = [str(uuid4()) for _ in range(len(texts))]

with open('../documents/tad/tad-meta-title-key.json', 'w', encoding='utf-8') as f:
    json.dump(metadatas, f, ensure_ascii=False, indent=4)

model_name = 'text-embedding-ada-002'

embeddings = OpenAIEmbeddings(
    model=model_name,
)
db = FAISS.from_texts([''], embeddings)
db.add_documents(documents=documents )
db.save_local('../faiss_db/tad-db-title-key')

In [5]:
# _________________________________________________________________________________________________________________________________