In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import nbformat as nbf
import json
import os

# Listing page that is scraped first; the query string asks for 100 entries
# per page, ordered by the difficulty field.
url = (
    "https://platform.stratascratch.com/data-projects"
    "?page_size=100&order_field=difficulty"
)


In [5]:
class Payload:
    """Minimal mutable holder for one value.

    The held value is kept in the public ``data`` attribute and is ``None``
    until ``set_payload`` is called.
    """

    def __init__(self):
        # Nothing stored yet.
        self.data = None

    def set_payload(self, data):
        """Store ``data``, replacing whatever was held before."""
        self.data = data

    def get_payload(self):
        """Return the currently held value."""
        return self.data
    

# Module-level holders: the driver loop stores the current project's category
# and difficulty here, and getting_data_info reads them back while building
# that project's notebook.
category, difficulty = Payload(), Payload()

In [6]:
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The driver binary is resolved through ``ChromeDriverManager().install()``.
    The page load strategy is set to ``'none'``; callers are expected to wait
    explicitly for the element they need (see ``scraping_data``).
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.page_load_strategy = 'none'

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def scraping_data(url, class_name_presence):
    """Load ``url`` in a fresh browser and hand the parsed page to a handler.

    Waits up to 10 seconds for an element with CSS class
    ``class_name_presence`` to be present, parses the page source with
    BeautifulSoup and then dispatches on ``class_name_presence``:

    * ``'contents'`` -> ``getting_list_data(soup)``
    * ``'QuestionDetails__contents'`` -> ``getting_data_info(url, soup)``

    Any other value loads the page but does nothing with it. A timeout is
    reported with a printed message instead of being raised. The browser is
    always closed before returning.
    """
    browser = setup_driver()

    try:
        browser.get(url)

        locator = (By.CLASS_NAME, class_name_presence)
        WebDriverWait(browser, 10).until(EC.presence_of_element_located(locator))

        page_source = browser.page_source
        parsed_page = BeautifulSoup(page_source, "html.parser")

        if class_name_presence == 'contents':
            getting_list_data(parsed_page)
        elif class_name_presence == 'QuestionDetails__contents':
            getting_data_info(url, parsed_page)

    except TimeoutException:
        print("Loading took too much time!")

    finally:
        browser.quit()

def getting_list_data(soup):
    """Extract one record per project link from the listing page.

    Every ``<a class="group contents">`` element yields a dict with the keys
    ``link``, ``company``, ``difficulty``, ``category`` and ``title``, read
    from the anchor's ``href`` and its descendant ``<div>`` elements by
    position. The records are written to ``data_projects.json`` and the
    ``Data Project notebook`` folder is created.
    """
    projects = []

    for anchor in soup.find_all('a', 'group contents'):
        cells = anchor.find_all('div')
        projects.append({
            'link': 'https://platform.stratascratch.com' + anchor.get('href'),
            'company': cells[0].getText(),
            'difficulty': cells[1].find('span').getText(),
            'category': cells[2].getText(),
            # Index 3 is skipped on purpose: the title is read from the fifth div.
            'title': cells[4].getText(),
        })

    json_dump('data_projects.json', projects)
    create_folder('Data Project notebook')
    
def getting_data_info(link, soup):
    """Build a starter notebook for one project page and save it to disk.

    The notebook holds markdown cells for the link, the difficulty (read from
    the module-level ``difficulty`` holder), the assignment heading and the
    ``prose`` description, followed by empty "Data" and "Solution" sections.
    It is written to
    ``Data Project notebook/<category>/<title>/notebook.ipynb`` where
    ``<category>`` comes from the module-level ``category`` holder and
    ``<title>`` is the heading, stripped and with ``?`` removed.

    Args:
        link: URL of the project page; embedded in the first cell.
        soup: Parsed project page (BeautifulSoup object).

    Raises:
        AttributeError: if the page has no ``h1.Assignment__heading`` element.
    """
    # Look the heading up once; it is used both as a cell and as a folder name.
    heading = soup.find('h1', 'Assignment__heading').getText()
    # '?' is dropped from the folder name (presumably because it is not a
    # valid file-name character on Windows).
    folder_title = heading.strip().replace('?', '')

    payload = [
        nbf.v4.new_markdown_cell(f"### **Link:** {link}"),
        nbf.v4.new_markdown_cell(f"### **Difficulty:** {difficulty.get_payload()}"),
        nbf.v4.new_markdown_cell(f"# {heading}"),
        # Keep the description's HTML but drop the site's styling classes.
        nbf.v4.new_markdown_cell(str(soup.find('div','prose')).replace(' class="prose prose-lg dark:prose-invert max-w-fit"', '')),
        nbf.v4.new_markdown_cell("## **Data:**"),
        nbf.v4.new_code_cell(),
        nbf.v4.new_markdown_cell("## **Solution:**"),
        nbf.v4.new_code_cell(),
    ]

    base_folder = r'Data Project notebook'
    category_folder = os.path.join(base_folder, f'{category.get_payload()}')
    # Join with os.path.join rather than a literal backslash so the notebook
    # is written into the very directory create_folder makes, on every OS.
    notebook_folder = os.path.join(category_folder, folder_title)

    create_folder(folder_title, category_folder)
    create_notebook(payload, 'notebook.ipynb', notebook_folder)

def json_dump(file, data):
    """Serialise ``data`` as JSON into ``file``, replacing any existing content.

    Args:
        file: Path of the file to write.
        data: JSON-serialisable object.
    """
    with open(file, 'w+') as out_handle:
        json.dump(obj=data, fp=out_handle)

def create_folder(folder_name, folder_path=""):
    """Create ``folder_path/folder_name`` (and missing parents) if absent.

    A relative result is anchored at the current working directory. If
    something already exists at the target path, nothing is done.

    Args:
        folder_name: Name (or relative path) of the folder to create.
        folder_path: Optional parent directory; defaults to ``""``.
    """
    target = os.path.join(folder_path, folder_name)
    target = target if os.path.isabs(target) else os.path.join(os.getcwd(), target)

    if os.path.exists(target):
        return

    os.makedirs(target)

def create_notebook(payload, name, path):
    """Write a new v4 notebook whose cells are ``payload``.

    Args:
        payload: List of notebook cells to store in the notebook.
        name: File name of the notebook.
        path: Directory to write into; joined to ``name`` with ``/``.
    """
    nb = nbf.v4.new_notebook()
    nb['cells'] = payload

    destination = '/'.join([path, name])
    with open(destination, 'w', encoding="utf-8") as out_handle:
        nbf.write(nb, out_handle)


In [61]:
# Scrape the listing page; the 'contents' handler writes data_projects.json,
# which the per-project loop reads.
scraping_data(url=url, class_name_presence='contents')

In [7]:
# For every saved project: make sure its category folder exists, publish the
# category/difficulty through the module-level holders, then scrape the
# project's own page to generate its notebook.
with open('data_projects.json', 'r+') as projects_file:
    for project in json.load(projects_file):
        create_folder(project['category'], r'Data Project notebook')

        category.set_payload(project['category'])
        difficulty.set_payload(project['difficulty'])

        scraping_data(project['link'], 'QuestionDetails__contents')
            