In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from pathlib import Path

from bs4 import BeautifulSoup
import time
import json


In [3]:
def setup_driver():
    """Create a headless Chrome WebDriver.

    The options request headless mode and set the page load strategy to
    'none'. The driver service is built from whatever
    ``ChromeDriverManager().install()`` returns.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        quitting it.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.page_load_strategy = 'none'
    chrome_options.add_argument("--headless")

    installed_driver = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(installed_driver), options=chrome_options)

def get_info(url, func):
    """Load ``url`` in a fresh browser and hand the parsed page to a scraper.

    Args:
        url: Address of the page to open.
        func: Selects what to do with the page. 'get_list_data' parses the
            question listing and returns it; 'get_data_info' extracts one
            question's details and writes them to disk. Any other value
            does nothing.

    Returns:
        The list produced by ``getting_list_data`` when ``func`` is
        'get_list_data'; an empty list in every other case.
    """
    driver = setup_driver()
    try:
        driver.implicitly_wait(3)
        driver.get(url)
        # Fixed pause before reading the page source, so the page has some
        # time to render (the driver is configured not to wait for loading).
        time.sleep(3)
        html = driver.page_source
    finally:
        # Always shut the browser down, even when navigation fails; otherwise
        # every failed call would leave a Chrome process running.
        driver.quit()

    # The HTML is a plain string by now, so parsing no longer needs the browser.
    soup = BeautifulSoup(html, 'html.parser')

    data = []
    if func == 'get_list_data':
        data = getting_list_data(soup)
    elif func == 'get_data_info':
        gettting_data_info(url, soup)

    return data

def getting_list_data(soup):
    """Collect one summary dict per question link found in a listing page.

    Args:
        soup: Parsed listing page; it is searched for ``<a>`` elements
            matching the class string 'group contents'.

    Returns:
        A list of dicts with the keys 'link', 'title', 'company',
        'difficulty' and 'question_type', in page order. Slashes in the
        title are replaced by dashes and question marks are removed.
    """
    site_root = 'https://platform.stratascratch.com'

    def summarise(anchor):
        # The fields are read by position from the <div> elements nested
        # inside the anchor: 1 = company, 2 = difficulty, 3 = type, 4 = title.
        divs = anchor.find_all('div')
        return {
            'link': site_root + anchor.get('href'),
            'title': divs[4].getText().replace('/', '-').replace('?', ''),
            'company': divs[1].getText(),
            'difficulty': divs[2].getText(),
            'question_type': divs[3].getText(),
        }

    return [summarise(anchor) for anchor in soup.find_all('a', 'group contents')]

def gettting_data_info(url, soup):
    """Extract one question's details from its page and save them as JSON.

    The record is written to ``non_coding_data/<title>.json``, where the
    title has slashes replaced by dashes, question marks removed and, for
    the file name only, spaces replaced by dashes.

    Args:
        url: Address of the question page; stored under 'link'.
        soup: Parsed question page.

    Returns:
        None. The result is only written to disk.
    """
    title = soup.find('h1', 'QuestionMetadata__h1').getText().replace('/', '-').replace('?', '')
    metadata_blocks = soup.find_all('div', 'QuestionMetadata__metadata')

    # Non-empty texts of the first metadata block's direct children.
    stripped_texts = (child.get_text(strip=True) for child in metadata_blocks[0].children)
    fields = [text for text in stripped_texts if text]

    # Fields are addressed from the end of the list: everything before the
    # last four entries is stored as the company list.
    questionare = {
        'link': url,
        'title': title,
        'company': fields[0:len(fields) - 4],
        'difficulty': fields[-4],
        'question_type': fields[-3],
        'id': fields[-2],
        'description': soup.find('div', 'QuestionMetadata__question').find('p').getText(),
        'solution': "",
    }

    json_dump('non_coding_data/' + title.replace(' ', '-') + '.json', questionare)

def init_process():
    """Scrape listing pages 1 through 4 and save the combined question list.

    Each page is requested with 100 entries ordered by difficulty. The
    concatenated results are written to ``non_coding_question_list.json``.
    """
    per_page_results = [
        get_info(
            f"https://platform.stratascratch.com/technical?page_size=100&order_field=difficulty&page={i}",
            'get_list_data',
        )
        for i in range(1, 5)
    ]
    # Flatten the per-page lists into one list, keeping page order.
    list_question = [entry for page in per_page_results for entry in page]

    json_dump('non_coding_question_list.json', list_question)

def json_dump(file, data):
    """Serialise ``data`` as JSON into ``file``, replacing previous content.

    Args:
        file: Path of the file to write.
        data: JSON-serialisable object.
    """
    with open(file, mode='w+') as sink:
        json.dump(data, fp=sink)

# Run the listing scrape now; init_process writes non_coding_question_list.json.
init_process()


In [35]:
algo_list = []

# Absolute path strings of the files already present in non_coding_data/.
folder_path = Path.cwd() / 'non_coding_data'
file_list = list(map(str, filter(Path.is_file, folder_path.iterdir())))

with open('non_coding_question_list.json', 'r+') as list_non_coding:
    listed_questions = json.load(list_non_coding)

for la in listed_questions:
    # Path the detail file for this question would have.
    file_info_path = la['title'].replace(' ', '-') + '.json'
    whole_file_path_title = str(folder_path / file_info_path)
    if whole_file_path_title in file_list:
        continue
    # No detail file saved for this question yet. The fetch below is
    # switched off, so this cell currently changes nothing on disk.
    #get_info(la['link'],'get_data_info')

In [None]:
def insert_cell_info(value, heading_type, type, is_many_source=False):
    """Build one notebook cell as a plain dict.

    Args:
        value: Cell text. For markdown cells with ``is_many_source`` set it
            may also be a list of strings, which are concatenated.
        heading_type: Markdown prefix put in front of ``value``. It is only
            used for markdown cells when ``is_many_source`` is false.
        type: 'markdown' or 'code'.
        is_many_source: When true, a markdown cell takes ``value`` as its
            whole source and ``heading_type`` is ignored.

    Returns:
        The cell dict. An unrecognised ``type`` yields an empty dict.
    """
    cell = {}
    if type == 'markdown':
        cell['cell_type'] = "markdown"
        cell['metadata'] = {}
        if not is_many_source:
            cell['source'] = [heading_type + value]
        elif isinstance(value, list):
            cell['source'] = [''.join(value)]
        else:
            cell['source'] = [value]
    elif type == 'code':
        cell['cell_type'] = "code"
        cell['execution_count'] = None
        # Must be a dict. A trailing comma here used to turn it into a
        # one-element tuple, which was written to the notebook as `[{}]`.
        cell['metadata'] = {}
        # nbformat v4 lists `outputs` among the required fields of a code cell.
        cell['outputs'] = []
        cell['source'] = [value.strip()]

    return cell

def _read_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path, 'r+') as f:
        return json.load(f)


# Every regular file directly inside non_coding_data/ is loaded as JSON.
folder_path = Path.cwd() / 'non_coding_data'
file_list = [str(entry) for entry in folder_path.iterdir() if entry.is_file()]

visualization_json_list = [_read_json(file_info) for file_info in file_list]


# Markdown prefix placed in front of each plain-text question field.
_FIELD_HEADINGS = {
    'title': '# ',
    'link': '### **Link:** ',
    'id': '##### **ID:** ',
    'difficulty': '##### **Difficulty:** ',
    'description': '##### ',
}


def _empty_notebook():
    """Return a new notebook dict with no cells and Python 3 kernel metadata."""
    return {
        'cells': [],
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "codemirror_mode": {
                    "name": "ipython",
                    "version": 3
                },
                "file_extension": ".py",
                "mimetype": "text/x-python",
                "name": "python",
                "nbconvert_exporter": "python",
                "pygments_lexer": "ipython3",
                "version": "3.11.4"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 2
    }


def _question_cells(question):
    """Build the cells for one question, in the key order of ``question``.

    Keys without a rule here (for example 'question_type') produce no cell.
    """
    built = []
    for field, content in question.items():
        if field in _FIELD_HEADINGS:
            built.append(insert_cell_info(content, _FIELD_HEADINGS[field], 'markdown'))
        elif field == 'company':
            # The company field is a list; show it as one comma-separated line.
            built.append(insert_cell_info(', '.join(content), '##### **Company:** ', 'markdown'))
        elif field == 'solution':
            # A heading cell followed by a code cell holding the solution text.
            built.append(insert_cell_info('', '## **Solution:** ', 'markdown'))
            built.append(insert_cell_info(content, '', 'code'))
    return built


ipynb_payload = []
for visual_code_info in visualization_json_list:
    ipynb_payload_info = _empty_notebook()
    ipynb_payload_info['cells'].extend(_question_cells(visual_code_info))
    # The target file name travels with the payload; it is the question title
    # (empty when the record has no 'title' key) plus the notebook extension.
    ipynb_payload_info['name_of_the_file'] = visual_code_info.get('title', "") + '.ipynb'
    ipynb_payload.append(ipynb_payload_info)



# Write each assembled notebook to non_coding_notebook/<name_of_the_file>.
output_dir = Path('non_coding_notebook')
# open() does not create missing folders, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

for ip in ipynb_payload:
    # 'name_of_the_file' is bookkeeping for this script only. The nbformat v4
    # schema does not allow extra top-level keys, so it is left out of the
    # document that gets written.
    notebook = {key: value for key, value in ip.items() if key != 'name_of_the_file'}
    with open(output_dir / ip['name_of_the_file'], 'w', encoding='utf-8') as ipynb_f:
        json.dump(notebook, ipynb_f, indent=4)