In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup
import time
import json

url = "https://platform.stratascratch.com/visualizations?code_type=2&page_size=100&order_field=difficulty"

In [6]:
def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.page_load_strategy = 'none'
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def scraping_data(url, func):
    driver = setup_driver()
    driver.implicitly_wait(4)
    driver.get(url)
    time.sleep(4)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    if func == 'get_list_data':
        getting_list_data(soup)
    elif func == 'get_data_info':
        getting_data_info(url, soup)

    driver.quit()

def getting_list_data(soup):
    data = []
    for list in soup.find_all('a', 'group contents'):
        details = list.find_all('div')
        info = {}
        info['link'] = 'https://platform.stratascratch.com' + list.get('href')
        info['difficulty'] = details[1].find('span').getText()
        info['id'] = details[2].getText()
        info['title'] = details[3].getText()
        data.append(info)

    json_dump('visualization_list.json', data)  

def getting_data_info(link,soup):
    questionare = {}
    questionare['link'] = link
    questionare['title'] = soup.find('h1','QuestionMetadata__h1').getText()
    
    question_des_list = soup.find_all('div', 'QuestionMetadata__metadata')
    question_meta =  question_des_list[0].find_all('span')
    questionare['difficulty'] = question_meta[0].getText()
    questionare['id'] = question_meta[1].getText()
    questionare['type'] = question_meta[2].getText()
    questionare['analysis'] = question_meta[3].getText()
    questionare['data_type'] = question_meta[4].getText()
    questionare['description'] = soup.find('div','QuestionMetadata__question').find('p').getText()
    questionare['data'] = {}
    questionare['solution'] = ""

    file_title = 'visualization_info_file/' + questionare['title'].replace(' ', '-') + '.json'
    json_dump(file_title, questionare)

def json_dump(file, data):
    with open(file, 'w+') as f:
        json.dump(data, f)

#scraping_data(url,'get_list_data')

In [7]:
visual_list = []
with open('visualization_list.json', 'r+') as list_viz:
    for viz in json.load(list_viz):
        pass
        #scraping_data(viz['link'],'get_data_info')

In [12]:
source_code = """import matplotlib.pyplot as plt
import matplotlib.dates as mdates

df['start'] = pd.to_datetime(df['start'])
df['finish'] = pd.to_datetime(df['finish'])

color_map = {'Planning': 'salmon', 'Development': 'lightgreen', 'Testing': 'lightskyblue'}
fig, ax = plt.subplots(figsize=(12, 6))

for i, task in df.iterrows():
    ax.barh(task['task'], (task['finish'] - task['start']).days, left=task['start'], color=color_map[task['phase']])

ax.set_xlabel('Date')
ax.set_ylabel('Task')
ax.set_title('Gantt Chart for Software Project Deployment Phases')

ax.xaxis_date()
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(rotation=45)

plt.show()
"""

data = {
    "source_code": source_code
}

json_string = json.dumps(data, separators=(',', ':'))

print(json_string)

{"source_code":"import matplotlib.pyplot as plt\nimport matplotlib.dates as mdates\n\ndf['start'] = pd.to_datetime(df['start'])\ndf['finish'] = pd.to_datetime(df['finish'])\n\ncolor_map = {'Planning': 'salmon', 'Development': 'lightgreen', 'Testing': 'lightskyblue'}\nfig, ax = plt.subplots(figsize=(12, 6))\n\nfor i, task in df.iterrows():\n    ax.barh(task['task'], (task['finish'] - task['start']).days, left=task['start'], color=color_map[task['phase']])\n\nax.set_xlabel('Date')\nax.set_ylabel('Task')\nax.set_title('Gantt Chart for Software Project Deployment Phases')\n\nax.xaxis_date()\nax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))\nplt.xticks(rotation=45)\n\nplt.show()\n"}


In [134]:
tabular_data = """
region product_type   sales
North America   Electronics  100000
North America       Apparel   50000
North America           oys   25000
Europe          Electronics   80000
Europe              Apparel   40000
Europe                 Toys   20000
Asia            Electronics   90000
Asia                Apparel   30000
Asia                   Toys   10000

"""

lines = tabular_data.strip().split('\n')
parsed_data = [line.split() for line in lines]
header = parsed_data[0]
records = parsed_data[1:]
result = [{header[i]: row[i] if i > 0 else row[i] for i in range(len(header))} for row in records]

print(result)



[{'region': 'North', 'product_type': 'America', 'sales': 'Electronics'}, {'region': 'North', 'product_type': 'America', 'sales': 'Apparel'}, {'region': 'North', 'product_type': 'America', 'sales': 'oys'}, {'region': 'Europe', 'product_type': 'Electronics', 'sales': '80000'}, {'region': 'Europe', 'product_type': 'Apparel', 'sales': '40000'}, {'region': 'Europe', 'product_type': 'Toys', 'sales': '20000'}, {'region': 'Asia', 'product_type': 'Electronics', 'sales': '90000'}, {'region': 'Asia', 'product_type': 'Apparel', 'sales': '30000'}, {'region': 'Asia', 'product_type': 'Toys', 'sales': '10000'}]


In [8]:
sc = ''' 
<table class="ResultsTable__table"><thead><tr class="ResultsTable__header-row"><th class="ResultsTable__header-cell">month</th><th class="ResultsTable__header-cell">change</th><th class="ResultsTable__header-cell">revenue</th></tr></thead><tbody><tr class="ResultsTable__row "><td class="ResultsTable__cell">January</td><td class="ResultsTable__cell">20000</td><td class="ResultsTable__cell">120000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">February</td><td class="ResultsTable__cell">-5000</td><td class="ResultsTable__cell">135000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">March</td><td class="ResultsTable__cell">15000</td><td class="ResultsTable__cell">165000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">April</td><td class="ResultsTable__cell">-7000</td><td class="ResultsTable__cell">188000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">May</td><td class="ResultsTable__cell">10000</td><td class="ResultsTable__cell">221000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">June</td><td class="ResultsTable__cell">5000</td><td class="ResultsTable__cell">259000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">July</td><td class="ResultsTable__cell">-3000</td><td class="ResultsTable__cell">294000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">August</td><td class="ResultsTable__cell">12000</td><td class="ResultsTable__cell">341000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">September</td><td class="ResultsTable__cell">-8000</td><td class="ResultsTable__cell">380000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">October</td><td class="ResultsTable__cell">15000</td><td class="ResultsTable__cell">434000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">November</td><td class="ResultsTable__cell">-4000</td><td class="ResultsTable__cell">484000</td></tr><tr class="ResultsTable__row "><td class="ResultsTable__cell">December</td><td class="ResultsTable__cell">17000</td><td class="ResultsTable__cell">551000</td></tr></tbody></table>
 '''

soup = BeautifulSoup(sc, "html.parser")
headers = [headText.getText() for headText in soup.find_all('th', 'ResultsTable__header-cell')]
data = []

def convert_to_number(value):
    try:
        # Try converting to integer
        return int(value)
    except ValueError:
        try:
            # If int fails, try converting to float
            return float(value)
        except ValueError:
            # If both conversions fail, return the original string
            return value
        

for row in soup.find_all('tr','ResultsTable__row'):
    cell = []
    for cells in row.find_all('td', 'ResultsTable__cell'):
        text_value = cells.getText().strip() 
        cell.append(convert_to_number(text_value)) 

    data.append(cell)

json_data = [dict(zip(headers, row)) for row in data]
json_output = json.dumps(json_data, indent=4)

print(json_output)
 

[
    {
        "month": "January",
        "change": 20000,
        "revenue": 120000
    },
    {
        "month": "February",
        "change": -5000,
        "revenue": 135000
    },
    {
        "month": "March",
        "change": 15000,
        "revenue": 165000
    },
    {
        "month": "April",
        "change": -7000,
        "revenue": 188000
    },
    {
        "month": "May",
        "change": 10000,
        "revenue": 221000
    },
    {
        "month": "June",
        "change": 5000,
        "revenue": 259000
    },
    {
        "month": "July",
        "change": -3000,
        "revenue": 294000
    },
    {
        "month": "August",
        "change": 12000,
        "revenue": 341000
    },
    {
        "month": "September",
        "change": -8000,
        "revenue": 380000
    },
    {
        "month": "October",
        "change": 15000,
        "revenue": 434000
    },
    {
        "month": "November",
        "change": -4000,
        "revenue": 484000
    }

In [31]:
from pathlib import Path

def insert_cell_info(value, heading_type, type, is_many_source=False):
    cell = {}
    if type == 'markdown':
        cell['cell_type'] = "markdown"
        cell['metadata'] = {}
        if not is_many_source:
            cell_title = heading_type + value
            cell['source'] = [cell_title]
        else:
            if isinstance(value, list):
                cell['source'] = [''.join(value)]  
            else:
                cell['source'] = [value] 
    elif type == 'code':
        cell['cell_type'] = "code"
        cell['execution_count'] = None
        cell['metadata'] = {},
        cell['source'] = [value.strip()]

    return cell


folder_path = Path.cwd() / 'visualization_info_file'
file_list = [str(file) for file in folder_path.iterdir() if file.is_file()]

visualization_json_list = []
for file_info in file_list:
    with open(file_info, 'r+') as f:
        visualization_json_list.append(json.load(f))

ipynb_payload = []
for visual_code_info in visualization_json_list:
    ipynb_payload_info = {
        'cells': [],
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "codemirror_mode": {
                    "name": "ipython",
                    "version": 3
                },
                "file_extension": ".py",
                "mimetype": "text/x-python",
                "name": "python",
                "nbconvert_exporter": "python",
                "pygments_lexer": "ipython3",
                "version": "3.11.4"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 2
    }

    cells = []
    filename = ""
    for key, value in visual_code_info.items():
        match key:
            case 'title':
                cells.append(insert_cell_info(value, '# ', 'markdown'))
                filename = value
            case 'link':
                cells.append(insert_cell_info(value, '### **Link:** ', 'markdown'))
            case 'id':
                cells.append(insert_cell_info(value, '##### **ID:** ', 'markdown'))
            case 'difficulty':
                cells.append(insert_cell_info(value, '##### **Difficulty:** ', 'markdown'))
            case 'description':
                cells.append(insert_cell_info(value, '##### ', 'markdown'))
            case 'type':
                cells.append(insert_cell_info(value, '##### **Type:** ', 'markdown'))
            case 'analysis':
                cells.append(insert_cell_info(value, '##### **Analysis:** ', 'markdown'))
            case 'data_type':
                cells.append(insert_cell_info(value, '##### **Data Type:** ', 'markdown'))
            case 'data':
                data_as_json_string = 'import pandas as pd \n\ndata = ' +json.dumps(value, indent=4) + '\n\ndf = pd.DataFrame(data)'
                cells.append(insert_cell_info('', '## **Data:** ', 'markdown'))
                cells.append(insert_cell_info(data_as_json_string, '', 'code'))
            case 'solution':
                cells.append(insert_cell_info('', '## **Solution:** ', 'markdown'))
                cells.append(insert_cell_info(value, '', 'code'))

    ipynb_payload_info['cells'].extend(cells)
    ipynb_payload_info['name_of_the_file'] = filename + '.ipynb'
    ipynb_payload.append(ipynb_payload_info)



In [32]:
for ip in ipynb_payload:
   with open ('visualization_notebook/' + ip['name_of_the_file'], 'w') as ipynb_f:
       json.dump(ip,ipynb_f, indent=4)