In [23]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup
import time
import json

# Listing page used by the first scrape; the query string asks for
# page_size=100 entries ordered by the "difficulty" field.
url = "https://platform.stratascratch.com/visualizations?code_type=2&page_size=100&order_field=difficulty"

In [13]:
def setup_driver():
    """Create a headless Chrome WebDriver.

    The page load strategy is set to 'none', so callers are responsible for
    waiting until the page content they need is available.

    Returns:
        The ``webdriver.Chrome`` instance, using the driver executable
        reported by ``ChromeDriverManager().install()``.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.page_load_strategy = 'none'

    driver_path = ChromeDriverManager().install()
    chrome_service = Service(driver_path)
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def scraping_data(url, func):
    """Load ``url`` in a headless browser and pass the parsed page to a scraper.

    Args:
        url: Address of the page to load.
        func: Name of the scraper to run. ``'get_list_data'`` calls
            ``getting_list_data(soup)``; ``'get_data_info'`` calls
            ``getting_data_info(url, soup)``.

    Raises:
        ValueError: If ``func`` is not one of the two supported names.
    """
    if func not in ('get_list_data', 'get_data_info'):
        # Fail before launching a browser instead of silently doing nothing.
        raise ValueError(
            f"unknown scraper {func!r}; expected 'get_list_data' or 'get_data_info'"
        )

    driver = setup_driver()
    try:
        driver.implicitly_wait(4)
        driver.get(url)
        # setup_driver() selects page_load_strategy 'none', so get() does not
        # wait for the page to finish loading; give it a fixed grace period.
        time.sleep(4)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if func == 'get_list_data':
            getting_list_data(soup)
        else:
            getting_data_info(url, soup)
    finally:
        # Always release the browser, even when loading or parsing fails.
        driver.quit()

def getting_list_data(soup):
    """Collect one record per listing anchor and save them as JSON.

    Every ``<a>`` matched by the class filter 'group contents' yields a dict
    with its absolute link plus the difficulty, id and title read from the
    anchor's second, third and fourth ``<div>``. The records are written to
    'visualization_list.json' through ``json_dump``.

    Args:
        soup: Parsed listing page.
    """
    base_url = 'https://platform.stratascratch.com'
    entries = []
    for anchor in soup.find_all('a', 'group contents'):
        divs = anchor.find_all('div')
        entries.append({
            'link': base_url + anchor.get('href'),
            'difficulty': divs[1].find('span').getText(),
            'id': divs[2].getText(),
            'title': divs[3].getText(),
        })

    json_dump('visualization_list.json', entries)

def getting_data_info(link, soup):
    """Extract one question's metadata from its page and save it as JSON.

    The record holds the link, the title (``h1.QuestionMetadata__h1``), the
    first five ``<span>`` texts of the first ``QuestionMetadata__metadata``
    block (difficulty, id, type, analysis, data_type), the first paragraph of
    ``QuestionMetadata__question`` as the description, and empty 'data' and
    'solution' placeholders. It is written through ``json_dump`` to
    'visualization_notebook/<title>.json'.

    Args:
        link: URL of the question page; stored in the record.
        soup: Parsed question page.
    """
    title = soup.find('h1', 'QuestionMetadata__h1').getText()

    metadata_blocks = soup.find_all('div', 'QuestionMetadata__metadata')
    spans = metadata_blocks[0].find_all('span')

    question = {
        'link': link,
        'title': title,
        'difficulty': spans[0].getText(),
        'id': spans[1].getText(),
        'type': spans[2].getText(),
        'analysis': spans[3].getText(),
        'data_type': spans[4].getText(),
        'description': soup.find('div', 'QuestionMetadata__question').find('p').getText(),
        'data': {},
        'solution': "",
    }

    # The title becomes a file name: besides spaces, replace path separators,
    # otherwise a title such as "A/B ..." would point into a subdirectory
    # that does not exist.
    safe_title = title
    for char in (' ', '/', '\\'):
        safe_title = safe_title.replace(char, '-')

    json_dump('visualization_notebook/' + safe_title + '.json', question)

def json_dump(file, data):
    """Serialize ``data`` as JSON into ``file``, replacing existing content.

    Args:
        file: Destination path. Missing parent directories are created.
        data: JSON-serializable object.
    """
    import os  # local import keeps this definition self-contained

    parent = os.path.dirname(file)
    if parent:
        # e.g. 'visualization_notebook/' may not exist on a fresh checkout.
        os.makedirs(parent, exist_ok=True)

    with open(file, 'w', encoding='utf-8') as f:
        json.dump(data, f)

# Scrape the listing page with the 'get_list_data' scraper.
scraping_data(url=url, func='get_list_data')

In [32]:
visual_list = []  # NOTE(review): never populated in the visible cells

# Load the saved listing read-only and close the file before the slow
# per-page scraping starts; nothing is written back to it.
with open('visualization_list.json', 'r') as list_viz:
    visualizations = json.load(list_viz)

for viz in visualizations:
    scraping_data(viz['link'], 'get_data_info')

In [72]:
# Paste the code to encode between the triple quotes; it is emitted as a
# compact one-line JSON object under the "source_code" key.
source_code = """
"""

data = dict(source_code=source_code)
json_string = json.dumps(data, separators=(',', ':'))
print(json_string)

{"source_code":"import matplotlib.pyplot as plt\nimport plotly.express as px\n\ndf['cumulative'] = df['amount'].cumsum()\ndf['color'] = df['amount'].apply(lambda x: 'green' if x >= 0 else 'red')\nplt.figure(figsize=(10, 6))\nbars = plt.bar(df['category'], df['amount'], color=df['color'])\nfor i in range(1, len(df)):\n    plt.plot([i-1, i], [df['cumulative'][i-1], df['cumulative'][i]], color='black')\n    \nplt.bar('Net Profit', df['cumulative'].iloc[-1], color='blue')\nplt.plot([len(df)-1, len(df)], [df['cumulative'].iloc[-2], df['cumulative'].iloc[-1]], color='black')\n\nplt.title('Step-by-Step Calculation of Net Profit for a Business Quarter')\nplt.xlabel('Category')\nplt.ylabel('Amount ($)')\nplt.grid(axis='y', linestyle='--', alpha=0.7)\nplt.show()\n"}


In [None]:
# Sample results-table markup (one row per store, one count column per age
# band). It uses the ResultsTable__header-cell / ResultsTable__row /
# ResultsTable__cell classes that the table-to-JSON cell selects on.
sc = ''' <table>
    <thead>
        <th class="ResultsTable__header-cell">store</th>
        <th class="ResultsTable__header-cell">teens</th>
        <th class="ResultsTable__header-cell">adults</th>
        <th class="ResultsTable__header-cell">seniors</th>
    </thead>
    <tbody>
        <tr class="ResultsTable__row">
            <td class="ResultsTable__cell">Store A</td>
            <td class="ResultsTable__cell">89</td>
            <td class="ResultsTable__cell">237</td>
            <td class="ResultsTable__cell"> 55</td>
        </tr>
        <tr class="ResultsTable__row">
            <td class="ResultsTable__cell">Store B</td>
            <td class="ResultsTable__cell">156</td>
            <td class="ResultsTable__cell">230</td>
            <td class="ResultsTable__cell">98</td>
        </tr>
        <tr class="ResultsTable__row">
            <td class="ResultsTable__cell">Store C</td>
            <td class="ResultsTable__cell">165</td>
            <td class="ResultsTable__cell">169</td>
            <td class="ResultsTable__cell">66</td>
        </tr>
        <tr class="ResultsTable__row">
            <td class="ResultsTable__cell">Store D</td>
            <td class="ResultsTable__cell">199</td>
            <td class="ResultsTable__cell">215</td>
            <td class="ResultsTable__cell">30</td>
        </tr>
        <tr class="ResultsTable__row">
            <td class="ResultsTable__cell">Store E</td>
            <td class="ResultsTable__cell">97</td>
            <td class="ResultsTable__cell">277</td>
            <td class="ResultsTable__cell">30</td>
        </tr>
    </tbody>
</table>
 '''

In [73]:
# Paste the HTML of a results table between the quotes.
sc = ''' 

 '''

soup = BeautifulSoup(sc, "html.parser")

# Strip surrounding whitespace so padded markup such as "<td> 55</td>" does
# not leak spaces into the JSON keys or values.
headers = [
    th.getText().strip()
    for th in soup.find_all('th', 'ResultsTable__header-cell')
]

# One list of cell texts per table row, in document order.
data = [
    [td.getText().strip() for td in tr.find_all('td', 'ResultsTable__cell')]
    for tr in soup.find_all('tr', 'ResultsTable__row')
]

# Pair each row with the header names; values stay strings.
json_data = [dict(zip(headers, row)) for row in data]
json_output = json.dumps(json_data, indent=4)

print(json_output)
 

[
    {
        "category": "Revenue",
        "amount": "150000"
    },
    {
        "category": "COGS",
        "amount": "-70000"
    },
    {
        "category": "Operating Expenses",
        "amount": "-20000"
    },
    {
        "category": "Other Income",
        "amount": "5000"
    },
    {
        "category": "Taxes",
        "amount": "-15000"
    }
]


In [None]:
import pandas as pd
import numpy as np

np.random.seed(0)

under_25 = 25  # Ages 0 to 24
between_25_50 = 26  # Ages 25 to 50
over_50 = 30  # Ages 51 to 80

total_elements = under_25 + between_25_50 + over_50

# Each band receives one third of the probability mass, spread evenly over
# the ages it contains.
p_under_25 = 1/3 / under_25
p_25_50 = 1/3 / between_25_50
p_over_50 = 1/3 / over_50

age_values = [*range(0, 25), *range(25, 51), *range(51, 81)]
age_weights = [p_under_25]*under_25 + [p_25_50]*between_25_50 + [p_over_50]*over_50
# The band sizes above must match the ranges, or the weights would misalign.
assert len(age_values) == len(age_weights) == total_elements

ages = np.random.choice(a=age_values, size=1000, p=age_weights)


def determine_age_group(age):
    """Return the band label for ``age``: 'Under 25', '25-50' or 'Over 50'."""
    if age < 25:
        return 'Under 25'
    if age <= 50:
        return '25-50'
    return 'Over 50'


age_groups = [determine_age_group(age) for age in ages]
df = pd.DataFrame({
    'age': ages,
    'age_group': age_groups
})

# Convert NumPy integers to plain ints so the records print as bare numbers
# and can be passed to json.dumps.
raw_data = [
    {"age": int(age), "age_group": group}
    for age, group in zip(ages, age_groups)
]

print(raw_data)



In [None]:
import pandas as pd 
dff = pd.read_csv('elevation_data_2500.csv')

# to_json already returns a JSON string (a list of row objects), so it is
# printed as is. Encoding that string again with json.dumps and decoding it
# with json.loads only reproduced the same text, and storing it under the
# name `list` hid the builtin for every later cell.
# For indented output use: json.dumps(json.loads(result), indent=4)
result = dff.to_json(orient='records')

print(result)
