# Getting raw HTML data

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import pprint
import re
import pymongo
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment.
load_dotenv()

# Root of every LeetCode problem page; a problem slug is appended to it.
LEETCODE_BASE_URL = "https://leetcode.com/problems"

def get_html(problem_name, timeout=10):
  """Fetch a LeetCode problem page and extract the problem statement HTML.

  The page embeds its data as JSON inside the ``<script id="__NEXT_DATA__">``
  tag. The statement is the ``content`` field of the first dehydrated query
  whose data holds a ``question`` entry with non-null ``content``.

  Args:
    problem_name: Problem slug used in the URL, e.g. ``"count-and-say"``.
    timeout: Seconds to wait for the server before ``requests`` gives up.

  Returns:
    A ``(response, html)`` tuple. ``response`` is the ``requests`` response
    for the page; ``html`` is the statement HTML string, or ``None`` when it
    could not be located in the page.
  """
  url = f"{LEETCODE_BASE_URL}/{problem_name}/"

  # Without a timeout, requests can block indefinitely on a stalled server.
  response = requests.get(url, timeout=timeout)

  soup = BeautifulSoup(response.text, "html.parser")
  next_data = soup.find("script", id="__NEXT_DATA__")

  # .string is None for an empty tag; there is nothing to parse in that case.
  if next_data is None or not next_data.string:
    return response, None

  try:
    page_data = json.loads(next_data.string)
    queries = page_data['props']['pageProps']['dehydratedState']['queries']
  except (ValueError, KeyError, TypeError):
    # Malformed JSON or an unexpected page layout: report "no html found".
    return response, None

  for query in queries:
    # Any level may be missing or None, so walk down defensively.
    state = query.get('state') if isinstance(query, dict) else None
    data = state.get('data') if isinstance(state, dict) else None
    question = data.get('question') if isinstance(data, dict) else None
    if isinstance(question, dict) and question.get('content') is not None:
      return response, question['content']

  return response, None

### Getting Python & JavaScript code snippets

In [None]:
def get_code_snippets(problem_name, timeout=10, languages=("Python3", "JavaScript")):
  """Fetch the starter-code snippets of a LeetCode problem.

  The snippets are read from the JSON embedded in the page's
  ``<script id="__NEXT_DATA__">`` tag. The dehydrated queries are searched
  for the first one whose ``question`` carries a non-null ``codeSnippets``
  list, rather than relying on a fixed position in the list.

  Args:
    problem_name: Problem slug used in the URL, e.g. ``"count-and-say"``.
    timeout: Seconds to wait for the server before ``requests`` gives up.
    languages: Values of the snippet ``lang`` field to keep.

  Returns:
    A list of snippet dicts whose ``lang`` is in ``languages``, or ``""``
    when the page exposes no code snippets.
  """
  url = f"{LEETCODE_BASE_URL}/{problem_name}/"

  # Without a timeout, requests can block indefinitely on a stalled server.
  response = requests.get(url, timeout=timeout)

  soup = BeautifulSoup(response.text, "html.parser")
  next_data = soup.find("script", id="__NEXT_DATA__")

  # .string is None for an empty tag; there is nothing to parse in that case.
  if next_data is None or not next_data.string:
    return ""

  try:
    page_data = json.loads(next_data.string)
    queries = page_data['props']['pageProps']['dehydratedState']['queries']
  except (ValueError, KeyError, TypeError):
    # Malformed JSON or an unexpected page layout: no snippets available.
    return ""

  for query in queries:
    # Any level may be missing or None, so walk down defensively.
    state = query.get('state') if isinstance(query, dict) else None
    data = state.get('data') if isinstance(state, dict) else None
    question = data.get('question') if isinstance(data, dict) else None
    snippets = question.get('codeSnippets') if isinstance(question, dict) else None
    if snippets is not None:
      return [snippet for snippet in snippets if snippet.get('lang') in languages]

  return ""

### Getting problem explanation

In [None]:
def get_problem_description(html):
  """Extract the description text and first image from a problem statement.

  The description is the part of the statement that precedes the first
  ``<strong class="example">`` marker. When the statement has no such
  marker, the whole statement is treated as the description.

  Args:
    html: Problem statement HTML, or ``None``.

  Returns:
    ``""`` when ``html`` is ``None``. Otherwise a dict with
    ``"description_text"`` (the description with tags stripped) and
    ``"description_img_source"`` (the ``src`` of the first ``<img>`` in the
    description, or ``None``).
  """
  if html is None:
    return ""

  # Non-greedy capture of everything before the first example marker.
  before_examples = re.match(r"(.+?)<strong class=.example.", html, re.DOTALL)
  # No marker means no examples section to cut off, so keep the full text.
  description_html = before_examples.group(1) if before_examples else html

  soup = BeautifulSoup(description_html, 'html.parser')

  image_element = soup.find('img')
  # .get avoids a KeyError on an <img> tag that has no src attribute.
  image_src = image_element.get('src') if image_element is not None else None

  return {"description_text": soup.get_text(), "description_img_source": image_src}


### Getting examples

In [None]:
def get_examples(html):
  """Collect the text and image of every example in a problem statement.

  Each element with class ``example`` marks the start of one example. The
  first ``<pre>`` and first ``<img>`` found between that marker and the next
  marker belong to it, so an example without an image does not pick up the
  image of a later example.

  Args:
    html: Problem statement HTML, or ``None``.

  Returns:
    ``""`` when ``html`` is ``None``. Otherwise a list with one dict per
    example, holding ``"example_text"`` (text of its ``<pre>``, or ``""``
    when it has none) and ``"example_img_source"`` (``src`` of its image, or
    ``None``).
  """
  if html is None:
    return ""

  soup = BeautifulSoup(html, 'html.parser')
  markers = soup.find_all(class_="example")

  result = []
  for index, marker in enumerate(markers):
    # The next marker bounds this example; the last one runs to the end.
    next_marker = markers[index + 1] if index + 1 < len(markers) else None

    pre_element = None
    image_element = None
    for node in marker.next_elements:
      if node is next_marker:
        break
      # Text nodes have no tag name, so they are skipped by the checks below.
      tag_name = getattr(node, "name", None)
      if tag_name == "pre" and pre_element is None:
        pre_element = node
      elif tag_name == "img" and image_element is None:
        image_element = node

    example_text = pre_element.text if pre_element is not None else ""
    # .get avoids a KeyError on an <img> tag that has no src attribute.
    img_src = image_element.get('src') if image_element is not None else None

    result.append({"example_text": example_text, "example_img_source": img_src})

  return result


### Getting constraints

In [None]:
def get_constraints(html):
    """Return the plain text that follows the "Constraints:" heading.

    Everything after ``<p><strong>Constraints:</strong></p>`` up to the end
    of the statement is taken and its HTML tags are stripped.

    Args:
        html: Problem statement HTML, or ``None``.

    Returns:
        The tag-free text after the heading, or ``""`` when ``html`` is
        ``None`` or contains no such heading.
    """
    if html is None:
        return ""

    found = re.search(r"<p><strong>Constraints:</strong></p>(.*)", html, re.DOTALL)
    if found is None:
        return ""

    trailing_html = found.group(1)
    return BeautifulSoup(trailing_html, 'html.parser').get_text()

In [None]:
res, htm = get_html("count-and-say")
examples = get_examples(htm)

# get_problem_description returns a dict (or "" when there is no HTML), so the
# fields are read by key; tuple-unpacking the dict would bind its key names.
description = get_problem_description(htm)
descrip = description["description_text"] if description else None
img_url = description["description_img_source"] if description else None
print(img_url)


### Adding a camelCase name field (stored as `kebab_case`) to every document

In [None]:
from re import sub

def camel_case(s):
  """Convert a snake_case, kebab-case or space-separated name to camelCase.

  Runs of ``_`` and ``-`` are treated as word separators. Each word is
  title-cased with ``str.title`` (so the rest of the word is lower-cased),
  the words are joined, and the first character is lower-cased.

  Args:
    s: The name to convert.

  Returns:
    The camelCase form of ``s``, or ``""`` when ``s`` is empty or consists
    only of separators.
  """
  joined = re.sub(r"(_|-)+", " ", s).title().replace(" ", "")
  # Nothing left to index when the input was empty or only separators.
  if not joined:
    return ""
  return joined[0].lower() + joined[1:]

from urllib.parse import quote_plus

# Credentials are read from the environment; a missing variable raises KeyError.
MONGO_USER = os.environ['MONGO_USER']
MONGO_PASSWORD  = os.environ['MONGO_PASSWORD']

# Reserved characters (e.g. "@", ":", "/") in the user name or password must be
# percent-escaped inside a MongoDB URI, otherwise the URI is parsed incorrectly.
# NOTE(review): assumes the .env values are stored unescaped — confirm.
MONGO_BASE_URL = (
  f"mongodb+srv://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASSWORD)}"
  "@cluster0.pn9un82.mongodb.net/?retryWrites=true&w=majority"
)

# Connect to the server
client = pymongo.MongoClient(MONGO_BASE_URL)

# Get a reference to a database
db = client['kokolearn']

# Get a reference to a collection
collection = db['questions']

# Specify the projection to include only the name field
projection = {"name": 1}

# Find all documents in the collection
cursor = collection.find({}, projection)

# Collect every name first so the cursor is exhausted before any update runs.
lst = [document["name"] for document in cursor]

for name in lst:
  # Select the document to update by its name (named to avoid shadowing the
  # builtin `filter`).
  name_filter = {"name": name}

  # Store the camelCase form of the name under the "kebab_case" field
  update = { "$set": {"kebab_case": camel_case(name)} }

  # Update the document
  result = collection.update_one(name_filter, update)

  # Print how many documents were modified by this update (0 or 1)
  print(result.modified_count)

In [6]:
# Rename the "kebab_case" field to "camelCaseName" on every document; the empty
# filter matches the whole collection.
collection.update_many(
  filter={},
  update={"$rename": {"kebab_case": "camelCaseName"}},
)

<pymongo.results.UpdateResult at 0x1097e7a00>