# Getting raw html data

In [17]:
import requests
from bs4 import BeautifulSoup
import json
import pprint
import re
import pymongo
from dotenv import load_dotenv
import os

load_dotenv()
LEETCODE_BASE_URL= "https://leetcode.com/problems"

def get_html(problem_name):
  # leetcode url 
  url = f"{LEETCODE_BASE_URL}/{problem_name}/"

  # Make a GET request to the website
  response = requests.get(url)

  # Parse the HTML content
  soup = BeautifulSoup(response.text, "html.parser")

  problem_description = soup.find("script", id="__NEXT_DATA__")
  html = None
  if problem_description:
    # contains all of the script tag, e.g. "jQuery(window)..."
    contents = problem_description.string
    
    # convert to a dictionary
    dict_contents = json.loads(str(contents))

    # drilling into the object 
    lst_of_objects = dict_contents['props']['pageProps']['dehydratedState']['queries']
    
    # filtering the list of objects
    result = [obj for obj in lst_of_objects if ('question' in obj['state']['data'] and 
                                                'content' in obj['state']['data']['question'] and 
                                                obj['state']['data']['question']['content'] is not None)]
    
    if len(result) > 0: 
      # drilling further into the object to find the html content
      html = result[0]['state']['data']['question']['content']
  return response, html 

### Getting Python & Javascript code snippets

In [18]:
def get_code_snippets(problem_name):
  # we dont care about the particular question
  url = f"{LEETCODE_BASE_URL}/{problem_name}/"
    
  # Make a GET request to the website
  response = requests.get(url)

  # Parse the HTML content
  soup = BeautifulSoup(response.text, "html.parser")
  
  problem_description = soup.find("script", id="__NEXT_DATA__")

  code_snippets = None
  if problem_description:
    # contains all of the script tag, e.g. "jQuery(window)..."
    contents = problem_description.string
    dict_contents = json.loads(str(contents))
    

    list_of_starter_codes = (dict_contents['props']['pageProps']['dehydratedState']['queries'][2]['state']
                            ['data']['question']['codeSnippets'])

    if list_of_starter_codes is not None:

      code_snippets = [obj for obj in list_of_starter_codes if (obj['lang'] == 'Python3' or obj['lang'] == 'JavaScript')]

      return code_snippets
    return ""

### Getting problem explanation

In [19]:
def get_problem_description(html):
  if html is not None:
    result = re.match(r"(.+?)<strong class=.example.", html, re.DOTALL).group(1)

    soup = BeautifulSoup(result, 'html.parser')

    # Extract all text from the HTML and remove the tags
    text = soup.get_text()

    return text
  return ""


### Getting examples

In [20]:
def get_examples(html):
  if html is not None:
    soup = BeautifulSoup(html, 'html.parser')

    # Find all elements with the class "example"
    examples = soup.find_all(class_="example")

    result = []
    for example in examples:
        # Find the next sibling element (pre element in this case)
        pre_element = example.findNext("pre")
        # Get the text from the pre element
        example_text = pre_element.text
        result.append(example_text)

    return result
  return ""


### Getting constraints

In [21]:
def get_constraints(html):
    if html is not None:
        constraints = re.search(r"<p><strong>Constraints:</strong></p>(.*)", html, re.DOTALL)
        if constraints:
            constraints = constraints.group(1)
            soup = BeautifulSoup(constraints, 'html.parser')
            text = soup.get_text()
            return text
    return ""

### Getting problem names

In [22]:
names = [
"Contains Duplicate",
"Valid Anagram",
"Two Sum",
"Group Anagrams",
"Top K Frequent Elements",
"Product of Array Except Self",
"Valid Sudoku",
"Encode And Decode Strings",	
"Longest Consecutive Sequence",
"Valid Palindrome",
"Two Sum II Input Array Is Sorted",
"3Sum",
"Container With Most Water",
"Trapping Rain Water",
"Best Time to Buy And Sell Stock",
"Longest Substring Without Repeating Characters",
"Longest Repeating Character Replacement",
"Permutation In String",
"Minimum Window Substring",
"Sliding Window Maximum",
"Valid Parentheses",
"Min Stack",
"Evaluate Reverse Polish Notation",
"Generate Parentheses",
"Daily Temperatures",
"Car Fleet",
"Largest Rectangle In Histogram",
"Binary Search",
"Search a 2D Matrix",
"Koko Eating Bananas",
"Find Minimum In Rotated Sorted Array",
"Search In Rotated Sorted Array",
"Time Based Key Value Store",
"Median of Two Sorted Arrays",
"Reverse Linked List",
"Merge Two Sorted Lists",
"Reorder List",
"Remove Nth Node From End of List",
"Copy List With Random Pointer",
"Add Two Numbers",
"Linked List Cycle",
"Find The Duplicate Number",
"LRU Cache",
"Merge K Sorted Lists",
"Reverse Nodes In K Group",
"Invert Binary Tree",
"Maximum Depth of Binary Tree",
"Diameter of Binary Tree",
"Balanced Binary Tree",
"Same Tree",
"Subtree of Another Tree",
"Lowest Common Ancestor of a Binary Search Tree",
"Binary Tree Level Order Traversal",
"Binary Tree Right Side View",
"Count Good Nodes In Binary Tree",
"Validate Binary Search Tree",
"Kth Smallest Element In a Bst",
"Construct Binary Tree From Preorder And Inorder Traversal",
"Binary Tree Maximum Path Sum",
"Serialize And Deserialize Binary Tree",
"Implement Trie Prefix Tree",
"Design Add And Search Words Data Structure",
"Word Search II",
"Kth Largest Element In a Stream",
"Last Stone Weight",
"K Closest Points to Origin",
"Kth Largest Element In An Array",
"Task Scheduler,", 
"Design Twitter,", 
"Find Median From Data Stream,",
"Subsets,", 
"Combination Sum,", 
"Permutations,", 
"Subsets II,", 
"Combination Sum II,", 
"Word Search,", 
"Palindrome Partitioning,", 
"Letter Combinations of a Phone Number,", 
"N Queens,", 
"Number of Islands,", 
"Clone Graph,", 
"Max Area of Island,", 
"Pacific Atlantic Water Flow,", 
"Surrounded Regions,", 
"Rotting Oranges,", 
"Walls And Gates,", 
"Course Schedule,", 
"Course Schedule II,", 
"Redundant Connection,", 
"Number of Connected Components In An Undirected Graph,", 
"Graph Valid Tree,", 
"Word Ladder,", 
"Reconstruct Itinerary,", 
"Min Cost to Connect All Points,", 
"Network Delay Time,", 
"Swim In Rising Water,", 
"Alien Dictionary,",
"Cheapest Flights Within K Stops,", 
"Climbing Stairs,", 
"Min Cost Climbing Stairs,", 
"House Robber,", 
"House Robber II,", 
"Longest Palindromic Substring,", 
"Palindromic Substrings,", 
"Decode Ways,", 
"Coin Change,", 
"Maximum Product Subarray,", 
"Word Break,", 
"Longest Increasing Subsequence,", 
"Partition Equal Subset Sum,", 
"Unique Paths,", 
"Longest Common Subsequence,", 
"Best Time to Buy And Sell Stock With Cooldown,", 
"Coin Change II,", "Target Sum,", 
"Interleaving String,", 
"Longest Increasing Path In a Matrix,", 
"Distinct Subsequences,", 
"Edit Distance,", 
"Burst Balloons,", 
"Regular Expression Matching,", 
"Maximum Subarray,", 
"Jump Game,", 
"Jump Game II,", 
"Gas Station,", 
"Hand of Straights,",
"Merge Triplets to Form Target Triplet,", 
"Partition Labels,", 
"Valid Parenthesis String,", 
"Insert Interval,", 
"Merge Intervals,", 
"Non Overlapping Intervals,", 
"Meeting Rooms,", 
"Meeting Rooms II,", 
"Minimum Interval to Include Each Query,", 
"Rotate Image,", 
"Spiral Matrix,", 
"Set Matrix Zeroes,", 
"Happy Number,", 
"Plus One,", 
"Powx n", 
"Multiply Strings,", 
"Detect Squares,", 
"Single Number,", 
"Number of 1 Bits,", 
"Counting Bits,", 
"Reverse Bits,", 
"Missing Number,", 
"Sum of Two Integers,", 
"Reverse Integer"]

In [23]:
def to_kebab_case(s):
    # Replace any non-alphanumeric characters with hyphens
    s = re.sub(r'[^a-zA-Z0-9]', '-', s).strip("-")
    # Convert to lowercase
    s = s.lower()
    return s

kebab_names = [to_kebab_case(name) for name in names]



In [24]:
MONGO_USER = os.environ['MONGO_USER']
MONGO_PASSWORD  = os.environ['MONGO_PASSWORD']
print(MONGO_USER, MONGO_PASSWORD )
MONGO_BASE_URL = f"mongodb+srv://{MONGO_USER}:{MONGO_PASSWORD}@cluster0.pn9un82.mongodb.net/?retryWrites=true&w=majority"

# Connect to the server
client = pymongo.MongoClient(MONGO_BASE_URL)

# Get a reference to a database
db = client['kokolearn']

# Get a reference to a collection
collection = db['questions']

for kebab_name, name in zip(kebab_names, names):
  response, html = get_html(kebab_name)
  if response.status_code == 200:
    description = get_problem_description(html)
    examples = get_examples(html)
    constraint = get_constraints(html)
    code_snippets = get_code_snippets(kebab_name)
    #TODO: 
      # 1) get name 
      # 2) get description 
      # 3) get examples 
      # 4) get constraint
      # 5) get code snippets
    document = {
      "name": name,
      "description": description,
      "examples": examples,
      "constraint": constraint,
      "code_snippets": code_snippets
    }
    collection.insert_one(document)
  else: 
    print(f"Could not parse {name}")

kokolearn-admin KokoLearn123...
Could not parse Lowest Common Ancestor of a Binary Search Tree
Could not parse Binary Tree Level Order Traversal
Could not parse Binary Tree Right Side View
Could not parse Count Good Nodes In Binary Tree
Could not parse Validate Binary Search Tree
Could not parse Kth Smallest Element In a Bst
Could not parse Construct Binary Tree From Preorder And Inorder Traversal
Could not parse Binary Tree Maximum Path Sum
Could not parse Serialize And Deserialize Binary Tree
Could not parse Implement Trie Prefix Tree
Could not parse Design Add And Search Words Data Structure
Could not parse Word Search II
Could not parse Kth Largest Element In a Stream
Could not parse Last Stone Weight
Could not parse K Closest Points to Origin
Could not parse Kth Largest Element In An Array
Could not parse Task Scheduler,
Could not parse Design Twitter,
Could not parse Find Median From Data Stream,
Could not parse Subsets,
Could not parse Combination Sum,
Could not parse Permutatio