In [60]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import json
import numpy as np

input = "/content/drive/MyDrive/AML Final Project/leetcode_dataset - lc.csv.zip"
output = "/content/drive/MyDrive/AML Final Project/algorithms.json"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [61]:
problems = {
    'Array': 20,
    'Dynamic Programming': 20,
    'Graph': 20,
    'Tree': 10,
    'Two Pointers': 10,
    'Backtracking': 5,
    'String': 10
}

In [79]:
def select_category(df, target_distribution):
    selected_problems = []
    df['is_selected'] = False

    search_patterns = {
        'Dynamic Programming': r'Dynamic Programming|Substring|Subsequence|Subproblem|Recursive|Fibonacci',
        'Graph': r'Graph|Node|Edge|Adjacency',
        'Tree': r'Tree|Root|Node|Preorder|Inorder|Postorder',
        'Two Pointers': r'Pointer|Window|Subarray|Substring',
        'Backtracking': r'Backtracking|Permutation|Combination|Subset',
        'String': r'String|Substring|Palindrome|Anagram',
        'Array': r'Array|List',
    }

    for category, count in target_distribution.items():
        search_term = search_patterns.get(category, category)

        available_df = df[
            ~df['is_selected'] &
            df['title'].astype(str).str.contains(search_term, case=False, na=False)
        ]

        to_select_count = min(len(available_df), count)
        to_select = available_df.sample(n=to_select_count, random_state=42)

        selected_problems.append(to_select)
        df.loc[to_select.index, 'is_selected'] = True

        print(f"Selected {len(to_select)} problems for category: {category} (Searched for: '{search_term}')")

    final_selection_df = pd.concat(selected_problems).reset_index(drop=True)
    return final_selection_df

def question_to_json_format(df):
    few_shot_data = []
    for index, row in df.iterrows():
        question_text = row.get('title', 'N/A')
        description = row.get('description', 'N/A')

        entry = {
          "problem": f"{question_text}|{description}",
        }

        few_shot_data.append(entry)

    return few_shot_data

In [80]:
df = pd.read_csv(input)
selected_df = select_category(df.copy(), problems)

json_data = question_to_json_format(selected_df)
with open(output, 'w') as f:
  json.dump(json_data, f, indent=2)

Selected 20 problems for category: Array (Searched for: 'Array|List')
Selected 20 problems for category: Dynamic Programming (Searched for: 'Dynamic Programming|Substring|Subsequence|Subproblem|Recursive|Fibonacci')
Selected 20 problems for category: Graph (Searched for: 'Graph|Node|Edge|Adjacency')
Selected 10 problems for category: Tree (Searched for: 'Tree|Root|Node|Preorder|Inorder|Postorder')
Selected 10 problems for category: Two Pointers (Searched for: 'Pointer|Window|Subarray|Substring')
Selected 5 problems for category: Backtracking (Searched for: 'Backtracking|Permutation|Combination|Subset')
Selected 10 problems for category: String (Searched for: 'String|Substring|Palindrome|Anagram')


In [81]:
from google.colab import files

JSON = "/content/drive/MyDrive/AML Final Project/algorithms.json"
files.download(JSON)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [86]:
import json
import os
import pandas as pd
import re
from datasets import load_dataset

questions_path = '/content/drive/MyDrive/AML Final Project/algorithms.json'
hf_dataset_id = "cassanof/leetcode-solutions"
output_file = '/content/drive/MyDrive/AML Final Project/merged_algorithms_by_slug_final.json'

hf_slug_field = 'slug'
hf_title_field = 'title'
solution_code_field = 'python_solutions'

def generate_slug(title):
    if not isinstance(title, str):
        return ""
    title = title.lower()
    title = re.sub(r' \(ii\)$| \(i\)$| - i$| - ii$| - iii$', '', title).strip()
    title = re.sub(r'[^\w\s-]', '', title)
    title = re.sub(r'[\s_]+', '-', title)
    return re.sub(r'-+', '-', title).strip('-')

def merge_by_slug(questions_path, hf_dataset_id, output_path):
    with open(questions_path, 'r', encoding='utf-8') as f:
        questions_data = json.load(f)
    try:
        dataset = load_dataset(hf_dataset_id)['train']
        solutions_df = dataset.to_pandas()
    except Exception as e:
        print(f"Error loading Hugging Face dataset: {e}")
        return

    if hf_slug_field not in solutions_df.columns or solution_code_field not in solutions_df.columns:
        print("Error: Hugging Face DataFrame is missing the required columns.")
        print(f"Required: '{hf_slug_field}' and '{solution_code_field}'.")
        print(f"Available columns: {solutions_df.columns.tolist()}")
        return

    solution_map = solutions_df.set_index(hf_slug_field)[solution_code_field].to_dict()
    print(f"Created lookup map for {len(solution_map)} solutions.")

    merged_count = 0
    unmatched_problems = []
    problems = 'problem'

    for q_entry in questions_data:
        combined_text = q_entry.get(problems)

        if combined_text and '|' in combined_text:
            raw_name = combined_text.split('|')[0].strip()
            generated_slug = generate_slug(raw_name)

            if generated_slug in solution_map:
                solution_code = solution_map[generated_slug]
                q_entry['answer'] = solution_code

                merged_count += 1

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(questions_data, f, indent=2)
    print(f"Successfully merged {merged_count} solutions based on slug matching.")

if __name__ == "__main__":
    merge_by_slug(questions_path, hf_dataset_id, output_file)

Repo card metadata block was not found. Setting CardData to empty.


Created lookup map for 1725 solutions.
Successfully merged 76 solutions based on slug matching.
