In [1]:
import json
import threading

from google.oauth2 import service_account
from googleapiclient.discovery import build
from tqdm import tqdm

from src.Q_generator.question_generator import QuestionGenerator
from src.Q_generator.question_processor import crawl_keys
from src.Q_generator.question_processor import process_problems, find_and_load_all_problems, extract_questions_by_topic, load_complexity_data, flatten_complexity_scores, sort_and_rank_complexity_scores, create_and_save_notebook, process_notebooks_in_folder
from src.Q_generator.gdrive_upload import build_service_and_upload_files, update_problems_with_metadata

In [2]:
# Single generator instance for the whole notebook; the generation cell further down
# calls its generate_human_like_questions() method once per topic.
generator = QuestionGenerator()

In [None]:
# Collect the problems that already exist under ./notebooks and group their questions
# by topic. The grouped result is read later to seed the per-topic `existing_questions`
# handed to the generator.
# NOTE(review): presumably the result is a dict keyed by topic (it is queried with
# `.get(topic, set())` later) — confirm against extract_questions_by_topic.
parent_dir = "./notebooks"
all_problems = find_and_load_all_problems(parent_dir)
questions_grouped_by_topics = extract_questions_by_topic(all_problems)

# Debug aid — uncomment to print each topic and its questions
# for topic, questions in questions_grouped_by_topics.items():
#     print(f"Topic: {topic}")
#     for question in questions:
#         print(f" - {question}")

#### Load Topics

In [None]:
# Read the topic tree from `topic_hierarchy.json` and collect its topics via crawl_keys.
# The file is decoded explicitly as UTF-8 (the encoding JSON files are expected to use)
# instead of relying on the platform's default text encoding, which is not UTF-8 on
# every system.
with open('topic_hierarchy.json', encoding='utf-8') as json_file:
    topic_hierarchy = json.load(json_file)

all_topics = crawl_keys(topic_hierarchy)
print(f"Total number of topics: {len(all_topics)}")

In [9]:
def load_config(path="./configs/config.json"):
    """Load the notebook's JSON configuration.

    Args:
        path: Location of the JSON config file. Defaults to the path this notebook
            has always read, so existing ``load_config()`` calls behave as before.

    Returns:
        The parsed JSON document, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than with the platform default encoding,
    # so non-ASCII values load identically on every machine.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)

config = load_config()

# Ceiling that the generation cell compares the running question counter against.
MAX_QUESTIONS = config["MAX_QUESTIONS"]
# Starting value of that running counter. The generation cell re-reads it from
# `config` before it starts and then increments it once per generated question.
generated_questions_count = config["generated_questions_count"]

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# One entry per known topic: reuse the questions already collected for that topic,
# or start from a fresh empty set when the topic has none yet.
questions_grouped_by_topic = {
    known_topic: questions_grouped_by_topics.get(known_topic, set())
    for known_topic in all_topics
}

# Reset the accumulators so that re-running this cell starts from the configured count.
generated_questions_count = config["generated_questions_count"]
problems = []


# Serialises access to the shared `generated_questions_count` counter. Without it the
# worker threads race on the check-then-increment and can overshoot MAX_QUESTIONS.
_generated_count_lock = threading.Lock()


def generate_for_topic(topic, num_questions=3):
    """Generate new problems for one topic while respecting the global quota.

    Intended to run on worker threads: the shared counter
    ``generated_questions_count`` is only read or modified while holding
    ``_generated_count_lock``. The lock is never held during the (slow) generator
    call, so topics are still processed concurrently.

    Args:
        topic: Key into ``questions_grouped_by_topic``; also stored in each problem's
            metadata.
        num_questions: Number of questions requested from the generator per call.
            Defaults to 3, the value this notebook has always used.

    Returns:
        A list of problem dicts (``metadata`` plus a single user ``messages`` entry),
        one per accepted question. Empty when the quota was already exhausted.
    """
    global generated_questions_count

    new_problems = []
    existing_questions = questions_grouped_by_topic[topic]

    # Cheap pre-check: skip the generator call entirely once the quota is used up.
    with _generated_count_lock:
        if generated_questions_count >= MAX_QUESTIONS:
            return new_problems

    questions = generator.generate_human_like_questions(topic, num_questions, existing_questions)
    for question in questions["questions"]:
        # Reserve a slot atomically; other threads may have consumed the remaining
        # quota while this thread was waiting on the generator.
        with _generated_count_lock:
            if generated_questions_count >= MAX_QUESTIONS:
                break
            generated_questions_count += 1

        new_problems.append({
            "metadata": {
                "topic": topic,
                "type": "query",
                "difficulty": "Easy",
                "target_length": 1
            },
            "messages": [
                {"role": "user", "content": question},
            ]
        })
    return new_problems

# Upper bound on the number of generator calls running at the same time.
MAX_THREADS = 20

# The counter starts at config["generated_questions_count"], so at most this many new
# questions can still be produced; use it as the progress-bar total.
remaining_quota = max(MAX_QUESTIONS - config["generated_questions_count"], 0)

# (topic, exception) pairs for topics whose generation raised.
failed_topics = []

with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    # Remember which topic each future belongs to so failures can be attributed.
    future_to_topic = {executor.submit(generate_for_topic, topic): topic for topic in questions_grouped_by_topic.keys()}

    with tqdm(total=remaining_quota) as pbar:
        for future in as_completed(future_to_topic):
            try:
                new_problems = future.result()
            except Exception as exc:
                # A single failing topic must not discard the results of the topics
                # that complete after it; record it and keep collecting.
                failed_topics.append((future_to_topic[future], exc))
                continue
            problems.extend(new_problems)
            pbar.update(len(new_problems))

# Surface failures explicitly instead of losing them.
if failed_topics:
    print(f"Question generation failed for {len(failed_topics)} topic(s):")
    for failed_topic, error in failed_topics:
        print(f" - {failed_topic}: {error!r}")

In [None]:
# Post-process the generated problems. `problem_titles` is passed to
# create_and_save_notebook and `file_path_to_problem` to update_problems_with_metadata
# in later cells; `problem_topic_counts` is not used again in this notebook.
problem_titles, file_path_to_problem, problem_topic_counts = process_problems(problems)

In [None]:
# Turn the contents of ./topic_dist.json into `topic_percentiles` through three helper
# steps (load -> flatten -> sort and rank).
complexity_data = load_complexity_data('./topic_dist.json')
complexity_scores = flatten_complexity_scores(complexity_data)
topic_percentiles = sort_and_rank_complexity_scores(complexity_scores)

# NOTE(review): the output pattern points at notebooks/v3/old_pipe/, whereas the upload
# cell reads ./notebooks/v3/new_pipe/batch_1/ — confirm this is the intended location.
create_and_save_notebook('notebooks/v3/old_pipe/{}.ipynb', problems, problem_titles, topic_percentiles)

In [None]:
# Upload cell: hands the service-account key path, the local batch folder and the
# destination Drive folder URL to build_service_and_upload_files.
creds_path = 'creds/google__sa.json'
folder_path = './notebooks/v3/new_pipe/batch_1/'
destination_folder_url = 'https://drive.google.com/drive/u/2/folders/1FuHZZ18qn6k8iXTi5YbstGRr56w0kDEz'

file_path_to_url = build_service_and_upload_files(creds_path, folder_path, destination_folder_url)
# NOTE(review): the Sheets cell reads problem["metadata"]["colab_url"]; presumably this
# call is what sets it on each problem — confirm against update_problems_with_metadata.
update_problems_with_metadata(file_path_to_url, file_path_to_problem, "batch_1")


In [None]:
# Service-account key used to authenticate against Google Sheets
SERVICE_ACCOUNT_FILE = 'creds/google__sa.json'

# OAuth scope for the Sheets API
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# Spreadsheet that receives the rows
SPREADSHEET_ID = '1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4'

# Build an authenticated Sheets v4 client
creds = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)
service = build('sheets', 'v4', credentials=creds)

# Destination block: cells A380 through E1000 on the "Conversations_Batch_5" tab
range_ = 'Conversations_Batch_5!A380:E1000'

# One row per problem: [colab_url, topic]
values = [
    [entry["metadata"]["colab_url"], entry["metadata"]["topic"]]
    for entry in problems
]
body = {'values': values}

# Send the rows to the Sheets API with valueInputOption='RAW'
request = service.spreadsheets().values().update(
    spreadsheetId=SPREADSHEET_ID,
    range=range_,
    valueInputOption='RAW',
    body=body,
)
response = request.execute()

In [None]:
# Run process_notebooks_in_folder over the batch_2 notebook folder.
# Note: this rebinds `folder_path`, which the upload cell set to the batch_1 folder.
folder_path = './notebooks/v3/new_pipe/batch_2/'
process_notebooks_in_folder(folder_path)