## Settings and pointers

In [17]:
service_account_file = '../../creds/google__sa.json'

tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"
included_sheet_names = [
    "Conversations_Batch_7",
    "Conversations_Batch_8",
    "Conversations_Batch_9",
]

jupyter_gdrive_folder_ids = [
    "1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9", # V0
    "1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb", # V1
    "1jV7WA5zB172DJUp7Z2XzHr62E6U6_NtY",
]

delivery_sheet_id = "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4"
delivery_jsonl_gdrive_folder_id = "1BXvUdQMulQkJk5rQvJr8Op07JbciBuTx"
destination_folder_url = f"https://drive.google.com/drive/folders/{delivery_jsonl_gdrive_folder_id}"
DELIVERY_BATCH_NAME = "Batch 10"

insights_sheet_id = "1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w"
INSIGHTS_VERSION_TAB = "v1 (Jan 25)"

## Source Code


In [5]:
import sys 
sys.path.append('../../')
import io
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import nbformat
import pandas as pd
from tqdm import tqdm
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload



def get_number_of_turns(messages):
    count = 0
    for message in messages:
        if message["role"] == "User":
            count += 1
    return count


def standardize_date_format(date):
    """
    Given a date string, standardize the date format to YYYY/MM/DD.
    """
    try:
        # Parse the date string into a datetime object
        standardized_date = datetime.strptime(date, "%Y/%m/%d")
    except ValueError:
        try:
            # Attempt to parse other common formats here
            # Example: MM/DD/YYYY
            standardized_date = datetime.strptime(date, "%m/%d/%Y")
        except ValueError:
            return "Invalid date format"

    # Format the datetime object into the desired string format
    return standardized_date.strftime("%Y/%m/%d")
###################################


#########################
    # Colab #
#########################


def update_colab_notebook(colab_link, local_nb_path, sa_creds_path):
    """
    Update a Google Colab notebook file in Google Drive.

    :param colab_link: The link to the Colab notebook in Google Drive.
    :param local_nb_path: The local path of the notebook file to upload.
    :param sa_creds_path: The path to the service account credentials.
    """
    try:
        # Extract file ID from the Colab link
        file_id = colab_link.split('/drive/')[1].split('/')[0]
    except IndexError:
        raise ValueError("Invalid Colab link format")

    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(sa_creds_path, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Specify the file to upload
    media = MediaFileUpload(local_nb_path, resumable=True)

    # Update the file
    try:
        updated_file = service.files().update(fileId=file_id, media_body=media).execute()
        return f"Updated file ID: {updated_file.get('id')}"
    except Exception as e:
        return f"Error updating file: {e}"


def get_colab_notebook(colab_link, sa_creds_path) -> nbformat.NotebookNode:
    try:
        # Extract file ID from the Colab link
        file_id = colab_link.split('/drive/')[1].split('/')[0]
    except IndexError:
        raise ValueError("Invalid Colab link format")

    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(sa_creds_path, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Download the file
    try:
        request = service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
        
        # Load as nbformat notebook
        notebook = nbformat.reads(fh.getvalue().decode(), as_version=4)
        return notebook
    except Exception as e:
        print(f"Error downloading file: {e}")
        return None


def get_file_name_from_colab_link(colab_link, service_account_file):
    try:
        file_id = colab_link.split('/drive/')[1]
    except IndexError:
        return None

    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    try:
        file = service.files().get(fileId=file_id).execute()
        return file.get('name')
    except Exception as e:
        return None


def fetch_file_names_parallel(links, service_account_file, max_workers=100):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_file_name_from_colab_link, link, service_account_file): link for link in links}
        results = {}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching File Names"):
            link = futures[future]
            try:
                file_name = future.result()
                results[link] = file_name
            except Exception as e:
                results[link] = None
        return results

## Read Remote Sheet

In [6]:
from src.sheets_utils import download_sheet_as_df


progress_batches = []
for sheet_name in included_sheet_names:
    print(sheet_name)
    bdf = download_sheet_as_df(service_account_file, tracking_sheet_id, sheet_name)
    progress_batches.append(bdf)

df = pd.concat(progress_batches, ignore_index=True)
completed_df = df[df["completion_status"] == "Done"]
completed_df = completed_df.drop_duplicates(subset=["task_link"])

delivered = pd.concat([
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 1"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 2"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 3"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 4"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 5"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 6"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 7"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 8"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 9"),
], ignore_index=True)


completed_to_be_delivered_df = completed_df[~completed_df["task_link"].isin(delivered["task_link"])]
completed_to_be_delivered_df

Conversations_Batch_7
Conversations_Batch_8
Conversations_Batch_9


Unnamed: 0,task_link,metadata__topic,assigned_to_email,completion_status,modified_question?,duration_mins,completion_date,comments,metadata__type,metadata__target_length,review_status,reviewer_email,Start Time,End Time
224,https://colab.research.google.com/drive/16akew...,228_problem solving - write code in python.ipynb,"alexsander.j@turing,com",Done,FALSE,29,2/17/2024,,,,,,-,-
505,5,python basics & scripting - explain complex co...,chandrashekhar.s@turing.com,Done,FALSE,45,7/2/2024,,,,Reviewed,paulo.c@turing.com,,
1246,https://colab.research.google.com/drive/1zOj_5...,cloud_computing_OR_frameworks__answer_infra_qu...,aarunik.g@turing.com,Done,,60,2/19/2024,,,,,,,
1247,https://colab.research.google.com/drive/1l4T0b...,cloud_computing_OR_frameworks__answer_infra_qu...,aarunik.g@turing.com,Done,,60,2/19/2024,,,,,,,
1248,https://colab.research.google.com/drive/1DH6pd...,machine_learning__write_code_in_python__10_V8_...,aarunik.g@turing.com,Done,,60,2/19/2024,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3183,https://colab.research.google.com/drive/1R7Z-6...,web_development__write_code_in_python__5__19-0...,arnold.o@turing.com,Done,,60,2/20/2024,,,,Unreviewed,,,
3184,https://colab.research.google.com/drive/1Ybter...,web_development__write_code_in_python__6__19-0...,arnold.o@turing.com,Done,,60,2/20/2024,,,,Unreviewed,,,
3185,https://colab.research.google.com/drive/112T7j...,web_development__write_code_in_python__7__19-0...,arnold.o@turing.com,Done,,60,2/20/2024,,,,Unreviewed,,,
3186,https://colab.research.google.com/drive/1WQuPA...,web_development__write_code_in_python__8__19-0...,arnold.o@turing.com,Done,,60,2/19/2023,,,,Unreviewed,,,


In [7]:
from src.llm_reviewer.notebook_parser import notebook_parser
from concurrent.futures import ThreadPoolExecutor, as_completed


notebooks = []
results = []
errors = []

def process_task_link(task_link):
    try:
        notebook = get_colab_notebook(task_link, service_account_file)
        parsed_notebook = notebook_parser(notebook)
        parsed_messages = parsed_notebook["messages"]
        number_of_turns = get_number_of_turns(parsed_messages)
        return parsed_notebook, {
            "task_link": task_link,
            "n_messages": len(parsed_messages),
            "number_of_turns": number_of_turns,
        }
    except Exception as e:
        return None, {
            "task_link": task_link,
            "error": str(e)
        }

with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(process_task_link, task_link) for task_link in completed_to_be_delivered_df["task_link"].tolist()]
    for future in as_completed(futures):
        notebook, result = future.result()
        if notebook is not None:
            notebooks.append(notebook)
            results.append(result)
        else:
            errors.append(result)


results_df = pd.DataFrame(results)
results_df

Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='User', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='User', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=Chat

Unnamed: 0,task_link,n_messages,number_of_turns
0,https://colab.research.google.com/drive/1zOj_5...,8,4
1,https://colab.research.google.com/drive/1UoJmB...,6,3
2,https://colab.research.google.com/drive/16akew...,8,3
3,https://colab.research.google.com/drive/1WDGIi...,11,4
4,https://colab.research.google.com/drive/1EPUNK...,8,4
...,...,...,...
556,https://colab.research.google.com/drive/112T7j...,6,2
557,https://colab.research.google.com/drive/1R7Z-6...,8,4
558,https://colab.research.google.com/drive/1WQuPA...,10,4
559,https://colab.research.google.com/drive/17WwbW...,12,4


## GPT Review

In [None]:
import json

with open("batch_5_parsed_reviewed.json", "r") as f:
    results = json.load(f)

results = [r for r in results if r["n_messages"] != 0]
for r in results:
    r.pop("quality_review")

In [None]:
import os
from typing import List
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

import tiktoken
from pydantic import BaseModel, Field
from llama_index.llms.openai import OpenAI
from llama_index import ServiceContext, set_global_service_context
from llama_index.program import OpenAIPydanticProgram
from llama_index.callbacks import CallbackManager, TokenCountingHandler

api_key = os.environ["OPENAI_API_KEY"]


token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-4-1106-preview").encode
)
callback_manager = CallbackManager([token_counter])

class Feedback(BaseModel):
    score: int = Field(description="A score representing how good the conversation is in the given quality aspect, 1 is terrible, 5 is exemplary and flawless.", ge=1, le=5)
    issues: List[str] = Field(description="A concrete list of issues in the conversation. 15 words or less each.")
    praises: List[str] = Field(description="A concrete list of praise for exceptional behavior the conversation. 15 words or less each.")


class QualityAspect(BaseModel):
    name: str = Field(description="The name of the quality aspect.")
    instruction: str = Field(description="Instructions & details on how to inspect this quality aspect.")


quality_aspects = {
    "Overall": [
        QualityAspect(
            name="Completness",
            instruction="""
            How complete is the conversation? Completeness is defined as:
            - The assistant always responds to the user.
            - The conversation contains at least 1 back and forth between the user and the assistant.
            - The conversation flow is not broken.

            JUDGE THE ENTIRE CONVERSATION AS A WHOLE.
            """
        ),
    ],
    "User": [
        QualityAspect(
            name="Natural & Realistic", 
            instruction="""
            How does the user interaction resemble a real conversation and interactions a real user would have with a highly intelligent coding assistant.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
        QualityAspect(
            name="Coherent Follow ups", 
            instruction="""
            How coherent are the user's follow up messages to the assistant's messages in the conversation as a whole.
            Ideally, the user would incrementally build on the conversation to achieve their goal.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
    ],
    "Assistant": [
        QualityAspect(
            name="Code Quality", 
            instruction="""
            How good is the code that the assistant generates.
            Qualities:
            #   - Correctness
            #   - Optimality
            #   - PEP8 Compliance & Readability

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Text Quality", 
            instruction="""
            How good is the text that the assistant generates.
            Qualities:
            #   - Spelling
            #   - Grammar
            #   - Capitalization & Punctuation
            #   - Information Density (Should be a sweet spot leaning on the concise side, but not too concise... definitely not too verbose)
            #   - Explains Code Well

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Markdown Formatting", 
            instruction="""
            How good is the markdown formatting that the assistant generates. Is it leveraging markdown syntax tools to maximize the readability of the text?

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        )
    ]
}


def inspect_conversation_quality_aspect(conversation: List[List[dict]], quality_aspect: QualityAspect):
    """Inspect a conversation for a given quality aspect."""

    prompt_template_str = """
    IDENTITY:
    You are one of many specialized judges, so precisely focus on your quality aspect only.

    SITUATION:
    A large team is building a dataset of illustractions of dialogues showcasing the interaction between a user and a highly intelligent AI in the context of software development scenarios.
    - The user's replies should closely resemble authentic user engagement.
    - The AI's responses should aim to provide maximum benefit to the user.

    INSTRUCTIONS:
    Given the following conversation, please rate the quality of the conversation according to the given quality aspect.
    
    ALL QUALITY ASPECTS:
    {all_quality_aspects}

    YOUR QUALITY ASPECT:
    {quality_aspect}
    
    CONVERSATION:
    {conversation}
    """
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        callback_manager=callback_manager,
        output_cls=Feedback, 
        prompt_template_str=prompt_template_str, 
    )
    all_quality_aspects = "\n".join([f"- {key}: {quality_aspect.name}" for key in quality_aspects.keys() for quality_aspect in quality_aspects[key]])
    output = program( 
        all_quality_aspects=all_quality_aspects,
        quality_aspect=quality_aspect.model_dump(),
        conversation=conversation["messages"],
        description="Judge the quality of the conversation according to the given quality aspect. Provide constructive criticism, rarely praise."
    )
    return output


def inspect_all_conversation_quality_aspects(conversation) -> dict:
    """Inspect a conversation for all quality aspects."""

    quality_results = {}
    for key in quality_aspects.keys():
        for quality_aspect in quality_aspects[key]:
            r = inspect_conversation_quality_aspect(conversation, quality_aspect)
            quality_results[f"{key} - {quality_aspect.name}"] = r.model_dump()

    return quality_results

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

reviewed_results = []

def process_notebook(result):
    if result is None or len(result["notebook"]["messages"]) == 0:
        return None
    result["quality_review"] = inspect_all_conversation_quality_aspects(result["notebook"])
    return result

with tqdm(total=len(results), desc="Processing notebooks") as pbar:
    with ThreadPoolExecutor(max_workers=15) as executor:
        futures = [executor.submit(process_notebook, result) for result in results]
        for future in as_completed(futures):
            r = future.result()
            if r is not None:
                reviewed_results.append(r)
            pbar.update(1)


Processing notebooks: 100%|██████████| 1060/1060 [52:46<00:00,  2.99s/it] 


In [None]:
import json 

with open("batch_5_parsed_reviewed.json", "w") as f:
    json.dump(reviewed_results, f, indent=4)

with open("batch_5_parsed_reviewed.json", "r") as f:
    reviewed_results = json.load(f)

NameError: name 'reviewed_results' is not defined

In [None]:
reviewed_results[2]

{'task_link': 'https://colab.research.google.com/drive/1yHgim2bxGo_Sf0c1BsSaFROtZZdfpiud',
 'n_messages': 6,
 'number_of_turns': 2,
 'notebook': {'metadata': {'topic': 'algorithms > by_data_structure > heaps',
   'type': 'modification',
   'target_turns': '2+'},
  'messages': [{'role': 'User',
    'content': "I have a min-heap implemented as a list in Python, and I'd like to add a method to get the kth smallest element. Could you help me with that? Here's my current min-heap class:\n```python\nclass MinHeap:\n    def __init__(self):\n        self.heap = []\n\n    def insert(self, val):\n        # Implementation for insert\n        pass\n\n    def get_min(self):\n        # Implementation for get_min\n        pass\n```",
    'type': 'markdown'},
   {'role': 'Assistant',
    'content': "Sure, here's a simple breakdown of how the kth_smallest method works in our MinHeap class:\n\n- ***Copy Heap***: Start by making a copy of the min-heap.\n- ***Heapify***: Reorganize the copy to maintain mi

In [None]:
data_skeleton = []
for rr in reviewed_results:
    scores = []
    feedback = ""
    for key in rr["quality_review"].keys():
        scores.append(rr["quality_review"][key]["score"])
        stringified_issues = "\n".join([f"- {issue}" for issue in rr["quality_review"][key]["issues"]])
        feedback += f"{key}: \n{stringified_issues}\n\n"

    data_skeleton.append({
        "task_link": rr["task_link"],
        "avg_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "issues": feedback,
    })

df_gpt_reviews = pd.DataFrame(data_skeleton)
df_gpt_reviews

Unnamed: 0,task_link,avg_score,min_score,issues
0,https://colab.research.google.com/drive/182asg...,4.000000,3,Overall - Completness: \n- Assistant's initial...
1,https://colab.research.google.com/drive/1qfFDR...,4.666667,4,Overall - Completness: \n\n\nUser - Natural & ...
2,https://colab.research.google.com/drive/1yHgim...,4.333333,3,Overall - Completness: \n\n\nUser - Natural & ...
3,https://colab.research.google.com/drive/1IMCwR...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
4,https://colab.research.google.com/drive/1jAjfx...,4.333333,4,Overall - Completness: \n\n\nUser - Natural & ...
...,...,...,...,...
1055,https://colab.research.google.com/drive/1PFqmH...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
1056,https://colab.research.google.com/drive/1qPvTL...,4.833333,4,Overall - Completness: \n\n\nUser - Natural & ...
1057,https://colab.research.google.com/drive/1lekWL...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
1058,https://colab.research.google.com/drive/1x_-q3...,4.666667,4,Overall - Completness: \n\n\nUser - Natural & ...


In [None]:
pd.set_option('display.max_colwidth', 500)


borderline_avg_flags = df_gpt_reviews.sort_values(by="avg_score", ascending=False)[df_gpt_reviews["avg_score"] < 4]
critical_mistake_flags = df_gpt_reviews.sort_values(by="min_score", ascending=False)[df_gpt_reviews["min_score"] < 3]

all_flags = pd.concat([borderline_avg_flags, critical_mistake_flags], ignore_index=True)
all_flags = all_flags.drop_duplicates(subset=["task_link"])

all_flags.sort_values(by="min_score", ascending=False)

In [None]:
all_flags = all_flags.merge(completed_to_be_delivered_df, on="task_link", how="left")[["task_link", "avg_score", "min_score", "issues", "assigned_to_email"]]
all_flags = all_flags.rename(columns={"assigned_to_email": "original_author_email"})
all_flags.to_csv("batch_5_flags.csv", index=False)

## Extract Advanced Metadata

In [11]:
import os
from typing import List
from pydantic import BaseModel, Field
from llama_index.program import OpenAIPydanticProgram
from llama_index.llms.openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures

from tqdm import tqdm
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())
api_key = os.environ["OPENAI_API_KEY"]


class HierarchicalCategory(BaseModel):
    """Data model for hierarchical category classification."""
    top_level: str
    sub_level: str

def classify_conversation_by_domain(conversation: List[dict]) -> HierarchicalCategory:
    DOMAIN_CATEGORIES = """
        - Python basics & scripting
        - Problem Solving
        - Interview Prep
        - Web Development
        - Testing
        - Cloud Computing / Frameworks
        - Data Analysis
        - Machine Learning
        - Other languages
        - Other
    """

    prompt_template_str = """
    Categorize the theme of user requests in the following conversation by domain into one of the following top-level categories, then sub categories that you think is descriptive & appropriate:
    {categories}

    Conversation:
    {conversation}
    """

    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=HierarchicalCategory,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        categories=DOMAIN_CATEGORIES,
        conversation=conversation["messages"],
    )
    return output


def classify_conversation_by_action(
    conversation: List[dict]
) -> HierarchicalCategory:
    ACTION_CATEGORIES = """
    - Write code in python: this should have the user make requests that elicit python code writing behavior from the assistant.
    - Explain code: this should have the user present medium/high complexity code to the assistant and have the assistant explain it
    - Fix / refactor / optimize code: this should have the user present medium/high complexity code to the assistant and have the assistant do modifications on it as requested.
    - Debug error trace: the user should present a stack trace and some code and the assistant will find what the problem is and potentially fix the code (It's okay to have situations where the bug is not in the presented code but in a dependency... though this should be rare).... This EXCLUDES having the assistant teach the user how to use debug tools to find what the problem is themselves
    - Write unit tests: this should have the user present some low/medium/high complexity code to the assistant and have the assistant write tests for it... maximizing test coverage. (Critical Path first, Corner Cases Second)
    - Write CI/CD code: this should have the user request some help from the assistant in writing ci/cd pipelines in any flavor. (Github actions, Gitlab, Jenkins... etc)
    - Do a code review: this should have the user present some code snippet and request the assistant to review the code as if it's a PR... providing high level conceptual feedback, modifying any bugs and using inline comments to mark changes or suggest alternatives.
    - Write / modify / fix beam code: this should have the user present some data schema or dummy data and have the assistant write beam code for it.
    - Write / modify / fix spark code: this should have the user present some data schema or dummy data and have the assistant write spark code for it.
    - Write end to end ML training code: scenarios where the conversation has the user and assistant solving a problem e2e data eda/prep, feature extraction, training, maybe some evals and visuals as well
    - Help me take an interview: scenario where the user requests the assistant to act as an interviewer and do a mock interview with a focus on a certain area... this should also include some final section where the assistant gives feedback to the user on how to be better... etc (Take inspiration from real interview questions, they should be at least medium complexity and occasionally challenging)
    - Answer ML research questions: this is where the user will ask some cutting edge conceptual questions related to ML Research Hot topics to the assistant... assistant can but is not obligated to provide code as a response.
    - Answer infra questions: user asks some conceptual or code snippet related questions within the scope of cloud, backend, database, development tools... all flavors are welcome!
    - Write / modify / fix SQL code: this should have the user elicit interaction from the assistant within the context of SQL code.
    - Write / modify / fix JavaScript code: this should have the user elicit interaction from the assistant within the context of Javascript code.
    - Scrape a website: this should have the user present some html and the assistant write code to scrape it.
    """
    prompt_template_str = """
    Categorize the user requests in the following conversation by requested action into one of the following top-level categories. Sub-level should be empty string always. In case there's no natural fit, use "Other" as the top-level category.
    
    Please note that there are "metadata" fields in the conversation that describe the intended top-level category via "Project / Action"... this should be considered, but may be overridden if the conversation is clearly about something else.

    Categories:
    {categories}

    Conversation:
    {conversation}

    """

    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=HierarchicalCategory,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        categories=ACTION_CATEGORIES,
        conversation=conversation,
    )
    return output


class SummaryResult(BaseModel):
    """Data model for the summary result."""
    summary: str = Field(
        description="A short summary containing 1 sentence, 15 words max, focused on the specific theme. [super concise language]"
    )

def exec_summary(conversation: List[List[dict]]):
    prompt_template_str = """
    Given the following conversation, please, generate an executive summary of the conversation.

    User Use Case, why user uses the Assistant in this conversation, in general terms, **for what** the User is using it. Not from a technical perspective, but from a daily life situation perspective. 
    Example: work, homework, exam, studying, inteview, debugging, etc...

    It should also contain a little bit of the context of the conversation, and the main goal of the conversation.

    Conversation:
    {conversation}
    """
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=SummaryResult,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        conversation=conversation["messages"]
    )
    return output


class GPTEstimationResult(BaseModel):
    """Data model for the GPT estimation result."""
    estimated_duration: int = Field(
        description="The estimated duration of the conversation in minutes."
    )

def gpt_estimated_duration(conversation: List[dict]) -> int:
    prompt_template_str = """
    Given the following conversation which has been generated by a median skilled technical human playing both User and Assistant... He also is responsible for making sure the assistant responses are flawless...
    Estimate how many minutes it would take to Design, Write & Verify this (Conversation Length, Complexity).

    If you get this right, you will save my life.

    Conversation:
    {conversation}
    """
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=GPTEstimationResult,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        conversation=conversation["messages"]
    )
    return output


def process_conversation__metadata_extraction(conversation, task_link):
    domain = classify_conversation_by_domain(conversation)
    action = classify_conversation_by_action(conversation)
    summary = exec_summary(conversation)
    estimate_duration = gpt_estimated_duration(conversation)
    conversation["metadata"].update({
        "domain": domain.model_dump(), 
        "action": action.model_dump()["top_level"], 
        "use_case_summary": summary.model_dump()["summary"],
        "gpt_estimated_duration": estimate_duration.model_dump()["estimated_duration"],
        "task_link": task_link
    })
    return conversation
    
    

def extract_metadata_parallel(conversations, task_links, max_workers=15):
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_conversation__metadata_extraction, conversation, task_link)
            for conversation, task_link in zip(conversations, task_links)
        ]
        progress_bar = tqdm(total=len(futures))
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
            progress_bar.update(1)
        progress_bar.close()
    return results


metadata_results = extract_metadata_parallel(notebooks, results_df["task_link"].tolist())

100%|██████████| 561/561 [09:29<00:00,  1.01s/it]


In [12]:
rich_task_metadata = []

for task_link in metadata_results:
    rich_task_metadata = []
    for task_link in metadata_results:
        row_main = {
            "task_link": task_link["metadata"]["task_link"],
            "number_of_turns": get_number_of_turns(task_link["messages"]),
            "batch_id": DELIVERY_BATCH_NAME[-1],
            "domain": {
                "top_level": task_link["metadata"]["domain"]["top_level"],
                "sub_level": task_link["metadata"]["domain"]["sub_level"],
            },
            "action": task_link["metadata"]["action"],
            "use_case__summary": task_link["metadata"]["use_case_summary"],
            "gpt_estimated_duration": task_link["metadata"]["gpt_estimated_duration"],
        }
        rich_task_metadata.append(row_main)


rich_flattened_metadata = []
for metadata in rich_task_metadata:
    current_metadata = metadata.copy()
    for key in metadata.keys():
        if isinstance(metadata[key], dict):
            for sub_key in metadata[key].keys():
                current_metadata[f"{key}__{sub_key}"] = current_metadata[key][sub_key]
            current_metadata.pop(key)
    rich_flattened_metadata.append(current_metadata)

df_metadata__output = pd.DataFrame(rich_flattened_metadata)
df_metadata__output

Unnamed: 0,task_link,number_of_turns,batch_id,action,use_case__summary,gpt_estimated_duration,domain__top_level,domain__sub_level
0,https://colab.research.google.com/drive/1UoJmB...,3,0,Answer infra questions,User seeks assistance for deploying a Flask ap...,10,Cloud Computing / Frameworks,Serverless Deployment
1,https://colab.research.google.com/drive/1EnQ3Y...,3,0,Write code in python,User seeks assistance for coding CRUD operatio...,10,Web Development,Flask / Backend Development
2,https://colab.research.google.com/drive/1NZy7e...,7,0,Answer infra questions,User seeks to understand architectural differe...,30,Cloud Computing / Frameworks,Architecture Patterns
3,https://colab.research.google.com/drive/1zOj_5...,4,0,Answer infra questions,The User is evaluating cloud providers for a S...,30,Cloud Computing / Frameworks,Cloud Provider Comparison
4,https://colab.research.google.com/drive/1y8bY0...,3,0,Debug error trace,User seeks assistance with Airflow webserver i...,10,Cloud Computing / Frameworks,Airflow
...,...,...,...,...,...,...,...,...
556,https://colab.research.google.com/drive/1c9Lky...,3,0,Write code in python,The User seeks assistance with implementing se...,30,Web Development,User Authentication
557,https://colab.research.google.com/drive/17WwbW...,4,0,Write code in python,A web developer seeks to enhance a Flask app w...,30,Web Development,Middleware Integration
558,https://colab.research.google.com/drive/14SuVN...,3,0,Write code in python,User seeks assistance for automating photo org...,60,Python basics & scripting,File and data manipulation
559,https://colab.research.google.com/drive/1VFE26...,2,0,Write code in python,User seeks assistance for creating a Flask app...,60,Web Development,Flask Web Framework


In [13]:
df_metadata__output["action"].value_counts()

action
Write code in python                    200
Answer infra questions                   53
Explain code                             41
Debug error trace                        39
Fix / refactor / optimize code           38
Write unit tests                         36
Write end to end ML training code        31
Help me take an interview                27
Scrape a website                         17
Write / modify / fix SQL code            17
Write / modify / fix JavaScript code     16
Write CI/CD code                         14
Do a code review                         10
Answer ML research questions              8
Write / modify / fix spark code           7
Write / modify / fix beam code            5
Other                                     2
Name: count, dtype: int64

In [14]:
from src.sheets_utils import upload_df_to_sheet, GoogleSheetsService

sheets_client = GoogleSheetsService(service_account_file, ['https://www.googleapis.com/auth/spreadsheets'])
df_metadata__output = df_metadata__output[["task_link", "batch_id", "number_of_turns", "gpt_estimated_duration", "action", "domain__top_level", "domain__sub_level", "use_case__summary"]]
values = [df_metadata__output.columns.tolist()] + df_metadata__output.values.tolist()
sheets_client.update_or_append_data_to_sheet(insights_sheet_id, INSIGHTS_VERSION_TAB, values)

Updated or appended data to 'v1 (Jan 25)'


{'spreadsheetId': '1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w',
 'tableRange': "'v1 (Jan 25)'!A1:H5288",
 'updates': {'spreadsheetId': '1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w',
  'updatedRange': "'v1 (Jan 25)'!A5289:H5850",
  'updatedRows': 562,
  'updatedColumns': 8,
  'updatedCells': 4496}}

In [15]:
import os
import json

directory = f"json_conversations/{DELIVERY_BATCH_NAME}"
if not os.path.exists(directory):
    os.makedirs(directory)

valid_notebooks = []
for r, n in zip(results, notebooks):
    if r is None or r["n_messages"] == 0:
        continue
    n["task_link"] = r["task_link"]
    valid_notebooks.append(n)

parsed_jsons = []
for vn in valid_notebooks:
    for rtm in rich_task_metadata:
        if vn["task_link"] == rtm["task_link"]:
            vn["metadata"] = rtm
            parsed_jsons.append(vn)

for pj in parsed_jsons:
    pj["id"] = pj.pop("task_link").split("/")[-1]
    try:
        pj["metadata"].pop("duration_mins")
        pj["metadata"].pop("batch_id")
    except KeyError:
        pass

for i, conversation in enumerate(parsed_jsons):
    drive_id = conversation["id"] 
    with open(f"json_conversations/{DELIVERY_BATCH_NAME}/{drive_id}.json", "w") as f:
        f.write(json.dumps(conversation))

## Upload JSONL

In [18]:
from src.gdrive_api import build_service
from src.gdrive_api.folder_upload import upload_folder

from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials

import io
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload, MediaIoBaseUpload



def upload_gdrive_file(file_contents, folder_id, service_account_file):
    """
    Re-uploads a dictionary from memory as a JSON file to Google Drive. 

    Parameters:
    - file_contents: dict
        The file contents to upload.
    - folder_id: str
        The ID of the file to upload.
    - service_account_file: str
        The path to the service account file.

    Returns True if the file was successfully uploaded, False otherwise.
    """
    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Convert the dictionary to JSON and prepare it for upload
    file_metadata = {
        'name': f'{file_contents["id"]}.json',
        'parents': [folder_id]
    }
    file_data = io.BytesIO(json.dumps(file_contents).encode('utf-8'))
    media = MediaIoBaseUpload(file_data, mimetype='application/json')

    # Upload the file
    try:
        file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        return True
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


def parallel_execute_with_progress(function, arguments, max_workers=10):
    """
    Executes a function in parallel with multiple arguments displaying a tqdm progress bar.

    Parameters
    function: function
        The function to execute
    arguments: list
        A list of tuples, where each tuple contains the arguments to pass to the function
    max_workers: int
        The maximum number of workers to use

    Returns a list of results
    """
    # Create a thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Execute the function for each argument
        futures = [executor.submit(function, *args) for args in arguments]
        results = []
        for future in tqdm(as_completed(futures), total=len(futures)):
            results.append(future.result())
    return results


statuses = parallel_execute_with_progress(upload_gdrive_file, [(pj, delivery_jsonl_gdrive_folder_id, service_account_file) for pj in parsed_jsons], max_workers=50)
pd.Series(statuses).value_counts()

100%|██████████| 561/561 [00:36<00:00, 15.51it/s]


True    561
Name: count, dtype: int64

In [19]:
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Function to check if a file is a folder
def is_folder(file):
    return file.get('mimeType') == 'application/vnd.google-apps.folder'

# Function to process files and folders
def process_files(service, folder_id, parent_folders=[]):
    query = f"'{folder_id}' in parents and trashed = false"
    page_token = None

    all_files = []
    while True:
        response = service.files().list(q=query,
                                        spaces='drive',
                                        fields='nextPageToken, files(id, name, mimeType, webViewLink)',
                                        pageToken=page_token).execute()

        for file in response.get('files', []):
            # Skip 'tool_data' folder
            if file.get('name') == 'tool_data' and is_folder(file):
                continue

            all_files.append(file)

            # Process the file or folder
            print('Processing:', '/'.join(parent_folders + [file.get('name')]))

            # If it's a folder, recursively process its contents
            if is_folder(file):
                children_files = process_files(service, file.get('id'), parent_folders + [file.get('name')])
                all_files.extend(children_files)

        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break

    return all_files

# Authenticate and create the service
SERVICE_ACCOUNT_FILE = service_account_file
SCOPES = ['https://www.googleapis.com/auth/drive']
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('drive', 'v3', credentials=credentials)

# Replace with your Google Drive folder ID
folder_id = destination_folder_url.split("/")[-1]

# Start processing from the specified folder
all_files = process_files(service, folder_id)

jsonl_df = pd.DataFrame(all_files)
jsonl_df = jsonl_df[["id", "name", "webViewLink"]]
jsonl_df

Processing: 112T7jIlxZ9OxXh3vRhMHsfScKvkQKsYs.json
Processing: 1kMh0GcOJQs_Hsv8pVYOCcajcrPdUV-4X.json
Processing: 17WwbWsmK1FjQyKUFlBz0S6NHRbYrdlQu.json
Processing: 1R7Z-6XVWhKd8jxBIwooXGAJpgF9TYZXL.json
Processing: 1OmBS8YfCz9auI-6_wRpxReUmuKjlJ6bA.json
Processing: 1WQuPASB2kwBdhbyxnFJ-75o6HkpPrO2z.json
Processing: 1c9LkyN6hDRb7xE_-P9tUsEO8e__A9QIX.json
Processing: 1YbterBoJEMH_VoVRM8BGEqB_uNmYIvmx.json
Processing: 1ac2j36_rUVpFJqFaOaIhr5__dStaDabc.json
Processing: 109dFY5WvjBbYnGAgezv7M4Z6pSZ8SLsq.json
Processing: 1My_UEqBv2_JcS1JWxHkyd0t9gOh5t261.json
Processing: 1FC74V6bo-hiSdF3IHdX3vggOv6v97SNt.json
Processing: 1N1wWXM2YbW88Jw6Tec_ypUqIHgwk66-2.json
Processing: 1JDKCg5E0BINBUEiRP-Kg8hQcGV-z87ed.json
Processing: 1fXKKKsG1lLmAfswXeS025D2DzglzzUHd.json
Processing: 11avVulvACI3u6OomqQxSLd6R7-D-ibxh.json
Processing: 1nxzJbJL8ZLzKGYZYSiSZi7Sl5aUyUpiy.json
Processing: 14LqskSlHATcPKG9ms328vQ-_omM5Y4yM.json
Processing: 1SDVU4b7zZ46EhQ_-h-nbeeNO3tFR0782.json
Processing: 1o7aEZrKPFuFCYhI4b2

Unnamed: 0,id,name,webViewLink
0,1di3Ikq5LiB3moG0E75rkZK4muYETUxVn,112T7jIlxZ9OxXh3vRhMHsfScKvkQKsYs.json,https://drive.google.com/file/d/1di3Ikq5LiB3mo...
1,1bBo9KjX3YAGeIfZ_8xzfOQhTEZppchRn,1kMh0GcOJQs_Hsv8pVYOCcajcrPdUV-4X.json,https://drive.google.com/file/d/1bBo9KjX3YAGeI...
2,1KlHJdNsI5D_8oXzVfOL_ZSmADJSqCjie,17WwbWsmK1FjQyKUFlBz0S6NHRbYrdlQu.json,https://drive.google.com/file/d/1KlHJdNsI5D_8o...
3,1jF1X3ZA2GG8niR_CR9QxRTmRMwVoek4v,1R7Z-6XVWhKd8jxBIwooXGAJpgF9TYZXL.json,https://drive.google.com/file/d/1jF1X3ZA2GG8ni...
4,1ZKBOls8fI0vezoyW-zYynU3sTMw5Eag4,1OmBS8YfCz9auI-6_wRpxReUmuKjlJ6bA.json,https://drive.google.com/file/d/1ZKBOls8fI0vez...
...,...,...,...
556,1smeBFGIqGj1llkhkNpyJM3pUYeejQHk7,1zOj_5wMcwAUMZ4scdxfJvhtg7luoXGn3.json,https://drive.google.com/file/d/1smeBFGIqGj1ll...
557,1_dXmzsV8cQDUq55AFg0J2J5TId13_zy2,1tS0oZayjTgt9rcTnQv88Oh6CKfbwsBBZ.json,https://drive.google.com/file/d/1_dXmzsV8cQDUq...
558,1IC2kmGVju4HX9q_uEIjwiyccQj5rboJz,1d3bS2Gti175I5sHJdXo4RW1TsNOzXPu5.json,https://drive.google.com/file/d/1IC2kmGVju4HX9...
559,1YESrG9sdNLJTImToRyICg_EooCe_CUbM,1WDGIi_FQ9p5RxEgWigDGorwqm5di3_Mn.json,https://drive.google.com/file/d/1YESrG9sdNLJTI...


In [20]:
conversation.keys()

parsed_jsons_ref = [
    {
        "colab_id": pj["id"],
        "task_link": pj["metadata"]["task_link"],
        "number_of_turns": pj["metadata"]["number_of_turns"],
    }
    for pj
    in parsed_jsons
]
conversation_df = pd.DataFrame(parsed_jsons_ref)
conversation_df

Unnamed: 0,colab_id,task_link,number_of_turns
0,1zOj_5wMcwAUMZ4scdxfJvhtg7luoXGn3,https://colab.research.google.com/drive/1zOj_5...,4
1,1UoJmB4imKSg2XT7ev1dguvMASrmp2YP1,https://colab.research.google.com/drive/1UoJmB...,3
2,16akewCNbJ_WKVcdT5A_3e-3JSKIfV6t6,https://colab.research.google.com/drive/16akew...,3
3,1WDGIi_FQ9p5RxEgWigDGorwqm5di3_Mn,https://colab.research.google.com/drive/1WDGIi...,4
4,1EPUNKPHSyVCz8vgRfybkNQi8VEy30DXK,https://colab.research.google.com/drive/1EPUNK...,4
...,...,...,...
556,112T7jIlxZ9OxXh3vRhMHsfScKvkQKsYs,https://colab.research.google.com/drive/112T7j...,2
557,1R7Z-6XVWhKd8jxBIwooXGAJpgF9TYZXL,https://colab.research.google.com/drive/1R7Z-6...,4
558,1WQuPASB2kwBdhbyxnFJ-75o6HkpPrO2z,https://colab.research.google.com/drive/1WQuPA...,4
559,17WwbWsmK1FjQyKUFlBz0S6NHRbYrdlQu,https://colab.research.google.com/drive/17WwbW...,4


In [21]:
jsonl_df["colab_id"] = jsonl_df["name"].apply(lambda x: x.split(".")[0])


df_merged = conversation_df.merge(jsonl_df, on="colab_id", how="inner")
df_merged = df_merged[["task_link", "number_of_turns", "webViewLink"]]
df_merged = df_merged.rename(columns={"webViewLink": "jsonl_link"})
df_merged

Unnamed: 0,task_link,number_of_turns,jsonl_link
0,https://colab.research.google.com/drive/1zOj_5...,4,https://drive.google.com/file/d/1smeBFGIqGj1ll...
1,https://colab.research.google.com/drive/1UoJmB...,3,https://drive.google.com/file/d/1JKxehgbnnd9RU...
2,https://colab.research.google.com/drive/16akew...,3,https://drive.google.com/file/d/1gb60vTwO78hkB...
3,https://colab.research.google.com/drive/1WDGIi...,4,https://drive.google.com/file/d/1YESrG9sdNLJTI...
4,https://colab.research.google.com/drive/1EPUNK...,4,https://drive.google.com/file/d/15-jPR2TKT3Sav...
...,...,...,...
556,https://colab.research.google.com/drive/112T7j...,2,https://drive.google.com/file/d/1di3Ikq5LiB3mo...
557,https://colab.research.google.com/drive/1R7Z-6...,4,https://drive.google.com/file/d/1jF1X3ZA2GG8ni...
558,https://colab.research.google.com/drive/1WQuPA...,4,https://drive.google.com/file/d/180pheUxammHNP...
559,https://colab.research.google.com/drive/17WwbW...,4,https://drive.google.com/file/d/1KlHJdNsI5D_8o...


## Upload Batch Sheet

In [22]:
from src.sheets_utils import upload_df_to_sheet

cols = ["task_link", "jsonl_link", "number_of_turns"]
upload_df_to_sheet(service_account_file, "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4", DELIVERY_BATCH_NAME, df_merged[cols])