## Settings and pointers

In [1]:
service_account_file = '../../creds/google__sa.json'

tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"
included_sheet_names = [
    "historical__personal_corrections",
]

jupyter_gdrive_folder_ids = [
    "1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9", # V0
    "1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb", # V1
    "1jV7WA5zB172DJUp7Z2XzHr62E6U6_NtY",
]

delivery_sheet_id = "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4"
delivery_jsonl_gdrive_folder_id = "1xkWC7VYe0NthWxdxzZ8aVQI7s4QiWP6t"
destination_folder_url = f"https://drive.google.com/drive/folders/{delivery_jsonl_gdrive_folder_id}"


BATCH_NAME = "Redo 1-5 (Jan 25 Feedback)"

## Source Code


In [2]:
import sys 
sys.path.append('../../')
import io
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import nbformat
import pandas as pd
from tqdm import tqdm
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload



def get_number_of_turns(messages):
    count = 0
    for message in messages:
        if message["role"] == "User":
            count += 1
    return count


def standardize_date_format(date):
    """
    Given a date string, standardize the date format to YYYY/MM/DD.
    """
    try:
        # Parse the date string into a datetime object
        standardized_date = datetime.strptime(date, "%Y/%m/%d")
    except ValueError:
        try:
            # Attempt to parse other common formats here
            # Example: MM/DD/YYYY
            standardized_date = datetime.strptime(date, "%m/%d/%Y")
        except ValueError:
            return "Invalid date format"

    # Format the datetime object into the desired string format
    return standardized_date.strftime("%Y/%m/%d")
###################################


#########################
    # Colab #
#########################


def update_colab_notebook(colab_link, local_nb_path, sa_creds_path):
    """
    Update a Google Colab notebook file in Google Drive.

    :param colab_link: The link to the Colab notebook in Google Drive.
    :param local_nb_path: The local path of the notebook file to upload.
    :param sa_creds_path: The path to the service account credentials.
    """
    try:
        # Extract file ID from the Colab link
        file_id = colab_link.split('/drive/')[1].split('/')[0]
    except IndexError:
        raise ValueError("Invalid Colab link format")

    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(sa_creds_path, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Specify the file to upload
    media = MediaFileUpload(local_nb_path, resumable=True)

    # Update the file
    try:
        updated_file = service.files().update(fileId=file_id, media_body=media).execute()
        return f"Updated file ID: {updated_file.get('id')}"
    except Exception as e:
        return f"Error updating file: {e}"


def get_colab_notebook(colab_link, sa_creds_path) -> nbformat.NotebookNode:
    try:
        # Extract file ID from the Colab link
        file_id = colab_link.split('/drive/')[1].split('/')[0]
    except IndexError:
        raise ValueError("Invalid Colab link format")

    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(sa_creds_path, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Download the file
    try:
        request = service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
        
        # Load as nbformat notebook
        notebook = nbformat.reads(fh.getvalue().decode(), as_version=4)
        return notebook
    except Exception as e:
        print(f"Error downloading file: {e}")
        return None


def get_file_name_from_colab_link(colab_link, service_account_file):
    try:
        file_id = colab_link.split('/drive/')[1]
    except IndexError:
        return None

    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    try:
        file = service.files().get(fileId=file_id).execute()
        return file.get('name')
    except Exception as e:
        return None


def fetch_file_names_parallel(links, service_account_file, max_workers=100):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_file_name_from_colab_link, link, service_account_file): link for link in links}
        results = {}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching File Names"):
            link = futures[future]
            try:
                file_name = future.result()
                results[link] = file_name
            except Exception as e:
                results[link] = None
        return results

## Read Remote Sheet

In [4]:
from src.sheets_utils import download_sheet_as_df


df = download_sheet_as_df(service_account_file, tracking_sheet_id, "historical__personal_corrections")
completed_df = df[df["completion_status"] == "Done"]
completed_df = completed_df.drop_duplicates(subset=["task_link"])

delivered = pd.concat([
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 1"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 2"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 3"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 4"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 5"),
], ignore_index=True)


# Redeliver all previous batches
completed_to_be_delivered_df = completed_df[completed_df["task_link"].isin(delivered["task_link"])]
completed_to_be_delivered_df

Unnamed: 0,task_link,original_author,original_date,resolved_by_email,resolution_duration,completion_status,completion_date,corrections
0,https://colab.research.google.com/drive/1iF83Q...,caio.s@turing.com,12/21/2023,abdul.r@turing.com,4,Done,1/30/2024,
1,https://colab.research.google.com/drive/1Z3U6A...,edwin.n@turing.com,1/3/2024,aman.s@turng.com,13,Done,1/29/2024,added additional explanation in first turn
2,https://colab.research.google.com/drive/1PDXIU...,edwin.n@turing.com,1/5/2024,martinho.h@turing.com,35,Done,1/30/2024,"more formal language, remove links, rewrite di..."
3,https://colab.research.google.com/drive/1YdJXI...,edwin.n@turing.com,1/6/2024,aman.s@turng.com,5,Done,1/29/2024,general check
4,https://colab.research.google.com/drive/1RGj4d...,edwin.n@turing.com,1/7/2024,andranik.g@turing.com,20,Done,1/31/2024,created more formal text
...,...,...,...,...,...,...,...,...
3463,https://colab.research.google.com/drive/1Q37Ui...,ritesh.r@turing.com,12/30/2023,zain.v@turing.com,20,Done,1/29/2024,added a new turn and made the code pep8 compli...
3464,https://colab.research.google.com/drive/1xfwmc...,ruturaj.m@turing.com,12/21/2023,aman.s@turing.com,15,Done,01/30/2024,"added comments to code, pep8, fixed backticks"
3465,https://colab.research.google.com/drive/1mBEuv...,shaharyar.t@turing.com,1/4/2024,zain.v@turing.com,25,Done,1/29/2024,added more complex first turn and explanation....
3466,https://colab.research.google.com/drive/1NNZsK...,toh.y@turing.com,1/8/2024,zain.v@turing.com,7,Done,1/29/2024,


In [5]:
from src.llm_reviewer.notebook_parser import notebook_parser
from concurrent.futures import ThreadPoolExecutor, as_completed


notebooks = []
results = []

def process_task_link(task_link):
    notebook = get_colab_notebook(task_link, service_account_file)
    parsed_notebook = notebook_parser(notebook)
    parsed_messages = parsed_notebook["messages"]
    number_of_turns = get_number_of_turns(parsed_messages)
    return parsed_notebook, {
        "task_link": task_link,
        "n_messages": len(parsed_messages),
        "number_of_turns": number_of_turns,
    }

with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(process_task_link, task_link) for task_link in completed_to_be_delivered_df["task_link"].tolist()]
    for future in as_completed(futures):
        notebook, result = future.result()
        notebooks.append(notebook)
        results.append(result)


results_df = pd.DataFrame(results)
results_df

Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='User', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message

  validate(nb)


Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, me

Unnamed: 0,task_link,n_messages,number_of_turns
0,https://colab.research.google.com/drive/1Z3U6A...,4,2
1,https://colab.research.google.com/drive/1SFg4m...,12,3
2,https://colab.research.google.com/drive/1YsBFy...,15,3
3,https://colab.research.google.com/drive/1j-vwd...,31,6
4,https://colab.research.google.com/drive/1iF83Q...,2,1
...,...,...,...
2992,https://colab.research.google.com/drive/1DipnX...,2,1
2993,https://colab.research.google.com/drive/1NNZsK...,12,6
2994,https://colab.research.google.com/drive/1xfwmc...,3,1
2995,https://colab.research.google.com/drive/1lS1I4...,18,5


## GPT Review

In [7]:
import os
from typing import List
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

import tiktoken
from pydantic import BaseModel, Field
from llama_index.llms.openai import OpenAI
from llama_index import ServiceContext, set_global_service_context
from llama_index.program import OpenAIPydanticProgram
from llama_index.callbacks import CallbackManager, TokenCountingHandler

api_key = os.environ["OPENAI_API_KEY"]


token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-4-1106-preview").encode
)
callback_manager = CallbackManager([token_counter])

class Feedback(BaseModel):
    issues: List[str] = Field(description="A concrete list of issues in the conversation. 15 words or less each.")
    praises: List[str] = Field(description="A concrete list of a highlight praise for exceptional behavior in the conversation. 15 words or less. 1 or 0 items.")
    score: int = Field(description="A score representing how good the conversation is in the given quality aspect, 1 is terrible, 5 is exemplary and flawless.", ge=1, le=5)


class QualityAspect(BaseModel):
    name: str = Field(description="The name of the quality aspect.")
    instruction: str = Field(description="Instructions & details on how to inspect this quality aspect.")


quality_aspects = {
    "Overall": [
        QualityAspect(
            name="Completness",
            instruction="""
            How complete is the conversation? Completeness is defined as:
            - The assistant always responds to the user.
            - The conversation contains at least 1 back and forth between the user and the assistant.
            - The conversation flow makes sense and does not seem like it's out of order or contains gaps.

            JUDGE THE ENTIRE CONVERSATION AS A WHOLE.
            """
        ),
    ],
    "User": [
        QualityAspect(
            name="Natural & Realistic", 
            instruction="""
            How does the user interaction resemble a real conversation and interactions a real technical user would have with a highly intelligent coding assistant as part of his day to day workflow.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        )
    ],
    "Assistant": [
        QualityAspect(
            name="Code Quality", 
            instruction="""
            How good is the code that the assistant generates.
            Qualities:
            #   - Correctness
            #   - Optimality
            #   - PEP8 Compliance & Readability

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Text Quality", 
            instruction="""
            How good is the text that the assistant generates.
            Qualities:
            #   - Spelling
            #   - Grammar
            #   - Capitalization & Punctuation
            #   - Information Density (Should be a sweet spot leaning on the concise side, but not too concise... definitely not too verbose)
            #   - Explains Code Well

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Markdown Formatting", 
            instruction="""
            How good is the markdown formatting that the assistant generates. Is it leveraging markdown syntax tools to maximize the readability of the text?

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        )
    ]
}


def inspect_conversation_quality_aspect(conversation: List[List[dict]], quality_aspect: QualityAspect):
    """Inspect a conversation for a given quality aspect."""

    prompt_template_str = """
    IDENTITY:
    You are one of many specialized judges, so precisely focus on your quality aspect only.

    SITUATION:
    A large team is building a dataset of illustractions of dialogues showcasing the interaction between a user and a highly intelligent AI in the context of software development scenarios.
    - The user's replies should closely resemble authentic user engagement.
    - The AI's responses should aim to provide maximum benefit to the user.

    INSTRUCTIONS:
    Given the following conversation, please rate the quality of the conversation according to the given quality aspect.
    
    ALL QUALITY ASPECTS:
    {all_quality_aspects}

    YOUR QUALITY ASPECT:
    {quality_aspect}
    
    CONVERSATION:
    {conversation}
    """
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        callback_manager=callback_manager,
        output_cls=Feedback, 
        prompt_template_str=prompt_template_str, 
    )
    all_quality_aspects = "\n".join([f"- {key}: {quality_aspect.name}" for key in quality_aspects.keys() for quality_aspect in quality_aspects[key]])
    output = program( 
        all_quality_aspects=all_quality_aspects,
        quality_aspect=quality_aspect.model_dump(),
        conversation=conversation["messages"],
        description="Judge the quality of the conversation according to the given quality aspect. Provide constructive criticism, rarely praise."
    )
    return output


def inspect_all_conversation_quality_aspects(conversation) -> dict:
    """Inspect a conversation for all quality aspects."""

    quality_results = {}
    for key in quality_aspects.keys():
        for quality_aspect in quality_aspects[key]:
            r = inspect_conversation_quality_aspect(conversation, quality_aspect)
            quality_results[f"{key} - {quality_aspect.name}"] = r.model_dump()

    return quality_results

In [12]:
for r, n in zip(results, notebooks):
    r.update({"notebook": n})

In [13]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

reviewed_results = []

def process_notebook(result):
    if result is None or len(result["notebook"]["messages"]) == 0:
        return None
    result["quality_review"] = inspect_all_conversation_quality_aspects(result["notebook"])
    return result

with tqdm(total=len(results), desc="Processing notebooks") as pbar:
    with ThreadPoolExecutor(max_workers=15) as executor:
        futures = [executor.submit(process_notebook, result) for result in results]
        for future in as_completed(futures):
            r = future.result()
            if r is not None:
                reviewed_results.append(r)
            pbar.update(1)


Processing notebooks:  57%|█████▋    | 1695/2997 [2:24:10<1:50:44,  5.10s/it]


BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 522679 tokens (522519 in the messages, 160 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [14]:
import json 

with open("redos_1to5_parsed_reviewed.json", "w") as f:
    json.dump(reviewed_results, f, indent=4)

with open("redos_1to5_parsed_reviewed.json", "r") as f:
    reviewed_results = json.load(f)

In [16]:
len(reviewed_results)

1695

In [18]:
data_skeleton = []
for rr in reviewed_results:
    scores = []
    feedback = ""
    for key in rr["quality_review"].keys():
        scores.append(rr["quality_review"][key]["score"])
        stringified_issues = "\n".join([f"- {issue}" for issue in rr["quality_review"][key]["issues"]])
        feedback += f"{key}: \n{stringified_issues}\n\n"

    data_skeleton.append({
        "task_link": rr["task_link"],
        "avg_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "issues": feedback,
    })

df_gpt_reviews = pd.DataFrame(data_skeleton)
df_gpt_reviews

Unnamed: 0,task_link,avg_score,min_score,issues
0,https://colab.research.google.com/drive/17A_7WM5vPNF7yRaNChYQO-uFUdMJUbFf,3.2,3,Overall - Completness: \n- The conversation ends abruptly without confirming user understanding or satisfaction.\n- The assistant does not ask for further clarification or offer additional help.\n\nUser - Natural & Realistic: \n- User's question is too technical and specific for a general audience.\n- User's follow-up question lacks context on how to measure memory usage.\n\nAssistant - Code Quality: \n- The code provided is correct but lacks explanation on generator benefits.\n- No mention ...
1,https://colab.research.google.com/drive/1Z3U6AG3-61TGP5_A0xQVBGBkeRYGqZ9e,2.0,1,"Overall - Completness: \n- The conversation ends abruptly without a resolution or next steps.\n- The user's last message is not adequately addressed by the assistant.\n\nUser - Natural & Realistic: \n- User's message is overly technical and detailed for a casual conversation.\n- User's message lacks casual language, making it less natural.\n- User's message could be more concise for a real-world scenario.\n\nAssistant - Code Quality: \n- The assistant did not provide any code.\n- The respons..."
2,https://colab.research.google.com/drive/1Uht8yp4xoLJvqrgqbyva5Xk0EakOSFAx,2.2,1,"Overall - Completness: \n- The conversation ends abruptly without resolving the user's issue.\n- The assistant's response does not address the specific error message.\n\nUser - Natural & Realistic: \n- The user's message is too long and detailed for a casual conversation.\n- The user's message includes a full error trace, which is not typical in casual conversation.\n- The user's message lacks context about previous troubleshooting steps.\n\nAssistant - Code Quality: \n- Assistant did not pr..."
3,https://colab.research.google.com/drive/1R8reDgkmTqIiYLdR0_iRwhNgK-C2W_xZ,3.6,3,"Overall - Completness: \n- The conversation lacks a closing statement from the user or the assistant.\n\nUser - Natural & Realistic: \n- User's language is too formal for a casual conversation.\n- User's questions are overly specific, lacking context.\n\nAssistant - Code Quality: \n- The code snippets are basic and lack context-specific optimization.\n- No mention of advanced profiling tools or techniques for DevOps.\n- PEP8 compliance not explicitly addressed in the conversation.\n- No disc..."
4,https://colab.research.google.com/drive/13APEjopYQLDKaNbfgHxbexC1pgdmAlrk,2.8,2,Overall - Completness: \n- The conversation ends abruptly without a closing or next steps.\n- The AI does not confirm understanding of the user's code snippet.\n\nUser - Natural & Realistic: \n- User's question about AI for tic-tac-toe lacks context or specific requirements.\n\nAssistant - Code Quality: \n- The code does not use the existing 'check_winner' function.\n- The code does not handle the case where a winner exists.\n- The code does not follow PEP8 guidelines for function naming.\n-...
...,...,...,...,...
1690,https://colab.research.google.com/drive/1bfpC-fOrkhzYRBj9KQGi2SxlMZ-kmVqs,3.8,3,"Overall - Completness: \n- The conversation seems complete with no missing steps or gaps.\n\nUser - Natural & Realistic: \n- User's initial request could be more concise.\n- User's follow-up question is generic, lacks context.\n\nAssistant - Code Quality: \n- The code lacks comments explaining each step.\n- No error handling for network issues or user cancellation.\n- The code does not show how to handle the sign-in result securely.\n\nAssistant - Text Quality: \n- Some explanations may be t..."
1691,https://colab.research.google.com/drive/1rW3qFwNn6af5FPrcZZH5KyZP6yYx0pRL,3.0,3,Overall - Completness: \n- The conversation ends abruptly without a closing statement.\n- The user's last question is not fully addressed with examples or details.\n\nUser - Natural & Realistic: \n- User's second message lacks technical depth expected from a real user.\n- User's questions could be more specific and detailed.\n\nAssistant - Code Quality: \n- The code example is correct but lacks comments explaining the closure concept.\n- The code does not handle non-integer powers or negativ...
1692,https://colab.research.google.com/drive/1pZEdjXACplnkfmrly2Hjo5gYNJSpCSng,3.0,2,Overall - Completness: \n- The conversation lacks a closing statement from the user acknowledging the explanation.\n- The conversation ends abruptly without confirming user understanding or satisfaction.\n\nUser - Natural & Realistic: \n- User's request for elaboration could be more specific.\n\nAssistant - Code Quality: \n- The code does not handle the case where n or k is not an integer.\n- No explanation of time complexity is provided.\n- The code lacks comments explaining the logic withi...
1693,https://colab.research.google.com/drive/1P9NxK816SM_kzc6_uhNdlaQY3mp3GxRp,3.0,2,"Overall - Completness: \n- The conversation does not contain any user feedback or confirmation of understanding.\n- The conversation ends abruptly without a closing or check for further questions.\n\nUser - Natural & Realistic: \n- User's initial question is clear but lacks context or specifics.\n- User's follow-up questions are too brief and lack detail.\n- User's responses are too structured, lacking conversational flow.\n\nAssistant - Code Quality: \n- The patch decorator should use a str..."


In [19]:
pd.set_option('display.max_colwidth', 500)


borderline_avg_flags = df_gpt_reviews.sort_values(by="avg_score", ascending=False)[df_gpt_reviews["avg_score"] < 4]
critical_mistake_flags = df_gpt_reviews.sort_values(by="min_score", ascending=False)[df_gpt_reviews["min_score"] < 3]

all_flags = pd.concat([borderline_avg_flags, critical_mistake_flags], ignore_index=True)
all_flags = all_flags.drop_duplicates(subset=["task_link"])

all_flags.sort_values(by="min_score", ascending=False)

  borderline_avg_flags = df_gpt_reviews.sort_values(by="avg_score", ascending=False)[df_gpt_reviews["avg_score"] < 4]
  critical_mistake_flags = df_gpt_reviews.sort_values(by="min_score", ascending=False)[df_gpt_reviews["min_score"] < 3]


Unnamed: 0,task_link,avg_score,min_score,issues
0,https://colab.research.google.com/drive/1g0rdBIbvX6Tgfk46_NohBbiZvalpyVY6,3.8,3,"Overall - Completness: \n- The conversation is complete and follows a logical order.\n\nUser - Natural & Realistic: \n- User's questions are too technical and specific for a general user.\n- User's questions lack context or explanation of their scenario.\n- User's questions are consistently well-formed, which is uncommon in casual conversation.\n\nAssistant - Code Quality: \n- The code lacks comments explaining the logic.\n- The code examples are missing import statements.\n- The code does n..."
638,https://colab.research.google.com/drive/10oKku567sJy5iRSR_eqnU2FxsID6nKSQ,3.2,3,"Overall - Completness: \n- The conversation ends abruptly without a closing or confirmation from the user.\n\nUser - Natural & Realistic: \n- User's questions are too structured and lack informal language.\n- User's follow-up questions are too immediate, lacking natural pauses.\n- User does not make any typos or errors, which is unrealistic.\n\nAssistant - Code Quality: \n- Code snippets are repeated unnecessarily in multiple responses.\n- The lambda function in the last code snippet should ..."
526,https://colab.research.google.com/drive/10G-Hx2XI60GxGVAfRsBKnLiPVZPkc-Jd,3.4,3,Overall - Completness: \n- The conversation lacks an initial user query to start the interaction.\n- The Assistant's first message is out of context without a user prompt.\n\nUser - Natural & Realistic: \n- User's questions are clear and relevant to the context.\n- User's engagement is consistent with a real-world scenario.\n- User's progression of questions shows a logical flow.\n\nAssistant - Code Quality: \n- The code uses single quotes inside a dictionary key which can cause a syntax err...
528,https://colab.research.google.com/drive/1p257boPIhj115UkZiqTEF896R9VaA4VJ,3.4,3,Overall - Completness: \n- The Assistant's second response is missing a real example as requested by the User.\n\nUser - Natural & Realistic: \n- User's request for a 'real example' could be more specific.\n- User's question about file separators could be more technically precise.\n- User's follow-up question on static config could be more detailed.\n\nAssistant - Code Quality: \n- The code example is correct but lacks comments explaining the code.\n- The Assistant should mention the importa...
529,https://colab.research.google.com/drive/1gUgnrNw04T8qmYwWBO_pGCNDa-Nsp44r,3.4,3,Overall - Completness: \n- The conversation lacks user confirmation or follow-up after the final assistant message.\n\nUser - Natural & Realistic: \n- User's request to change learning rate and optimizer is too specific for a casual user.\n- User's knowledge of saving and loading model weights seems advanced for a general user.\n\nAssistant - Code Quality: \n- The code provided is correct and functional.\n- Optimality is not fully assessed without context.\n- PEP8 compliance and readability ...
...,...,...,...,...
1526,https://colab.research.google.com/drive/1lQ2P9yIKk_ykKbW6bEHKe29IyKFarbmX,2.4,1,Overall - Completness: \n- The conversation lacks a clear conclusion or next steps.\n- The conversation ends abruptly without a user response to the last message.\n\nUser - Natural & Realistic: \n- User's initial request is too formal and lacks casual language.\n- User's technical response is overly detailed for a casual conversation.\n- User's code sharing lacks context or lead-in conversation.\n\nAssistant - Code Quality: \n- The assistant did not provide any code.\n- The assistant's feedb...
1393,https://colab.research.google.com/drive/1_Kd1nOJCPSNiEJBLfddd5N9k-0BdAxDp,2.8,1,Overall - Completness: \n- The conversation ends abruptly without a user response to the Assistant's suggestions.\n\nUser - Natural & Realistic: \n- User's initial message could include a code snippet for context.\n- User's description of the issue is somewhat technical but lacks clarity.\n- User's follow-up with code and traceback is helpful but delayed.\n\nAssistant - Code Quality: \n- The Assistant's code does not address the user's issue.\n- The Assistant's code is identical to the user'...
1480,https://colab.research.google.com/drive/18OoTGhJ7GilsEqQMsm_O8nBh11T26ctE,2.6,1,"Overall - Completness: \n- The Assistant's initial message incorrectly assumes the role of the User.\n\nUser - Natural & Realistic: \n- User's initial greeting 'Hey buddy!' is too informal for a professional setting.\n- User's challenge 'do you believe you can actually achieve this in a one-liner' seems condescending.\n- User's language is casual, may not reflect a typical professional technical user's tone.\n\nAssistant - Code Quality: \n- Assistant did not provide any code.\n- Assistant's ..."
1523,https://colab.research.google.com/drive/17qaUhO3VmOkVJiRx_JSEDspVsqRZyCne,2.4,1,"Overall - Completness: \n- The Assistant's initial message incorrectly assumes the role of the User.\n\nUser - Natural & Realistic: \n- User's initial response is too formal and lacks conversational tone.\n- User's explanation is overly technical for casual conversation.\n- User's code example is correct but lacks any personal touch or context.\n\nAssistant - Code Quality: \n- The assistant did not generate code, only explained the user's code.\n- No code quality can be judged as no new code..."


In [None]:
all_flags = all_flags.merge(completed_to_be_delivered_df, on="task_link", how="left")[["task_link", "avg_score", "min_score", "issues", "assigned_to_email"]]
all_flags = all_flags.rename(columns={"assigned_to_email": "original_author_email"})
all_flags.to_csv("batch_5_flags.csv", index=False)

## Fetch & Integrate Advanced Metadata

In [20]:
import sys 
sys.path.append('../../')
import pandas as pd
from src.sheets_utils import download_sheet_as_df


service_account_file = '../../creds/google__sa.json'
insights_spreadsheet_id = "1wUWll720oz6Rnc4YKHeWFWbZbTMWbda2rfVh8jPLU2g"

included_sheets = [
    "Expanded Enriched Data",
    "Behavioural Tags",
    "Use Case Tags",
    "Programming Language Tags",
    "Dependency Tags"
]

df_metadata = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Expanded Enriched Data")

df_metadata__behavioral = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Behavioural Tags")

df_metadata__use_case = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Use Case Tags")

df_metadata__programming_language = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Programming Language Tags")

df_metadata__dependency = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Dependency Tags")

In [24]:
rich_task_metadata = []

for task_link in df_metadata["task_link"].tolist():
    row_main = df_metadata.loc[df_metadata["task_link"] == task_link].to_dict(orient="records")[0]

    behavioral_rows = df_metadata__behavioral.loc[df_metadata__behavioral["task_link"] == task_link].to_dict(orient="records")
    row_main["behavioral_tags"] = [{key: row[key] for key in row.keys() if key != "task_link"} for row in behavioral_rows]


    programming_language_rows = df_metadata__programming_language.loc[df_metadata__programming_language["task_link"] == task_link].to_dict(orient="records")
    row_main["programming_language_tags"] = [{key: row[key] for key in row.keys() if key != "task_link"} for row in programming_language_rows]

    dependency_rows = df_metadata__dependency.loc[df_metadata__dependency["task_link"] == task_link].to_dict(orient="records")
    row_main["dependency_tags"] = [{key: row[key] for key in row.keys() if key != "task_link"} for row in dependency_rows]    

    # Edit the original metadata
    row_main["topic_classication"] = row_main.pop("metadata__topic")

    row_main["area_of_focus_classification"] = {
        "top_level": row_main.pop("area_of_focus__top_level"),
        "sub_level": row_main.pop("area_of_focus__sub_level"),
        # "detailed_level": row_main.pop("area_of_focus__detailed_level"),
    }
    row_main["domain_classification"] = {
        "top_level": row_main.pop("domain__top_level"),
        "sub_level": row_main.pop("domain__sub_level"),
        # "detailed_level": row_main.pop("domain__detailed_level"),
    }
    rich_task_metadata.append(row_main)

In [25]:
valid_notebooks = []
for r, n in zip(results, notebooks):
    if r is None or r["n_messages"] == 0:
        continue
    n["task_link"] = r["task_link"]
    valid_notebooks.append(n)


parsed_jsons = []
for vn in valid_notebooks:
    for rtm in rich_task_metadata:
        if vn["task_link"] == rtm["task_link"]:
            vn["metadata"] = rtm
            parsed_jsons.append(vn)


for pj in parsed_jsons:
    pj["id"] = pj.pop("task_link").split("/")[-1]
    try:
        pj["metadata"].pop("duration_mins")
        pj["metadata"].pop("batch_id")
    except KeyError:
        pass


for i, conversation in enumerate(parsed_jsons):
    drive_id = conversation["id"] 
    with open(f"json_conversations/{BATCH_NAME}/{drive_id}.json", "w") as f:
        f.write(json.dumps(conversation))

## Upload JSONL

In [26]:
from src.gdrive_api import build_service
from src.gdrive_api.folder_upload import upload_folder

service = build_service(service_account_file)
uploaded_files = upload_folder(service, f'json_conversations/{BATCH_NAME}/', destination_folder_url, force_replace = True, is_url=True)
uploaded_files

------------------------------------------------------------
Processing directory .: 1 of 0 in total.
Uploading file 1 of 2995 in '.', 1 of 2995 in total.
Uploading new file '12EkLbqhOebbUraSDmW-xnEtcSlcdDRFP.json'.
File '12EkLbqhOebbUraSDmW-xnEtcSlcdDRFP.json' has been uploaded.
Uploaded '12EkLbqhOebbUraSDmW-xnEtcSlcdDRFP.json' to folder ID '1xkWC7VYe0NthWxdxzZ8aVQI7s4QiWP6t'.
12EkLbqhOebbUraSDmW-xnEtcSlcdDRFP.json
Uploading file 2 of 2995 in '.', 2 of 2995 in total.
Uploading new file '18ETs1OvBXFDQt12yoMMtRGKjh0r1nGgF.json'.
File '18ETs1OvBXFDQt12yoMMtRGKjh0r1nGgF.json' has been uploaded.
Uploaded '18ETs1OvBXFDQt12yoMMtRGKjh0r1nGgF.json' to folder ID '1xkWC7VYe0NthWxdxzZ8aVQI7s4QiWP6t'.
18ETs1OvBXFDQt12yoMMtRGKjh0r1nGgF.json
Uploading file 3 of 2995 in '.', 3 of 2995 in total.
Uploading new file '1cx0m_zX4ZiDyoRUYIscfg26xiy4gZboE.json'.
File '1cx0m_zX4ZiDyoRUYIscfg26xiy4gZboE.json' has been uploaded.
Uploaded '1cx0m_zX4ZiDyoRUYIscfg26xiy4gZboE.json' to folder ID '1xkWC7VYe0NthWxdxzZ

{'12EkLbqhOebbUraSDmW-xnEtcSlcdDRFP.json': 'https://drive.google.com/uc?id=1LaCg8DS1U7PfVDnRnyEczxECWJDv-Ki1',
 '18ETs1OvBXFDQt12yoMMtRGKjh0r1nGgF.json': 'https://drive.google.com/uc?id=1nZReO0nJRokaLSHgN1syRrzS8pnIquNt',
 '1cx0m_zX4ZiDyoRUYIscfg26xiy4gZboE.json': 'https://drive.google.com/uc?id=10A43LcYHIVMMZdlZUuIhYHTi9h22i3Gp',
 '1C6aodI2G8_fP3HUXjSPivrdLPl0Rc17s.json': 'https://drive.google.com/uc?id=1bU0n1p4l9u5RupzFzOKyu5umdx5lB2i0',
 '1WUFylpi4ykfVh6s9oOAUED43SUPmKKGk.json': 'https://drive.google.com/uc?id=1OPo66fP5SzoeKyiEfGhKZDXLG_W1hHNG',
 '1_HLk3ZzRK97Qmxg6_X-XUa8MJtZfhNa2.json': 'https://drive.google.com/uc?id=1wckmkpsXrpTLnXmyCU50OKbJa53ZEifG',
 '14Trb9FuTOc0qAzJSjDUNK9kSCDmzoC4P.json': 'https://drive.google.com/uc?id=1zxliQerlVWbcvArn8eCRKESfqs6MMiQ7',
 '1kXNPgDueZBr-4Uv-2MckDHCnuwUpmxxK.json': 'https://drive.google.com/uc?id=1Vy0mjkbS59fbgu1nSWKb34nyi5fVwbz1',
 '1X-6J-O_WJhbDeTTq-Rbf8E7gGMfcDHNI.json': 'https://drive.google.com/uc?id=15rUBbxWxI3riPoN_Baeue8YJo1vE0VSW',
 

In [27]:
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Function to check if a file is a folder
def is_folder(file):
    return file.get('mimeType') == 'application/vnd.google-apps.folder'

# Function to process files and folders
def process_files(service, folder_id, parent_folders=[]):
    query = f"'{folder_id}' in parents and trashed = false"
    page_token = None

    all_files = []
    while True:
        response = service.files().list(q=query,
                                        spaces='drive',
                                        fields='nextPageToken, files(id, name, mimeType, webViewLink)',
                                        pageToken=page_token).execute()

        for file in response.get('files', []):
            # Skip 'tool_data' folder
            if file.get('name') == 'tool_data' and is_folder(file):
                continue

            all_files.append(file)

            # Process the file or folder
            print('Processing:', '/'.join(parent_folders + [file.get('name')]))

            # If it's a folder, recursively process its contents
            if is_folder(file):
                children_files = process_files(service, file.get('id'), parent_folders + [file.get('name')])
                all_files.extend(children_files)

        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break

    return all_files

# Authenticate and create the service
SERVICE_ACCOUNT_FILE = service_account_file
SCOPES = ['https://www.googleapis.com/auth/drive']
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('drive', 'v3', credentials=credentials)

# Replace with your Google Drive folder ID
folder_id = destination_folder_url.split("/")[-1]

# Start processing from the specified folder
all_files = process_files(service, folder_id)

jsonl_df = pd.DataFrame(all_files)
jsonl_df = jsonl_df[["id", "name", "webViewLink"]]
jsonl_df

Processing: 1gqcai6cQIRqom4lEh2OGfnlbQG-Olu_5.json
Processing: 1nPu1EZOCK7cfdAxo5WOyucIg7hh-ilyh.json
Processing: 1bwQicLSppcW1V_Hn0puY0QTTSDHJFxge.json
Processing: 1hE3B5r50_Gvyp-AivlISvvtl2Wk2Oc5x.json
Processing: 1WxS6udc_AfScoFkUvbjViV358S1iNxLP.json
Processing: 14mb92qOKaF6rdGhUds5QY9x6MN4SbnKH.json
Processing: 1uA3D1luYvAclXPMawVnJlRg9iElP08sK.json
Processing: 1m2wMEyo8zMgStvkZ4zUSrjmXdemR9vfq.json
Processing: 18OBn8VwiBxiVAidtkwJskkt0CE2LlO5I.json
Processing: 1n4u7Uk0ix9bUx16-gC3RpABRfeb2POQh.json
Processing: 1CXoH2A8oGldi5N0T7OGfbfKIksRCYjEb.json
Processing: 1aiZ-txmmThDVYYXEAl7TuqqesKeogR7Q.json
Processing: 1YqG98aPCiQFU1COAdDz4u0H-Iq16IRPl.json
Processing: 1imCRydxS9zHZKiJxETiTu9_3BBey7e5E.json
Processing: 1HMUzZYEcP2A5VhmqNRYtn41h-6WbMdcr.json
Processing: 1Mndxp2Y9mxHJz1cj2nS397DALQOv97nb.json
Processing: 1J4ftUr1iSgQOXjaHehIsp9Sxs-0tF9qG.json
Processing: 13sFngq6I2_80c6qCFdRbCOKrYTv-DqHB.json
Processing: 1KBcPnb2akVwgT_D0PNJXlwKQxNkqEcMB.json
Processing: 1V1xnMuhDo8Wa7RL_EW

Unnamed: 0,id,name,webViewLink
0,1bAGi6c369HQkX7U3Zm3PLk6_QdYi_qIn,1gqcai6cQIRqom4lEh2OGfnlbQG-Olu_5.json,https://drive.google.com/file/d/1bAGi6c369HQkX7U3Zm3PLk6_QdYi_qIn/view?usp=drivesdk
1,1aFIp1bE2JzGKN7fiISn1Zb6IiqAHty1d,1nPu1EZOCK7cfdAxo5WOyucIg7hh-ilyh.json,https://drive.google.com/file/d/1aFIp1bE2JzGKN7fiISn1Zb6IiqAHty1d/view?usp=drivesdk
2,1cDqgnN4AGvXCLQh5smgwzVfe1tyBHJgG,1bwQicLSppcW1V_Hn0puY0QTTSDHJFxge.json,https://drive.google.com/file/d/1cDqgnN4AGvXCLQh5smgwzVfe1tyBHJgG/view?usp=drivesdk
3,1bSVr8nSnRHmeGuAdgJfiVeHa4F5UvTiS,1hE3B5r50_Gvyp-AivlISvvtl2Wk2Oc5x.json,https://drive.google.com/file/d/1bSVr8nSnRHmeGuAdgJfiVeHa4F5UvTiS/view?usp=drivesdk
4,1AqAJCoHt4HnlbDeOAgomgfsAH_2kyvk9,1WxS6udc_AfScoFkUvbjViV358S1iNxLP.json,https://drive.google.com/file/d/1AqAJCoHt4HnlbDeOAgomgfsAH_2kyvk9/view?usp=drivesdk
...,...,...,...
2978,1OPo66fP5SzoeKyiEfGhKZDXLG_W1hHNG,1WUFylpi4ykfVh6s9oOAUED43SUPmKKGk.json,https://drive.google.com/file/d/1OPo66fP5SzoeKyiEfGhKZDXLG_W1hHNG/view?usp=drivesdk
2979,1bU0n1p4l9u5RupzFzOKyu5umdx5lB2i0,1C6aodI2G8_fP3HUXjSPivrdLPl0Rc17s.json,https://drive.google.com/file/d/1bU0n1p4l9u5RupzFzOKyu5umdx5lB2i0/view?usp=drivesdk
2980,10A43LcYHIVMMZdlZUuIhYHTi9h22i3Gp,1cx0m_zX4ZiDyoRUYIscfg26xiy4gZboE.json,https://drive.google.com/file/d/10A43LcYHIVMMZdlZUuIhYHTi9h22i3Gp/view?usp=drivesdk
2981,1nZReO0nJRokaLSHgN1syRrzS8pnIquNt,18ETs1OvBXFDQt12yoMMtRGKjh0r1nGgF.json,https://drive.google.com/file/d/1nZReO0nJRokaLSHgN1syRrzS8pnIquNt/view?usp=drivesdk


In [28]:
conversation.keys()

parsed_jsons_ref = [
    {
        "colab_id": pj["id"],
        "task_link": pj["metadata"]["task_link"],
        "number_of_turns": pj["metadata"]["number_of_turns"],
    }
    for pj
    in parsed_jsons
]
conversation_df = pd.DataFrame(parsed_jsons_ref)
conversation_df

Unnamed: 0,colab_id,task_link,number_of_turns
0,1Z3U6AG3-61TGP5_A0xQVBGBkeRYGqZ9e,https://colab.research.google.com/drive/1Z3U6AG3-61TGP5_A0xQVBGBkeRYGqZ9e,2
1,1SFg4m32lEf0exZQBBTWKhwkxI7ca5tA3,https://colab.research.google.com/drive/1SFg4m32lEf0exZQBBTWKhwkxI7ca5tA3,3
2,1YsBFyzBzbREpp25amkSzN7YpdeY-Cchm,https://colab.research.google.com/drive/1YsBFyzBzbREpp25amkSzN7YpdeY-Cchm,3
3,1j-vwdQhEdZr-quj_2A3-UneHptLxvM4B,https://colab.research.google.com/drive/1j-vwdQhEdZr-quj_2A3-UneHptLxvM4B,6
4,1iF83QpKtf_JIBNNoKXnSKjI2wjZ26HES,https://colab.research.google.com/drive/1iF83QpKtf_JIBNNoKXnSKjI2wjZ26HES,1
...,...,...,...
2990,1mBEuvwcMqWABDMX_DzN3OUJD2yf0XME9,https://colab.research.google.com/drive/1mBEuvwcMqWABDMX_DzN3OUJD2yf0XME9,3
2991,1DipnXmBpceyEgHZ_aDzlowxy3zQO21Q0,https://colab.research.google.com/drive/1DipnXmBpceyEgHZ_aDzlowxy3zQO21Q0,1
2992,1NNZsKjc4-uBWN_OExf-F4mGADktzPCZc,https://colab.research.google.com/drive/1NNZsKjc4-uBWN_OExf-F4mGADktzPCZc,1
2993,1xfwmcKq6KHzXlYr3p7xBN1ehgWCiiRkF,https://colab.research.google.com/drive/1xfwmcKq6KHzXlYr3p7xBN1ehgWCiiRkF,1


In [29]:
jsonl_df["colab_id"] = jsonl_df["name"].apply(lambda x: x.split(".")[0])


df_merged = conversation_df.merge(jsonl_df, on="colab_id", how="inner")
df_merged = df_merged[["task_link", "number_of_turns", "webViewLink"]]
df_merged = df_merged.rename(columns={"webViewLink": "jsonl_link"})
df_merged

Unnamed: 0,task_link,number_of_turns,jsonl_link
0,https://colab.research.google.com/drive/1Z3U6AG3-61TGP5_A0xQVBGBkeRYGqZ9e,2,https://drive.google.com/file/d/1Yq2rt9XX5sfQVGjEWPPxcIf5Pa3whCop/view?usp=drivesdk
1,https://colab.research.google.com/drive/1SFg4m32lEf0exZQBBTWKhwkxI7ca5tA3,3,https://drive.google.com/file/d/1THhADY45DwMSNUmjWn3HJycLhN2aO6cC/view?usp=drivesdk
2,https://colab.research.google.com/drive/1YsBFyzBzbREpp25amkSzN7YpdeY-Cchm,3,https://drive.google.com/file/d/1Qdzw_pcNlUFoMq6w-J-V78yJT8VwE5Yc/view?usp=drivesdk
3,https://colab.research.google.com/drive/1j-vwdQhEdZr-quj_2A3-UneHptLxvM4B,6,https://drive.google.com/file/d/1G4hFObqJQFUbwkezChxCAl6yirht0u6-/view?usp=drivesdk
4,https://colab.research.google.com/drive/1iF83QpKtf_JIBNNoKXnSKjI2wjZ26HES,1,https://drive.google.com/file/d/1s0KIPmCY0dlh5FIBMP_6k3DKWL18N3s5/view?usp=drivesdk
...,...,...,...
2978,https://colab.research.google.com/drive/1mBEuvwcMqWABDMX_DzN3OUJD2yf0XME9,3,https://drive.google.com/file/d/1kfa5FD8W3Q_l86rlsBPfsVr9BnfHoRly/view?usp=drivesdk
2979,https://colab.research.google.com/drive/1DipnXmBpceyEgHZ_aDzlowxy3zQO21Q0,1,https://drive.google.com/file/d/112-opJ16Ouf8LucHrkiFsVSIYXCs3HEW/view?usp=drivesdk
2980,https://colab.research.google.com/drive/1NNZsKjc4-uBWN_OExf-F4mGADktzPCZc,1,https://drive.google.com/file/d/1gsqAeCMqeI_f-mGVtBFCDX_9TxNWWEMr/view?usp=drivesdk
2981,https://colab.research.google.com/drive/1xfwmcKq6KHzXlYr3p7xBN1ehgWCiiRkF,1,https://drive.google.com/file/d/1SZYN-CbkXej2QjZR-8J8OjGrWSs0NDog/view?usp=drivesdk


## Upload Batch Sheet

In [36]:
from src.sheets_utils import upload_df_to_sheet

cols = ["task_link", "jsonl_link", "number_of_turns"]

upload_df_to_sheet(service_account_file, delivery_sheet_id, BATCH_NAME, df_merged[cols].merge(df[["task_link", "corrections"]], on="task_link", how="inner"))