## Settings and pointers

In [57]:
# Path to the Google service-account key file, relative to this notebook.
service_account_file = '../../creds/google__sa.json'

# Progress-tracking spreadsheet; only the tabs listed in `included_sheet_names` are downloaded.
tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"
included_sheet_names = [
    "Conversations_Batch_7",
]

# Drive folder IDs. NOTE(review): not referenced by any of the cells reviewed here — confirm it is still needed.
jupyter_gdrive_folder_ids = [
    "1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9", # V0
    "1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb", # V1
    "1jV7WA5zB172DJUp7Z2XzHr62E6U6_NtY",
]

# Delivery targets: the sheet listing already-delivered tasks and the Drive folder receiving the exported JSON files.
delivery_sheet_id = "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4"
delivery_jsonl_gdrive_folder_id = "1Iu-cBZ-kXzRYVO84oifrtVC2putauvt7"
destination_folder_url = f"https://drive.google.com/drive/folders/{delivery_jsonl_gdrive_folder_id}"
# Used for the local export directory name and for the `batch_id` metadata column.
# NOTE(review): the tracking tab above is "Conversations_Batch_7" while this says "Batch 6" — confirm the numbering is intended.
DELIVERY_BATCH_NAME = "Batch 6"

# Spreadsheet and tab that receive the flattened per-task metadata table.
insights_sheet_id = "1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w"
INSIGHTS_VERSION_TAB = "v1 (Jan 25)"

## Source Code


In [9]:
import sys 
sys.path.append('../../')
import io
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import nbformat
import pandas as pd
from tqdm import tqdm
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload



def get_number_of_turns(messages):
    """Return how many entries of ``messages`` have ``role`` equal to ``"User"``."""
    return sum(1 for message in messages if message["role"] == "User")


def standardize_date_format(date):
    """
    Normalize a date string to the YYYY/MM/DD layout.

    The input is tried as YYYY/MM/DD first and as MM/DD/YYYY second. When
    neither layout matches, the literal string "Invalid date format" is
    returned instead of raising.
    """
    for known_layout in ("%Y/%m/%d", "%m/%d/%Y"):
        try:
            parsed = datetime.strptime(date, known_layout)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
        return parsed.strftime("%Y/%m/%d")

    return "Invalid date format"
###################################


#########################
    # Colab #
#########################


def update_colab_notebook(colab_link, local_nb_path, sa_creds_path):
    """
    Update a Google Colab notebook file in Google Drive.

    :param colab_link: The link to the Colab notebook in Google Drive. Anything
        after the file ID (extra path segments, ``?query`` or ``#fragment``
        such as ``#scrollTo=...``) is ignored.
    :param local_nb_path: The local path of the notebook file to upload.
    :param sa_creds_path: The path to the service account credentials.
    :return: A status message: ``"Updated file ID: ..."`` on success or
        ``"Error updating file: ..."`` when the Drive call fails.
    :raises ValueError: If no file ID can be extracted from ``colab_link``.
    """
    # Extract file ID from the Colab link: take what follows '/drive/' and cut
    # it at the first path, query or fragment separator.
    _, marker, file_id = colab_link.partition('/drive/')
    for separator in ('/', '?', '#'):
        file_id = file_id.split(separator, 1)[0]
    if not marker or not file_id:
        raise ValueError("Invalid Colab link format")

    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(sa_creds_path, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Specify the file to upload
    media = MediaFileUpload(local_nb_path, resumable=True)

    # Update the file; failures are reported through the returned message
    # rather than raised, which callers may rely on.
    try:
        updated_file = service.files().update(fileId=file_id, media_body=media).execute()
        return f"Updated file ID: {updated_file.get('id')}"
    except Exception as e:
        return f"Error updating file: {e}"


def get_colab_notebook(colab_link, sa_creds_path) -> nbformat.NotebookNode:
    """
    Download a Colab notebook from Google Drive and parse it with nbformat.

    :param colab_link: The link to the Colab notebook. Anything after the file
        ID (extra path segments, ``?query`` or ``#fragment`` such as
        ``#scrollTo=...``) is ignored.
    :param sa_creds_path: The path to the service account credentials.
    :return: The notebook read as nbformat version 4, or ``None`` when the
        download or parsing fails (the error is printed).
    :raises ValueError: If no file ID can be extracted from ``colab_link``.
    """
    # Extract file ID from the Colab link: take what follows '/drive/' and cut
    # it at the first path, query or fragment separator.
    _, marker, file_id = colab_link.partition('/drive/')
    for separator in ('/', '?', '#'):
        file_id = file_id.split(separator, 1)[0]
    if not marker or not file_id:
        raise ValueError("Invalid Colab link format")

    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(sa_creds_path, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Download the file into memory, chunk by chunk
    try:
        request = service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            _, done = downloader.next_chunk()

        # Load as nbformat notebook
        return nbformat.reads(fh.getvalue().decode(), as_version=4)
    except Exception as e:
        # Best-effort: report and return None so the caller decides what to do.
        print(f"Error downloading file: {e}")
        return None


def get_file_name_from_colab_link(colab_link, service_account_file):
    """
    Look up the Drive file name of a Colab notebook.

    :param colab_link: The link to the Colab notebook. Anything after the file
        ID (extra path segments, ``?query`` or ``#fragment``) is ignored, the
        same way the other Colab helpers in this notebook treat links.
    :param service_account_file: The path to the service account credentials.
    :return: The file name, or ``None`` when the link has no file ID or the
        Drive lookup fails.
    """
    # Take what follows '/drive/' and cut it at the first separator; the raw
    # remainder of the URL is not a valid file ID when it carries a suffix.
    _, marker, file_id = colab_link.partition('/drive/')
    for separator in ('/', '?', '#'):
        file_id = file_id.split(separator, 1)[0]
    if not marker or not file_id:
        return None

    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    try:
        file = service.files().get(fileId=file_id).execute()
        return file.get('name')
    except Exception:
        # Best-effort lookup: any API failure is reported as "no name".
        return None


def fetch_file_names_parallel(links, service_account_file, max_workers=100):
    """
    Resolve the Drive file name of every link concurrently.

    Returns a dict mapping each link to its file name; a link whose lookup
    raised is mapped to ``None``.
    """
    names_by_link = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        link_by_future = {}
        for link in links:
            pending = pool.submit(get_file_name_from_colab_link, link, service_account_file)
            link_by_future[pending] = link

        finished_futures = tqdm(
            as_completed(link_by_future),
            total=len(link_by_future),
            desc="Fetching File Names",
        )
        for finished in finished_futures:
            source_link = link_by_future[finished]
            try:
                names_by_link[source_link] = finished.result()
            except Exception:
                names_by_link[source_link] = None
    return names_by_link

## Read Remote Sheet

In [10]:
from src.sheets_utils import download_sheet_as_df


# Download every tracked tab, echoing its name and shape as a quick sanity check.
progress_batches = []
for sheet_name in included_sheet_names:
    print(sheet_name)
    bdf = download_sheet_as_df(service_account_file, tracking_sheet_id, sheet_name)
    progress_batches.append(bdf)
    print(bdf.shape)

# Keep one row per task among those marked "Done".
df = pd.concat(progress_batches, ignore_index=True)
completed_df = df[df["completion_status"] == "Done"].drop_duplicates(subset=["task_link"])

# Tasks already shipped in delivery tabs "Batch 1" .. "Batch 5".
delivered = pd.concat(
    [
        download_sheet_as_df(service_account_file, delivery_sheet_id, f"Batch {batch_number}")
        for batch_number in range(1, 6)
    ],
    ignore_index=True,
)


# Completed tasks that do not appear in any delivered batch yet.
already_delivered = completed_df["task_link"].isin(delivered["task_link"])
completed_to_be_delivered_df = completed_df[~already_delivered]
completed_to_be_delivered_df

Conversations_Batch_7
(2999, 12)


Unnamed: 0,task_link,metadata__topic,assigned_to_email,completion_status,modified_question?,duration_mins,completion_date,comments,metadata__type,metadata__target_length,review_status,reviewer_email
0,https://colab.research.google.com/drive/1eG_ms...,machine learning - write end to end ML trainin...,aarunik.g@turing.com,Done,FALSE,80,2/1/2024,,,,,
1,https://colab.research.google.com/drive/11QTHz...,interview prep - user asks AI to help take an ...,aarunik.g@turing.com,Done,FALSE,95,2/1/2024,,,,,
2,https://colab.research.google.com/drive/1DnJMt...,interview prep - user asks AI to help take an ...,aarunik.g@turing.com,Done,FALSE,90,2/1/2024,,,,,
3,https://colab.research.google.com/drive/1Pcxi2...,problem solving - write code in python.ipynb,aarunik.g@turing.com,Done,FALSE,80,2/1/2024,,,,Reviewed,safi.u@turing.com
4,https://colab.research.google.com/drive/126DOV...,problem solving - write code in python.ipynb,aarunik.g@turing.com,Done,FALSE,90,2/1/2024,,,,Reviewed,safi.u@turing.com
...,...,...,...,...,...,...,...,...,...,...,...,...
688,https://colab.research.google.com/drive/1Z2mCy...,python basics & scripting - write code in pyth...,adnan.k@turing.com,Done,FALSE,55,2/7/2024,,,,,
689,https://colab.research.google.com/drive/1OYfMg...,python basics & scripting - write code in pyth...,caram.v@turing.com,Done,FALSE,45,2/6/2024,,,,,
700,https://colab.research.google.com/drive/1ME8M-...,web development - write code in python.ipynb,hasanuddin.h@turing.com,Done,FALSE,50,6/2/2024,,,,,
706,https://colab.research.google.com/drive/1s7tfw...,testing - write unit tests.ipynb,archit.k@turing.com,Done,FALSE,60,6/2/2024,,,,,


In [11]:
from src.llm_reviewer.notebook_parser import notebook_parser
from concurrent.futures import ThreadPoolExecutor, as_completed


# Filled by the parallel loop that follows; the two lists are appended in lockstep.
notebooks = []
results = []

def process_task_link(task_link):
    """Download and parse one notebook; return ``(parsed_notebook, stats_dict)``."""
    parsed_notebook = notebook_parser(get_colab_notebook(task_link, service_account_file))
    messages = parsed_notebook["messages"]
    stats = {
        "task_link": task_link,
        "n_messages": len(messages),
        "number_of_turns": get_number_of_turns(messages),
    }
    return parsed_notebook, stats

# Fetch and parse all pending notebooks concurrently; entries land in completion order.
pending_task_links = completed_to_be_delivered_df["task_link"].tolist()
with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(process_task_link, pending_link) for pending_link in pending_task_links]
    for future in as_completed(futures):
        parsed, stats = future.result()
        notebooks.append(parsed)
        results.append(stats)


results_df = pd.DataFrame(results)
results_df

Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='User', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message

Unnamed: 0,task_link,n_messages,number_of_turns
0,https://colab.research.google.com/drive/11QTHz...,16,9
1,https://colab.research.google.com/drive/1eG_ms...,10,4
2,https://colab.research.google.com/drive/126DOV...,14,5
3,https://colab.research.google.com/drive/1v53wy...,10,5
4,https://colab.research.google.com/drive/19w-98...,14,9
...,...,...,...
477,https://colab.research.google.com/drive/1OYfMg...,13,3
478,https://colab.research.google.com/drive/1qmp_C...,15,4
479,https://colab.research.google.com/drive/1ME8M-...,2,1
480,https://colab.research.google.com/drive/1Z2mCy...,16,4


## GPT Review

In [None]:
import json

# Reload previously reviewed records, keep the non-empty conversations and
# strip the stored review from each one.
with open("batch_5_parsed_reviewed.json", "r") as f:
    results = [record for record in json.load(f) if record["n_messages"] != 0]

for r in results:
    del r["quality_review"]

In [None]:
import os
from typing import List
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

import tiktoken
from pydantic import BaseModel, Field
from llama_index.llms.openai import OpenAI
from llama_index import ServiceContext, set_global_service_context
from llama_index.program import OpenAIPydanticProgram
from llama_index.callbacks import CallbackManager, TokenCountingHandler

# OpenAI key taken from the environment (load_dotenv above has already run); raises KeyError if unset.
api_key = os.environ["OPENAI_API_KEY"]


# Token counter using the gpt-4-1106-preview tokenizer, wrapped in the callback
# manager that is handed to the review program below.
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-4-1106-preview").encode
)
callback_manager = CallbackManager([token_counter])

# Structured output requested from the LLM judge for a single quality aspect:
# an integer score constrained to 1..5 plus short lists of issues and praises.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic may expose a docstring in the schema sent to the model — confirm before adding one.
class Feedback(BaseModel):
    score: int = Field(description="A score representing how good the conversation is in the given quality aspect, 1 is terrible, 5 is exemplary and flawless.", ge=1, le=5)
    issues: List[str] = Field(description="A concrete list of issues in the conversation. 15 words or less each.")
    praises: List[str] = Field(description="A concrete list of praise for exceptional behavior the conversation. 15 words or less each.")


# One review criterion: a display name plus the instructions given to the judge.
# Instances are dumped with model_dump() and inserted into the judging prompt.
class QualityAspect(BaseModel):
    name: str = Field(description="The name of the quality aspect.")
    instruction: str = Field(description="Instructions & details on how to inspect this quality aspect.")


# Review criteria grouped by who is being judged ("Overall", "User", "Assistant").
# Each group key is combined with the aspect name to form the result key
# "<group> - <name>" when a conversation is reviewed.
# NOTE(review): "Completness" is misspelled, but it is a runtime string that ends
# up in stored result keys, so it is left untouched here.
quality_aspects = {
    "Overall": [
        QualityAspect(
            name="Completness",
            instruction="""
            How complete is the conversation? Completeness is defined as:
            - The assistant always responds to the user.
            - The conversation contains at least 1 back and forth between the user and the assistant.
            - The conversation flow is not broken.

            JUDGE THE ENTIRE CONVERSATION AS A WHOLE.
            """
        ),
    ],
    "User": [
        QualityAspect(
            name="Natural & Realistic", 
            instruction="""
            How does the user interaction resemble a real conversation and interactions a real user would have with a highly intelligent coding assistant.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
        QualityAspect(
            name="Coherent Follow ups", 
            instruction="""
            How coherent are the user's follow up messages to the assistant's messages in the conversation as a whole.
            Ideally, the user would incrementally build on the conversation to achieve their goal.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
    ],
    "Assistant": [
        QualityAspect(
            name="Code Quality", 
            instruction="""
            How good is the code that the assistant generates.
            Qualities:
            #   - Correctness
            #   - Optimality
            #   - PEP8 Compliance & Readability

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Text Quality", 
            instruction="""
            How good is the text that the assistant generates.
            Qualities:
            #   - Spelling
            #   - Grammar
            #   - Capitalization & Punctuation
            #   - Information Density (Should be a sweet spot leaning on the concise side, but not too concise... definitely not too verbose)
            #   - Explains Code Well

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Markdown Formatting", 
            instruction="""
            How good is the markdown formatting that the assistant generates. Is it leveraging markdown syntax tools to maximize the readability of the text?

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        )
    ]
}


def inspect_conversation_quality_aspect(conversation: dict, quality_aspect: QualityAspect):
    """
    Ask the LLM judge to rate one conversation on a single quality aspect.

    :param conversation: Parsed notebook; only its ``"messages"`` entry is sent to the model.
    :param quality_aspect: The aspect this judge focuses on.
    :return: The value produced by the pydantic program configured with
        ``output_cls=Feedback`` (callers invoke ``.model_dump()`` on it).
    """

    # Runtime prompt text: edits here change what the model is asked.
    prompt_template_str = """
    IDENTITY:
    You are one of many specialized judges, so precisely focus on your quality aspect only.

    SITUATION:
    A large team is building a dataset of illustractions of dialogues showcasing the interaction between a user and a highly intelligent AI in the context of software development scenarios.
    - The user's replies should closely resemble authentic user engagement.
    - The AI's responses should aim to provide maximum benefit to the user.

    INSTRUCTIONS:
    Given the following conversation, please rate the quality of the conversation according to the given quality aspect.
    
    ALL QUALITY ASPECTS:
    {all_quality_aspects}

    YOUR QUALITY ASPECT:
    {quality_aspect}
    
    CONVERSATION:
    {conversation}
    """
    # A new program (and LLM client) is built on every call; temperature=0 is requested.
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        callback_manager=callback_manager,
        output_cls=Feedback, 
        prompt_template_str=prompt_template_str, 
    )
    # Bullet list "- <group>: <aspect name>" over every aspect in the module-level table.
    # The comprehension's `quality_aspect` is local to the comprehension and does not
    # overwrite this function's `quality_aspect` parameter.
    all_quality_aspects = "\n".join([f"- {key}: {quality_aspect.name}" for key in quality_aspects.keys() for quality_aspect in quality_aspects[key]])
    output = program( 
        all_quality_aspects=all_quality_aspects,
        quality_aspect=quality_aspect.model_dump(),
        conversation=conversation["messages"],
        description="Judge the quality of the conversation according to the given quality aspect. Provide constructive criticism, rarely praise."
    )
    return output


def inspect_all_conversation_quality_aspects(conversation) -> dict:
    """Review a conversation on every configured aspect; keys are "<group> - <aspect name>"."""
    return {
        f"{group} - {aspect.name}": inspect_conversation_quality_aspect(conversation, aspect).model_dump()
        for group, aspects in quality_aspects.items()
        for aspect in aspects
    }

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Collects every record that received a quality review.
reviewed_results = []

def process_notebook(result):
    """
    Attach a quality review to one record.

    Returns ``None`` for a missing record or one whose notebook has no
    messages; otherwise stores the review under ``"quality_review"`` on the
    record itself and returns that same record.
    """
    if result is None:
        return None
    conversation = result["notebook"]
    if len(conversation["messages"]) == 0:
        return None
    result["quality_review"] = inspect_all_conversation_quality_aspects(conversation)
    return result

# Review all records concurrently; the bar advances once per finished record,
# including those skipped by process_notebook.
with tqdm(total=len(results), desc="Processing notebooks") as pbar, ThreadPoolExecutor(max_workers=15) as executor:
    futures = [executor.submit(process_notebook, pending_record) for pending_record in results]
    for future in as_completed(futures):
        reviewed_record = future.result()
        if reviewed_record is not None:
            reviewed_results.append(reviewed_record)
        pbar.update(1)


Processing notebooks: 100%|██████████| 1060/1060 [52:46<00:00,  2.99s/it] 


In [None]:
import json 

# Serialize BEFORE opening the file for writing: opening with "w" truncates it
# immediately, so if `reviewed_results` is missing from the kernel (NameError)
# or is not serializable, the previously saved reviews would otherwise be wiped.
serialized_reviews = json.dumps(reviewed_results, indent=4)

with open("batch_5_parsed_reviewed.json", "w") as f:
    f.write(serialized_reviews)

# Read the file back so later cells work on exactly what was persisted.
with open("batch_5_parsed_reviewed.json", "r") as f:
    reviewed_results = json.load(f)

NameError: name 'reviewed_results' is not defined

In [None]:
# Spot-check a single reviewed record (index 2) to eyeball its structure.
reviewed_results[2]

{'task_link': 'https://colab.research.google.com/drive/1yHgim2bxGo_Sf0c1BsSaFROtZZdfpiud',
 'n_messages': 6,
 'number_of_turns': 2,
 'notebook': {'metadata': {'topic': 'algorithms > by_data_structure > heaps',
   'type': 'modification',
   'target_turns': '2+'},
  'messages': [{'role': 'User',
    'content': "I have a min-heap implemented as a list in Python, and I'd like to add a method to get the kth smallest element. Could you help me with that? Here's my current min-heap class:\n```python\nclass MinHeap:\n    def __init__(self):\n        self.heap = []\n\n    def insert(self, val):\n        # Implementation for insert\n        pass\n\n    def get_min(self):\n        # Implementation for get_min\n        pass\n```",
    'type': 'markdown'},
   {'role': 'Assistant',
    'content': "Sure, here's a simple breakdown of how the kth_smallest method works in our MinHeap class:\n\n- ***Copy Heap***: Start by making a copy of the min-heap.\n- ***Heapify***: Reorganize the copy to maintain mi

In [None]:
# One summary row per reviewed conversation: average and minimum score across
# all aspects, plus the issues grouped under each aspect key.
data_skeleton = []
for rr in reviewed_results:
    review = rr["quality_review"]
    scores = [aspect_feedback["score"] for aspect_feedback in review.values()]

    feedback_parts = []
    for aspect_key, aspect_feedback in review.items():
        issue_lines = "\n".join(f"- {issue}" for issue in aspect_feedback["issues"])
        feedback_parts.append(f"{aspect_key}: \n{issue_lines}\n\n")

    summary_row = {
        "task_link": rr["task_link"],
        "avg_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "issues": "".join(feedback_parts),
    }
    data_skeleton.append(summary_row)

df_gpt_reviews = pd.DataFrame(data_skeleton)
df_gpt_reviews

Unnamed: 0,task_link,avg_score,min_score,issues
0,https://colab.research.google.com/drive/182asg...,4.000000,3,Overall - Completness: \n- Assistant's initial...
1,https://colab.research.google.com/drive/1qfFDR...,4.666667,4,Overall - Completness: \n\n\nUser - Natural & ...
2,https://colab.research.google.com/drive/1yHgim...,4.333333,3,Overall - Completness: \n\n\nUser - Natural & ...
3,https://colab.research.google.com/drive/1IMCwR...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
4,https://colab.research.google.com/drive/1jAjfx...,4.333333,4,Overall - Completness: \n\n\nUser - Natural & ...
...,...,...,...,...
1055,https://colab.research.google.com/drive/1PFqmH...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
1056,https://colab.research.google.com/drive/1qPvTL...,4.833333,4,Overall - Completness: \n\n\nUser - Natural & ...
1057,https://colab.research.google.com/drive/1lekWL...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
1058,https://colab.research.google.com/drive/1x_-q3...,4.666667,4,Overall - Completness: \n\n\nUser - Natural & ...


In [None]:
pd.set_option('display.max_colwidth', 500)

# Flagging thresholds: a conversation is flagged when its average score is
# below the first value or any single aspect scored below the second.
BORDERLINE_AVG_SCORE = 4
CRITICAL_MIN_SCORE = 3

# Filter first, then sort. Indexing an already-sorted frame with a mask built
# on the unsorted frame makes pandas reindex the mask and emit
# "Boolean Series key will be reindexed to match DataFrame index".
borderline_avg_flags = (
    df_gpt_reviews[df_gpt_reviews["avg_score"] < BORDERLINE_AVG_SCORE]
    .sort_values(by="avg_score", ascending=False)
)
critical_mistake_flags = (
    df_gpt_reviews[df_gpt_reviews["min_score"] < CRITICAL_MIN_SCORE]
    .sort_values(by="min_score", ascending=False)
)

# A task may trip both rules; keep its first occurrence only.
all_flags = pd.concat([borderline_avg_flags, critical_mistake_flags], ignore_index=True)
all_flags = all_flags.drop_duplicates(subset=["task_link"])

all_flags.sort_values(by="min_score", ascending=False)

In [None]:
# Attach the original author to every flagged task and export the table.
flag_columns = ["task_link", "avg_score", "min_score", "issues", "assigned_to_email"]
all_flags = (
    all_flags
    .merge(completed_to_be_delivered_df, on="task_link", how="left")[flag_columns]
    .rename(columns={"assigned_to_email": "original_author_email"})
)
all_flags.to_csv("batch_5_flags.csv", index=False)

## Fetch & Integrate Advanced Metadata

In [None]:
import sys 
sys.path.append('../../')
import pandas as pd
from src.sheets_utils import download_sheet_as_df


# Same value as the settings cell at the top of the notebook.
service_account_file = '../../creds/google__sa.json'
# Note: this is a different spreadsheet ID from `insights_sheet_id` in the settings cell.
insights_spreadsheet_id = "1wUWll720oz6Rnc4YKHeWFWbZbTMWbda2rfVh8jPLU2g"

# Not read by the download call below, which names its tab directly.
included_sheets = [
    "Expanded Enriched Data",
    # "Behavioural Tags",
    # "Use Case Tags",
    # "Programming Language Tags",
    # "Dependency Tags"
]

# Only the "Expanded Enriched Data" tab is downloaded; the other tabs are disabled below.
df_metadata = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Expanded Enriched Data")

# df_metadata__behavioral = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Behavioural Tags")

# df_metadata__use_case = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Use Case Tags")

# df_metadata__programming_language = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Programming Language Tags")

# df_metadata__dependency = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Dependency Tags")

In [50]:
import os
from typing import List
from pydantic import BaseModel, Field
from llama_index.program import OpenAIPydanticProgram
from llama_index.llms.openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures

from tqdm import tqdm
from dotenv import find_dotenv, load_dotenv
# Load variables from the nearest .env file, then read the OpenAI key (KeyError if unset).
load_dotenv(find_dotenv())
api_key = os.environ["OPENAI_API_KEY"]


# Output schema shared by the domain and action classifiers below.
# The action classifier's prompt asks for an empty `sub_level`; only its `top_level` is kept.
class HierarchicalCategory(BaseModel):
    """Data model for hierarchical category classification."""
    top_level: str
    sub_level: str

def classify_conversation_by_domain(conversation: dict) -> HierarchicalCategory:
    """
    Ask the LLM to place a conversation in one of the listed domain categories
    and to propose a descriptive sub-category.

    :param conversation: Parsed notebook; only its ``"messages"`` entry is sent to the model.
    :return: The program output for ``output_cls=HierarchicalCategory``.
    """
    # Runtime prompt text: the top-level categories offered to the model.
    DOMAIN_CATEGORIES = """
        - Python basics & scripting
        - Problem Solving
        - Interview Prep
        - Web Development
        - Testing
        - Cloud Computing / Frameworks
        - Data Analysis
        - Machine Learning
        - Other languages
        - Other
    """

    prompt_template_str = """
    Categorize the theme of user requests in the following conversation by domain into one of the following top-level categories, then sub categories that you think is descriptive & appropriate:
    {categories}

    Conversation:
    {conversation}
    """

    # A new program (and LLM client) is built on every call; temperature=0 is requested.
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=HierarchicalCategory,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        categories=DOMAIN_CATEGORIES,
        conversation=conversation["messages"],
    )
    return output


def classify_conversation_by_action(
    conversation: dict
) -> HierarchicalCategory:
    """
    Ask the LLM which requested action best describes the initial user request.

    The prompt instructs the model to leave ``sub_level`` empty and to answer
    "Other" when no listed action fits.

    :param conversation: Parsed notebook; only its ``"messages"`` entry is sent to the model.
    :return: The program output for ``output_cls=HierarchicalCategory``.
    """
    # Runtime prompt text: the actions offered to the model.
    ACTION_CATEGORIES = """
    - Write code in python
    - Explain code
    - Fix / refactor / optimize code
    - Debug error trace
    - Write unit tests
    - Write CI/CD code
    - Do a code review
    - Write / modify / fix beam code
    - Write / modify / fix spark code
    - Write end to end ML training code
    - Help me take an interview
    - Answer ML research questions
    - Answer infra questions
    - Write / modify / fix SQL code
    - Write / modify / fix JavaScript code
    - Scrape a website
    """
    prompt_template_str = """
    Categorize the initial user requests in the following conversation by requested action into one of the following top-level categories. Sub-level should be empty string always. In case there's no natural fit, use "Other" as the top-level category.
    
    Categories:
    {categories}

    Conversation:
    {conversation}

    """

    # A new program (and LLM client) is built on every call; temperature=0 is requested.
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=HierarchicalCategory,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        categories=ACTION_CATEGORIES,
        conversation=conversation["messages"],
    )
    return output


# Output schema for exec_summary: a single short summary string whose length
# limits are stated in the field description passed to Field below.
class SummaryResult(BaseModel):
    """Data model for the summary result."""
    summary: str = Field(
        description="A short summary containing 1 sentence, 15 words max, focused on the specific theme. [super concise language]"
    )

def exec_summary(conversation: dict):
    """
    Ask the LLM for a one-sentence executive summary of the user's use case.

    :param conversation: Parsed notebook; only its ``"messages"`` entry is sent to the model.
    :return: The program output for ``output_cls=SummaryResult``
        (callers read ``.model_dump()["summary"]``).
    """
    # Runtime prompt text: edits here change what the model is asked.
    prompt_template_str = """
    Given the following conversation, please, generate an executive summary of the conversation.

    User Use Case, why user uses the Assistant in this conversation, in general terms, **for what** the User is using it. Not from a technical perspective, but from a daily life situation perspective. 
    Example: work, homework, exam, studying, inteview, debugging, etc...

    It should also contain a little bit of the context of the conversation, and the main goal of the conversation.

    Conversation:
    {conversation}
    """
    # A new program (and LLM client) is built on every call; temperature=0 is requested.
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=SummaryResult,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        conversation=conversation["messages"]
    )
    return output


# Output schema for gpt_estimated_duration: a whole number of minutes.
class GPTEstimationResult(BaseModel):
    """Data model for the GPT estimation result."""
    estimated_duration: int = Field(
        description="The estimated duration of the conversation in minutes."
    )

def gpt_estimated_duration(conversation: dict) -> GPTEstimationResult:
    """
    Ask the LLM how many minutes writing and verifying the conversation would take.

    :param conversation: Parsed notebook; only its ``"messages"`` entry is sent to the model.
    :return: The program output for ``output_cls=GPTEstimationResult``, not a bare
        int — callers read ``.model_dump()["estimated_duration"]``.
    """
    # Runtime prompt text: edits here change what the model is asked.
    prompt_template_str = """
    Given the following conversation which has been generated by a median skilled technical human playing both User and Assistant... He also is responsible for making sure the assistant responses are flawless...
    Estimate how many minutes it would take to Design, Write & Verify this (Conversation Length, Complexity & Quality should be considered).

    Conversation:
    {conversation}
    """
    # A new program (and LLM client) is built on every call; temperature=0 is requested.
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=GPTEstimationResult,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        conversation=conversation["messages"]
    )
    return output


def process_conversation__metadata_extraction(conversation, task_link):
    """
    Run the four LLM extractors on one conversation and merge their results,
    together with ``task_link``, into ``conversation["metadata"]``.

    The conversation is updated in place and also returned.
    """
    extractors = (
        classify_conversation_by_domain,
        classify_conversation_by_action,
        exec_summary,
        gpt_estimated_duration,
    )
    # Unpacking the generator runs the extractors one after another, in this order.
    domain, action, summary, estimate_duration = (extract(conversation) for extract in extractors)

    extracted_fields = {
        "domain": domain.model_dump(),
        "action": action.model_dump()["top_level"],
        "use_case_summary": summary.model_dump()["summary"],
        "gpt_estimated_duration": estimate_duration.model_dump()["estimated_duration"],
        "task_link": task_link,
    }
    conversation["metadata"].update(extracted_fields)
    return conversation
    
    

def extract_metadata_parallel(conversations, task_links, max_workers=10):
    """
    Extract metadata for many conversations concurrently.

    :param conversations: Parsed notebooks, paired by position with ``task_links``
        (pairing stops at the shorter of the two sequences).
    :param task_links: The task link recorded into each conversation's metadata.
    :param max_workers: Size of the thread pool.
    :return: The enriched conversations in completion order, which is not
        necessarily the input order.
    :raises Exception: Whatever a worker raised is re-raised by ``future.result()``.
    """
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_conversation__metadata_extraction, conversation, task_link)
            for conversation, task_link in zip(conversations, task_links)
        ]
        # The context manager closes the bar even when a worker raises; a bar
        # left open keeps re-rendering in the output of later cells.
        with tqdm(total=len(futures)) as progress_bar:
            for future in concurrent.futures.as_completed(futures):
                results.append(future.result())
                progress_bar.update(1)
    return results


# `notebooks` and the rows of `results_df` were appended together in the same loop,
# so pairing them by position gives each parsed notebook its own task link.
metadata_results = extract_metadata_parallel(notebooks, results_df["task_link"].tolist())

  0%|          | 0/482 [00:00<?, ?it/s]

100%|██████████| 482/482 [08:56<00:00,  1.11s/it]


In [60]:
# Build one metadata row per enriched conversation.
# The original cell nested this loop inside an identical outer loop that reset
# the list on every pass: same final result, but rebuilt once per conversation.
rich_task_metadata = []

# Take the whole trailing token of the batch name ("Batch 6" -> "6") so that
# multi-digit batches such as "Batch 10" are not cut down to their last character.
delivery_batch_id = DELIVERY_BATCH_NAME.split()[-1]

for conversation in metadata_results:
    conversation_metadata = conversation["metadata"]
    row_main = {
        "task_link": conversation_metadata["task_link"],
        "number_of_turns": get_number_of_turns(conversation["messages"]),
        "batch_id": delivery_batch_id,
        "domain": {
            "top_level": conversation_metadata["domain"]["top_level"],
            "sub_level": conversation_metadata["domain"]["sub_level"],
        },
        "action": conversation_metadata["action"],
        "use_case__summary": conversation_metadata["use_case_summary"],
        # "gpt_estimated_duration" is present in the extracted metadata but is not exported here.
    }
    rich_task_metadata.append(row_main)



# Flatten one level of nesting: {"domain": {"top_level": ...}} becomes
# "domain__top_level". Scalar columns keep their original order and the
# flattened columns are placed after them.
rich_flattened_metadata = []
for metadata in rich_task_metadata:
    current_metadata = {
        column: value for column, value in metadata.items() if not isinstance(value, dict)
    }
    for column, value in metadata.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                current_metadata[f"{column}__{sub_key}"] = sub_value
    rich_flattened_metadata.append(current_metadata)

df_metadata__output = pd.DataFrame(rich_flattened_metadata)
df_metadata__output

  0%|          | 0/482 [45:13<?, ?it/s]


Unnamed: 0,task_link,number_of_turns,batch_id,action,use_case__summary,domain__top_level,domain__sub_level
0,https://colab.research.google.com/drive/1LdGSk...,5,6,Write code in python,The User seeks assistance with automating emai...,Web Development,Email Integration
1,https://colab.research.google.com/drive/1v53wy...,5,6,Write end to end ML training code,User seeks assistance for implementing anomaly...,Machine Learning,Anomaly Detection
2,https://colab.research.google.com/drive/1WB_2L...,9,6,Help me take an interview,User practices for a Python algorithms intervi...,Interview Prep,Mock Interview
3,https://colab.research.google.com/drive/1wqrEX...,4,6,Help me take an interview,User prepares for a Python programming intervi...,Interview Prep,Mock Interview
4,https://colab.research.google.com/drive/11QTHz...,9,6,Help me take an interview,The user is practicing for a Python dynamic pr...,Interview Prep,Dynamic Programming
...,...,...,...,...,...,...,...
477,https://colab.research.google.com/drive/1s7tfw...,3,6,Write unit tests,The User seeks assistance in writing unit test...,Python basics & scripting,Decorators
478,https://colab.research.google.com/drive/1Z2mCy...,4,6,Write code in python,The user seeks assistance with filtering email...,Python basics & scripting,Email handling with imaplib
479,https://colab.research.google.com/drive/1OYfMg...,3,6,Write code in python,User seeks assistance for cloud storage file b...,Cloud Computing / Frameworks,AWS S3 with boto3
480,https://colab.research.google.com/drive/1m8Efs...,4,6,Write unit tests,The User is using the Assistant to verify a Py...,Web Development,API Development & Testing


In [61]:
from src.sheets_utils import upload_df_to_sheet

# Publish the flattened metadata table to the insights spreadsheet,
# into the tab named by INSIGHTS_VERSION_TAB.
upload_df_to_sheet(
    service_account_file,
    insights_sheet_id,
    INSIGHTS_VERSION_TAB,
    df_metadata__output,
)

In [64]:
import os
import json

# Local staging directory: one JSON file per conversation is written here.
directory = f"json_conversations/{DELIVERY_BATCH_NAME}"
os.makedirs(directory, exist_ok=True)

# Keep only notebooks whose parse result exists and has at least one message.
# `results` and `notebooks` are paired positionally (assumed index-aligned).
valid_notebooks = []
for parse_result, notebook in zip(results, notebooks):
    if parse_result is None or parse_result["n_messages"] == 0:
        continue
    notebook["task_link"] = parse_result["task_link"]
    valid_notebooks.append(notebook)

# Index the metadata by task link so each notebook is matched with a single
# lookup (and at most once) instead of rescanning the whole list.
# If a task link occurs more than once, the last entry wins.
metadata_by_task_link = {
    task_metadata["task_link"]: task_metadata
    for task_metadata in rich_task_metadata
}

# Keys that must not appear in the delivered metadata.
internal_metadata_keys = ("duration_mins", "batch_id")

parsed_jsons = []
for notebook in valid_notebooks:
    task_metadata = metadata_by_task_link.get(notebook["task_link"])
    if task_metadata is None:
        # No metadata for this notebook: it is not delivered.
        continue
    # Shallow filtered copy: each internal key is dropped independently, and
    # the entries of `rich_task_metadata` are left untouched for other cells.
    notebook["metadata"] = {
        key: value
        for key, value in task_metadata.items()
        if key not in internal_metadata_keys
    }
    parsed_jsons.append(notebook)

# The conversation id is the last path segment of the task link.
for parsed in parsed_jsons:
    parsed["id"] = parsed.pop("task_link").split("/")[-1]

for conversation in parsed_jsons:
    drive_id = conversation["id"]
    with open(f"{directory}/{drive_id}.json", "w") as f:
        json.dump(conversation, f)

## Upload JSONL

In [65]:
from src.gdrive_api import build_service
from src.gdrive_api.folder_upload import upload_folder

# Drive client built from the service-account credentials file.
service = build_service(service_account_file)

# Upload the local batch directory to the delivery folder, which is passed
# as a URL (is_url=True).
# NOTE(review): force_replace presumably overwrites same-named files already
# in the folder — confirm against src.gdrive_api.folder_upload.
uploaded_files = upload_folder(
    service,
    f'json_conversations/{DELIVERY_BATCH_NAME}/',
    destination_folder_url,
    force_replace=True,
    is_url=True,
)
uploaded_files

------------------------------------------------------------
Processing directory .: 1 of 0 in total.
Uploading file 1 of 482 in '.', 1 of 482 in total.
Uploading new file '1yTUUhoAt1kDrnWlXKvQmVxnVdYEwvtXv.json'.
File '1yTUUhoAt1kDrnWlXKvQmVxnVdYEwvtXv.json' has been uploaded.
Uploaded '1yTUUhoAt1kDrnWlXKvQmVxnVdYEwvtXv.json' to folder ID '1Iu-cBZ-kXzRYVO84oifrtVC2putauvt7'.
1yTUUhoAt1kDrnWlXKvQmVxnVdYEwvtXv.json
Uploading file 2 of 482 in '.', 2 of 482 in total.
Uploading new file '1I2-dug-toxwrBcp-5eJBk3YiWnzb-t8T.json'.
File '1I2-dug-toxwrBcp-5eJBk3YiWnzb-t8T.json' has been uploaded.
Uploaded '1I2-dug-toxwrBcp-5eJBk3YiWnzb-t8T.json' to folder ID '1Iu-cBZ-kXzRYVO84oifrtVC2putauvt7'.
1I2-dug-toxwrBcp-5eJBk3YiWnzb-t8T.json
Uploading file 3 of 482 in '.', 3 of 482 in total.
Uploading new file '1cFSBdBFZRnqF0nHVXuFE6qCg80bR2kzY.json'.
File '1cFSBdBFZRnqF0nHVXuFE6qCg80bR2kzY.json' has been uploaded.
Uploaded '1cFSBdBFZRnqF0nHVXuFE6qCg80bR2kzY.json' to folder ID '1Iu-cBZ-kXzRYVO84oifrtVC2

{'1yTUUhoAt1kDrnWlXKvQmVxnVdYEwvtXv.json': 'https://drive.google.com/uc?id=1CdVYKEW9y84HUjQqAQVxCQls2uTHzEll',
 '1I2-dug-toxwrBcp-5eJBk3YiWnzb-t8T.json': 'https://drive.google.com/uc?id=113ocCDlXEF2aCpkdhuQh3LJ_czpuatMI',
 '1cFSBdBFZRnqF0nHVXuFE6qCg80bR2kzY.json': 'https://drive.google.com/uc?id=1sHko4ua1h23KBhH1Bkzoj9CNmY3qMy1y',
 '14GX84bPpres57LzsCzaW4GmF9DbJCxv_.json': 'https://drive.google.com/uc?id=1dMAJTU0K641woCuJCxpnJWC6spfZg9zc',
 '1V5IrWLiNYcl808HzuLrgnPDZs9mkx7NC.json': 'https://drive.google.com/uc?id=12iUZdl3tYvEGyOdu7Swcv8zTrWd9IQ0A',
 '1fJsYn5ocF4sXY4yOvUY94O5FLzu4x5_H.json': 'https://drive.google.com/uc?id=1GBbwQTNN0oktw_pey5-42Bub281Fj7gg',
 '128_OqqWyRq1WiUUPhvHrPVBzgv0AGP8-.json': 'https://drive.google.com/uc?id=198GsUtjwqp0hOyP-_kCQQSdh6iOUyaUT',
 '1xWI7915A5utQtZ6qrjxQb1weps1D2aKy.json': 'https://drive.google.com/uc?id=1TbKWoB6JtOaiw09ZATZjpFLVkQMjXKMs',
 '1AjM1Yd7NZ-_JnO85vEHXygilskUphF7h.json': 'https://drive.google.com/uc?id=1xAaGrCLEy-IgXE1ctwaFlhpQfmaSTmZL',
 

In [66]:
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Function to check if a file is a folder
def is_folder(file):
    """Return True when the Drive file resource dict has the folder MIME type.

    A missing 'mimeType' key yields False.
    """
    folder_mime_type = 'application/vnd.google-apps.folder'
    mime_type = file.get('mimeType')
    return mime_type == folder_mime_type

# Function to process files and folders
def process_files(service, folder_id, parent_folders=None):
    """Recursively list the non-trashed contents of a Google Drive folder.

    Args:
        service: Drive client; only ``service.files().list(...).execute()``
            is called on it.
        folder_id: ID of the folder whose children are listed.
        parent_folders: Names of the ancestor folders, used only to build the
            path printed for each entry. Defaults to no ancestors.

    Returns:
        A flat list of the file dicts returned by the API (requested fields:
        ``id``, ``name``, ``mimeType``, ``webViewLink``). Each folder entry is
        immediately followed by the entries found beneath it. Folders named
        ``tool_data`` are skipped together with their contents.
    """
    # None sentinel instead of a mutable ``[]`` default shared across calls.
    parent_folders = [] if parent_folders is None else list(parent_folders)

    query = f"'{folder_id}' in parents and trashed = false"
    page_token = None

    all_files = []
    while True:
        response = service.files().list(q=query,
                                        spaces='drive',
                                        fields='nextPageToken, files(id, name, mimeType, webViewLink)',
                                        pageToken=page_token).execute()

        for file in response.get('files', []):
            name = file.get('name')
            folder = is_folder(file)

            # Skip 'tool_data' folder
            if name == 'tool_data' and folder:
                continue

            all_files.append(file)

            # Path of this entry relative to the folder the listing started from.
            path_parts = parent_folders + [name]
            print('Processing:', '/'.join(path_parts))

            # If it's a folder, recursively process its contents
            if folder:
                all_files.extend(process_files(service, file.get('id'), path_parts))

        # Keep requesting pages until the API stops returning a continuation token.
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break

    return all_files

# Authenticate and create the service
SERVICE_ACCOUNT_FILE = service_account_file
SCOPES = ['https://www.googleapis.com/auth/drive']
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('drive', 'v3', credentials=credentials)

# The delivery folder ID is the last path segment of the destination URL;
# tolerate a trailing slash or a query string (e.g. "?usp=sharing").
folder_id = destination_folder_url.split("?")[0].rstrip("/").split("/")[-1]

# Start processing from the specified folder
all_files = process_files(service, folder_id)

# Passing `columns` selects the three fields and still yields a well-formed
# (empty) frame when the folder listing is empty, instead of a KeyError.
jsonl_df = pd.DataFrame(all_files, columns=["id", "name", "webViewLink"])
jsonl_df

Processing: 1MPG8AptQuQkSxvKxMP_3g49S4TbXIf7H.json
Processing: 1o_TUea7L45K4qyjdyHgn8osHXdo__RYr.json
Processing: 1HWZwARaE9jTsYc2OzTW0RY7Pf3MDpT1S.json
Processing: 11iTZzCp4eFKQXU-5-lB1qqtcL5kb4xZG.json
Processing: 1eDCG2ix1ZA0ew-N9WpXR7TISLxS_qxES.json
Processing: 1i6J5Mg1HZdoyZwIYPL0hLQNhmKnEFXoX.json
Processing: 1T9iBK25ggpkctWLcsO5PM91agm50mjiH.json
Processing: 1Hek_u6qsXtUdUlTaJxrx0Hax1G-GjK50.json
Processing: 1B27ecXOkReTArQ-nqgJujIkb2pP7-NMk.json
Processing: 1hxfQjB_VtlN4lzi2NkZo4RZJu7M76lMK.json
Processing: 1bYNhITXe9lokNDplUiQ91f3YOcpbIlrM.json
Processing: 1_8rUXAHtUdYIE2rj71NKlxCmpTRG8i-x.json
Processing: 11QTHzTf8iB5LHY1MN76hKBeMYVRxTAem.json
Processing: 1RO05UEDCUYWhkUz4GNHOc6UxJhcpzU2w.json
Processing: 1AACs4aWWtW5lnhelQFwkxFEAqro6SLS9.json
Processing: 1An5olsPP7V5z_W9U4hn04Hew37XoFo3Y.json
Processing: 1XTAe4hffq1UbmMjyr6b2z9J-raNiwl13.json
Processing: 1UzZt-oDSbvrVeOa9zOGkF5F-cZZ3VRZM.json
Processing: 1bYJTxTklIAq-NqG4UAFkB9w1MkcK5S5r.json
Processing: 1I4zKo0_XViT-KS1Y_7

Unnamed: 0,id,name,webViewLink
0,1A6zQZDgfR6UcYiteVFS5zOWqRy3gzw6R,1MPG8AptQuQkSxvKxMP_3g49S4TbXIf7H.json,https://drive.google.com/file/d/1A6zQZDgfR6UcY...
1,1yZ-2MNn1aRS-DOOUimTiMnTQjBfE1-Tx,1o_TUea7L45K4qyjdyHgn8osHXdo__RYr.json,https://drive.google.com/file/d/1yZ-2MNn1aRS-D...
2,1HObXCfsLVG93xYVO7M4cAMzNqLPNqgmP,1HWZwARaE9jTsYc2OzTW0RY7Pf3MDpT1S.json,https://drive.google.com/file/d/1HObXCfsLVG93x...
3,1ne-bLOz0TI_KnCzCctZZCuDBnQA1mjdZ,11iTZzCp4eFKQXU-5-lB1qqtcL5kb4xZG.json,https://drive.google.com/file/d/1ne-bLOz0TI_Kn...
4,1X5wV12N0N-8cFLUGWOEjD01_bjVSldw3,1eDCG2ix1ZA0ew-N9WpXR7TISLxS_qxES.json,https://drive.google.com/file/d/1X5wV12N0N-8cF...
...,...,...,...
477,12iUZdl3tYvEGyOdu7Swcv8zTrWd9IQ0A,1V5IrWLiNYcl808HzuLrgnPDZs9mkx7NC.json,https://drive.google.com/file/d/12iUZdl3tYvEGy...
478,1dMAJTU0K641woCuJCxpnJWC6spfZg9zc,14GX84bPpres57LzsCzaW4GmF9DbJCxv_.json,https://drive.google.com/file/d/1dMAJTU0K641wo...
479,1sHko4ua1h23KBhH1Bkzoj9CNmY3qMy1y,1cFSBdBFZRnqF0nHVXuFE6qCg80bR2kzY.json,https://drive.google.com/file/d/1sHko4ua1h23KB...
480,113ocCDlXEF2aCpkdhuQh3LJ_czpuatMI,1I2-dug-toxwrBcp-5eJBk3YiWnzb-t8T.json,https://drive.google.com/file/d/113ocCDlXEF2aC...


In [67]:
# One reference row per delivered conversation: its id (also the stem of the
# JSON file name), the task link and the number of turns.
parsed_jsons_ref = [
    {
        "colab_id": pj["id"],
        "task_link": pj["metadata"]["task_link"],
        "number_of_turns": pj["metadata"]["number_of_turns"],
    }
    for pj
    in parsed_jsons
]
# Explicit columns keep the frame well-formed (and mergeable on "colab_id")
# even when there are no conversations.
conversation_df = pd.DataFrame(
    parsed_jsons_ref,
    columns=["colab_id", "task_link", "number_of_turns"],
)
conversation_df

Unnamed: 0,colab_id,task_link,number_of_turns
0,11QTHzTf8iB5LHY1MN76hKBeMYVRxTAem,https://colab.research.google.com/drive/11QTHz...,9
1,1eG_mseWBOYoDa0HT5DIA1J2FsFM9DMSc,https://colab.research.google.com/drive/1eG_ms...,4
2,126DOVxisfxkvBFBdQZ36m6bSQIPHOEtg,https://colab.research.google.com/drive/126DOV...,5
3,1v53wyDdWGkGARU2zjgIZSdgd1hEgjgtk,https://colab.research.google.com/drive/1v53wy...,5
4,19w-98Hblh1R3hqK_XxTYfCS3S8ElfJxo,https://colab.research.google.com/drive/19w-98...,9
...,...,...,...
477,1OYfMgU1sbs4nWJJND7R3wcIrhXatngox,https://colab.research.google.com/drive/1OYfMg...,3
478,1qmp_C3f5zrDOm-KtwYKFDgdWV5dd3jvk,https://colab.research.google.com/drive/1qmp_C...,4
479,1ME8M-BD9CF6IpCy2fJCb0mtQFUtxpk0t,https://colab.research.google.com/drive/1ME8M-...,1
480,1Z2mCyCTi6QpzNGQFf0We9fQrq9pD6dU_,https://colab.research.google.com/drive/1Z2mCy...,4


In [68]:
# Join key: the part of the Drive file name before the first dot.
jsonl_df["colab_id"] = [file_name.split(".")[0] for file_name in jsonl_df["name"]]

# Pair each conversation with the link of its uploaded file and keep only
# the columns needed for the delivery sheet.
df_merged = (
    pd.merge(conversation_df, jsonl_df, how="inner", on="colab_id")
    .loc[:, ["task_link", "number_of_turns", "webViewLink"]]
    .rename(columns={"webViewLink": "jsonl_link"})
)
df_merged

Unnamed: 0,task_link,number_of_turns,jsonl_link
0,https://colab.research.google.com/drive/11QTHz...,9,https://drive.google.com/file/d/1kN798QmA9R-7N...
1,https://colab.research.google.com/drive/1eG_ms...,4,https://drive.google.com/file/d/1FncatZaSL8J5I...
2,https://colab.research.google.com/drive/126DOV...,5,https://drive.google.com/file/d/18Ogb_YpXZ-PSZ...
3,https://colab.research.google.com/drive/1v53wy...,5,https://drive.google.com/file/d/12T4WyA2yjJ-Rp...
4,https://colab.research.google.com/drive/19w-98...,9,https://drive.google.com/file/d/1cPVCjhhGMo1-d...
...,...,...,...
477,https://colab.research.google.com/drive/1OYfMg...,3,https://drive.google.com/file/d/1JiMI0Pmcji1l1...
478,https://colab.research.google.com/drive/1qmp_C...,4,https://drive.google.com/file/d/1eXo1VyIsdSX6s...
479,https://colab.research.google.com/drive/1ME8M-...,1,https://drive.google.com/file/d/1YQvP93ETi3d2U...
480,https://colab.research.google.com/drive/1Z2mCy...,4,https://drive.google.com/file/d/1DZJfqPhh8gwTR...


## Upload Batch Sheet

In [69]:
from src.sheets_utils import upload_df_to_sheet

# Column order of the delivery tab.
cols = ["task_link", "jsonl_link", "number_of_turns"]

# Use the `delivery_sheet_id` setting rather than a second hard-coded copy of
# the spreadsheet id, so the target follows the settings cell.
upload_df_to_sheet(
    service_account_file,
    delivery_sheet_id,
    DELIVERY_BATCH_NAME,
    df_merged[cols],
)