## Settings and pointers

In [59]:
service_account_file = '../../creds/google__sa.json'

tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"
included_sheet_names = [
    "Conversations_Batch_1",
    "Conversations_Batch_2",
    "Conversations_Batch_3",
    "Conversations_Batch_4",
    "Conversations_Batch_5",
]

jupyter_gdrive_folder_ids = [
    "1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9", # V0
    "1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb", # V1
    "1jV7WA5zB172DJUp7Z2XzHr62E6U6_NtY",
]

delivery_sheet_id = "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4"
delivery_jsonl_gdrive_folder_id = "1pEC7hlH3DTMUrkEHeDZduG7AyZf2lSRR"
destination_folder_url = f"https://drive.google.com/drive/folders/{delivery_jsonl_gdrive_folder_id}"


BATCH_NAME = "Batch 5"

## Source Code


In [60]:
import sys 
sys.path.append('../../')
import io
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import nbformat
import pandas as pd
from tqdm import tqdm
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload



def get_number_of_turns(messages):
    count = 0
    for message in messages:
        if message["role"] == "User":
            count += 1
    return count


def standardize_date_format(date):
    """
    Given a date string, standardize the date format to YYYY/MM/DD.
    """
    try:
        # Parse the date string into a datetime object
        standardized_date = datetime.strptime(date, "%Y/%m/%d")
    except ValueError:
        try:
            # Attempt to parse other common formats here
            # Example: MM/DD/YYYY
            standardized_date = datetime.strptime(date, "%m/%d/%Y")
        except ValueError:
            return "Invalid date format"

    # Format the datetime object into the desired string format
    return standardized_date.strftime("%Y/%m/%d")
###################################


#########################
    # Colab #
#########################


def update_colab_notebook(colab_link, local_nb_path, sa_creds_path):
    """
    Update a Google Colab notebook file in Google Drive.

    :param colab_link: The link to the Colab notebook in Google Drive.
    :param local_nb_path: The local path of the notebook file to upload.
    :param sa_creds_path: The path to the service account credentials.
    """
    try:
        # Extract file ID from the Colab link
        file_id = colab_link.split('/drive/')[1].split('/')[0]
    except IndexError:
        raise ValueError("Invalid Colab link format")

    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(sa_creds_path, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Specify the file to upload
    media = MediaFileUpload(local_nb_path, resumable=True)

    # Update the file
    try:
        updated_file = service.files().update(fileId=file_id, media_body=media).execute()
        return f"Updated file ID: {updated_file.get('id')}"
    except Exception as e:
        return f"Error updating file: {e}"


def get_colab_notebook(colab_link, sa_creds_path) -> nbformat.NotebookNode:
    try:
        # Extract file ID from the Colab link
        file_id = colab_link.split('/drive/')[1].split('/')[0]
    except IndexError:
        raise ValueError("Invalid Colab link format")

    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(sa_creds_path, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Download the file
    try:
        request = service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
        
        # Load as nbformat notebook
        notebook = nbformat.reads(fh.getvalue().decode(), as_version=4)
        return notebook
    except Exception as e:
        print(f"Error downloading file: {e}")
        return None


def get_file_name_from_colab_link(colab_link, service_account_file):
    try:
        file_id = colab_link.split('/drive/')[1]
    except IndexError:
        return None

    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    try:
        file = service.files().get(fileId=file_id).execute()
        return file.get('name')
    except Exception as e:
        return None


def fetch_file_names_parallel(links, service_account_file, max_workers=100):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_file_name_from_colab_link, link, service_account_file): link for link in links}
        results = {}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching File Names"):
            link = futures[future]
            try:
                file_name = future.result()
                results[link] = file_name
            except Exception as e:
                results[link] = None
        return results

## Read Remote Sheet

In [61]:
from src.sheets_utils import download_sheet_as_df


progress_batches = []
for sheet_name in included_sheet_names:
    print(sheet_name)
    bdf = download_sheet_as_df(service_account_file, tracking_sheet_id, sheet_name)
    progress_batches.append(bdf)
    print(bdf.shape)

df = pd.concat(progress_batches, ignore_index=True)
completed_df = df[df["completion_status"] == "Done"]
completed_df = completed_df.drop_duplicates(subset=["task_link"])

delivered = pd.concat([
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 1"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 2"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 3"),
    download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 4"),
], ignore_index=True)


completed_to_be_delivered_df = completed_df[~completed_df["task_link"].isin(delivered["task_link"])]
completed_to_be_delivered_df

Conversations_Batch_1


KeyboardInterrupt: 

In [None]:
from src.llm_reviewer.notebook_parser import notebook_parser
from concurrent.futures import ThreadPoolExecutor, as_completed


notebooks = []
results = []

def process_task_link(task_link):
    notebook = get_colab_notebook(task_link, service_account_file)
    parsed_notebook = notebook_parser(notebook)
    parsed_messages = parsed_notebook["messages"]
    number_of_turns = get_number_of_turns(parsed_messages)
    return parsed_notebook, {
        "task_link": task_link,
        "n_messages": len(parsed_messages),
        "number_of_turns": number_of_turns,
    }

with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(process_task_link, task_link) for task_link in completed_to_be_delivered_df["task_link"].tolist()]
    for future in as_completed(futures):
        notebook, result = future.result()
        notebooks.append(notebook)
        results.append(result)


results_df = pd.DataFrame(results)
results_df

Error downloading file: <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files/1rfNQU__74pEdovonm_-u6yrhF0UsAa2C?alt=media returned "File not found: 1rfNQU__74pEdovonm_-u6yrhF0UsAa2C.". Details: "[{'message': 'File not found: 1rfNQU__74pEdovonm_-u6yrhF0UsAa2C.', 'domain': 'global', 'reason': 'notFound', 'location': 'fileId', 'locationType': 'parameter'}]">
Error downloading file: <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files/1lHYB-8JiU67LlaqjvaRuLYUbetxWbnD5%23scrollTo%3DvzyhtIq2Tt8Q?alt=media returned "File not found: 1lHYB-8JiU67LlaqjvaRuLYUbetxWbnD5#scrollTo=vzyhtIq2Tt8Q.". Details: "[{'message': 'File not found: 1lHYB-8JiU67LlaqjvaRuLYUbetxWbnD5#scrollTo=vzyhtIq2Tt8Q.', 'domain': 'global', 'reason': 'notFound', 'location': 'fileId', 'locationType': 'parameter'}]">
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_re

  validate(nb)


Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='User', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message

Unnamed: 0,task_link,n_messages,number_of_turns
0,https://colab.research.google.com/drive/1rfNQU...,0,0
1,https://colab.research.google.com/drive/1IMCwR...,10,2
2,https://colab.research.google.com/drive/1lHYB-...,0,0
3,https://colab.research.google.com/drive/1d613I...,7,2
4,https://colab.research.google.com/drive/1qJPPq...,3,1
...,...,...,...
1128,https://colab.research.google.com/drive/1U7Xsu...,12,4
1129,https://colab.research.google.com/drive/19cdbn...,14,6
1130,https://colab.research.google.com/drive/1HZQNw...,8,3
1131,https://colab.research.google.com/drive/1PFqmH...,14,8


## GPT Review

In [None]:
import json

with open("batch_5_parsed_reviewed.json", "r") as f:
    results = json.load(f)

results = [r for r in results if r["n_messages"] != 0]
for r in results:
    r.pop("quality_review")

In [None]:
import os
from typing import List
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

import tiktoken
from pydantic import BaseModel, Field
from llama_index.llms.openai import OpenAI
from llama_index import ServiceContext, set_global_service_context
from llama_index.program import OpenAIPydanticProgram
from llama_index.callbacks import CallbackManager, TokenCountingHandler

api_key = os.environ["OPENAI_API_KEY"]


token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-4-1106-preview").encode
)
callback_manager = CallbackManager([token_counter])

class Feedback(BaseModel):
    score: int = Field(description="A score representing how good the conversation is in the given quality aspect, 1 is terrible, 5 is exemplary and flawless.", ge=1, le=5)
    issues: List[str] = Field(description="A concrete list of issues in the conversation. 15 words or less each.")
    praises: List[str] = Field(description="A concrete list of praise for exceptional behavior the conversation. 15 words or less each.")


class QualityAspect(BaseModel):
    name: str = Field(description="The name of the quality aspect.")
    instruction: str = Field(description="Instructions & details on how to inspect this quality aspect.")


quality_aspects = {
    "Overall": [
        QualityAspect(
            name="Completness",
            instruction="""
            How complete is the conversation? Completeness is defined as:
            - The assistant always responds to the user.
            - The conversation contains at least 1 back and forth between the user and the assistant.
            - The conversation flow is not broken.

            JUDGE THE ENTIRE CONVERSATION AS A WHOLE.
            """
        ),
    ],
    "User": [
        QualityAspect(
            name="Natural & Realistic", 
            instruction="""
            How does the user interaction resemble a real conversation and interactions a real user would have with a highly intelligent coding assistant.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
        QualityAspect(
            name="Coherent Follow ups", 
            instruction="""
            How coherent are the user's follow up messages to the assistant's messages in the conversation as a whole.
            Ideally, the user would incrementally build on the conversation to achieve their goal.

            ONLY JUDGE THE USER MESSAGES. DO NOT JUDGE THE ASSISTANT MESSAGES.
            """
        ),
    ],
    "Assistant": [
        QualityAspect(
            name="Code Quality", 
            instruction="""
            How good is the code that the assistant generates.
            Qualities:
            #   - Correctness
            #   - Optimality
            #   - PEP8 Compliance & Readability

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Text Quality", 
            instruction="""
            How good is the text that the assistant generates.
            Qualities:
            #   - Spelling
            #   - Grammar
            #   - Capitalization & Punctuation
            #   - Information Density (Should be a sweet spot leaning on the concise side, but not too concise... definitely not too verbose)
            #   - Explains Code Well

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        ),
        QualityAspect(
            name="Markdown Formatting", 
            instruction="""
            How good is the markdown formatting that the assistant generates. Is it leveraging markdown syntax tools to maximize the readability of the text?

            ONLY JUDGE THE ASSISTANT MESSAGES. DO NOT JUDGE THE USER MESSAGES.
            """
        )
    ]
}


def inspect_conversation_quality_aspect(conversation: List[List[dict]], quality_aspect: QualityAspect):
    """Inspect a conversation for a given quality aspect."""

    prompt_template_str = """
    IDENTITY:
    You are one of many specialized judges, so precisely focus on your quality aspect only.

    SITUATION:
    A large team is building a dataset of illustractions of dialogues showcasing the interaction between a user and a highly intelligent AI in the context of software development scenarios.
    - The user's replies should closely resemble authentic user engagement.
    - The AI's responses should aim to provide maximum benefit to the user.

    INSTRUCTIONS:
    Given the following conversation, please rate the quality of the conversation according to the given quality aspect.
    
    ALL QUALITY ASPECTS:
    {all_quality_aspects}

    YOUR QUALITY ASPECT:
    {quality_aspect}
    
    CONVERSATION:
    {conversation}
    """
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        callback_manager=callback_manager,
        output_cls=Feedback, 
        prompt_template_str=prompt_template_str, 
    )
    all_quality_aspects = "\n".join([f"- {key}: {quality_aspect.name}" for key in quality_aspects.keys() for quality_aspect in quality_aspects[key]])
    output = program( 
        all_quality_aspects=all_quality_aspects,
        quality_aspect=quality_aspect.model_dump(),
        conversation=conversation["messages"],
        description="Judge the quality of the conversation according to the given quality aspect. Provide constructive criticism, rarely praise."
    )
    return output


def inspect_all_conversation_quality_aspects(conversation) -> dict:
    """Inspect a conversation for all quality aspects."""

    quality_results = {}
    for key in quality_aspects.keys():
        for quality_aspect in quality_aspects[key]:
            r = inspect_conversation_quality_aspect(conversation, quality_aspect)
            quality_results[f"{key} - {quality_aspect.name}"] = r.model_dump()

    return quality_results

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

reviewed_results = []

def process_notebook(result):
    if result is None or len(result["notebook"]["messages"]) == 0:
        return None
    result["quality_review"] = inspect_all_conversation_quality_aspects(result["notebook"])
    return result

with tqdm(total=len(results), desc="Processing notebooks") as pbar:
    with ThreadPoolExecutor(max_workers=15) as executor:
        futures = [executor.submit(process_notebook, result) for result in results]
        for future in as_completed(futures):
            r = future.result()
            if r is not None:
                reviewed_results.append(r)
            pbar.update(1)


Processing notebooks: 100%|██████████| 1060/1060 [52:46<00:00,  2.99s/it] 


In [None]:
import json 

with open("batch_5_parsed_reviewed.json", "w") as f:
    json.dump(reviewed_results, f, indent=4)

with open("batch_5_parsed_reviewed.json", "r") as f:
    reviewed_results = json.load(f)

NameError: name 'reviewed_results' is not defined

In [None]:
reviewed_results[2]

{'task_link': 'https://colab.research.google.com/drive/1yHgim2bxGo_Sf0c1BsSaFROtZZdfpiud',
 'n_messages': 6,
 'number_of_turns': 2,
 'notebook': {'metadata': {'topic': 'algorithms > by_data_structure > heaps',
   'type': 'modification',
   'target_turns': '2+'},
  'messages': [{'role': 'User',
    'content': "I have a min-heap implemented as a list in Python, and I'd like to add a method to get the kth smallest element. Could you help me with that? Here's my current min-heap class:\n```python\nclass MinHeap:\n    def __init__(self):\n        self.heap = []\n\n    def insert(self, val):\n        # Implementation for insert\n        pass\n\n    def get_min(self):\n        # Implementation for get_min\n        pass\n```",
    'type': 'markdown'},
   {'role': 'Assistant',
    'content': "Sure, here's a simple breakdown of how the kth_smallest method works in our MinHeap class:\n\n- ***Copy Heap***: Start by making a copy of the min-heap.\n- ***Heapify***: Reorganize the copy to maintain mi

In [None]:
data_skeleton = []
for rr in reviewed_results:
    scores = []
    feedback = ""
    for key in rr["quality_review"].keys():
        scores.append(rr["quality_review"][key]["score"])
        stringified_issues = "\n".join([f"- {issue}" for issue in rr["quality_review"][key]["issues"]])
        feedback += f"{key}: \n{stringified_issues}\n\n"

    data_skeleton.append({
        "task_link": rr["task_link"],
        "avg_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "issues": feedback,
    })

df_gpt_reviews = pd.DataFrame(data_skeleton)
df_gpt_reviews

Unnamed: 0,task_link,avg_score,min_score,issues
0,https://colab.research.google.com/drive/182asg...,4.000000,3,Overall - Completness: \n- Assistant's initial...
1,https://colab.research.google.com/drive/1qfFDR...,4.666667,4,Overall - Completness: \n\n\nUser - Natural & ...
2,https://colab.research.google.com/drive/1yHgim...,4.333333,3,Overall - Completness: \n\n\nUser - Natural & ...
3,https://colab.research.google.com/drive/1IMCwR...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
4,https://colab.research.google.com/drive/1jAjfx...,4.333333,4,Overall - Completness: \n\n\nUser - Natural & ...
...,...,...,...,...
1055,https://colab.research.google.com/drive/1PFqmH...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
1056,https://colab.research.google.com/drive/1qPvTL...,4.833333,4,Overall - Completness: \n\n\nUser - Natural & ...
1057,https://colab.research.google.com/drive/1lekWL...,4.500000,4,Overall - Completness: \n\n\nUser - Natural & ...
1058,https://colab.research.google.com/drive/1x_-q3...,4.666667,4,Overall - Completness: \n\n\nUser - Natural & ...


In [None]:
pd.set_option('display.max_colwidth', 500)


borderline_avg_flags = df_gpt_reviews.sort_values(by="avg_score", ascending=False)[df_gpt_reviews["avg_score"] < 4]
critical_mistake_flags = df_gpt_reviews.sort_values(by="min_score", ascending=False)[df_gpt_reviews["min_score"] < 3]

all_flags = pd.concat([borderline_avg_flags, critical_mistake_flags], ignore_index=True)
all_flags = all_flags.drop_duplicates(subset=["task_link"])

all_flags.sort_values(by="min_score", ascending=False)

In [None]:
all_flags = all_flags.merge(completed_to_be_delivered_df, on="task_link", how="left")[["task_link", "avg_score", "min_score", "issues", "assigned_to_email"]]
all_flags = all_flags.rename(columns={"assigned_to_email": "original_author_email"})
all_flags.to_csv("batch_5_flags.csv", index=False)

## Fetch & Integrate Advanced Metadata

In [None]:
import sys 
sys.path.append('../../')
import pandas as pd
from src.sheets_utils import download_sheet_as_df


service_account_file = '../../creds/google__sa.json'
insights_spreadsheet_id = "1wUWll720oz6Rnc4YKHeWFWbZbTMWbda2rfVh8jPLU2g"

included_sheets = [
    "Expanded Enriched Data",
    "Behavioural Tags",
    "Use Case Tags",
    "Programming Language Tags",
    "Dependency Tags"
]

df_metadata = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Expanded Enriched Data")

df_metadata__behavioral = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Behavioural Tags")

df_metadata__use_case = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Use Case Tags")

df_metadata__programming_language = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Programming Language Tags")

df_metadata__dependency = download_sheet_as_df(service_account_file, insights_spreadsheet_id, "Dependency Tags")

In [None]:
rich_task_metadata = []

for task_link in df_metadata["task_link"].tolist():
    row_main = df_metadata.loc[df_metadata["task_link"] == task_link].to_dict(orient="records")[0]

    behavioral_rows = df_metadata__behavioral.loc[df_metadata__behavioral["task_link"] == task_link].to_dict(orient="records")
    row_main["behavioral_tags"] = [{key: row[key] for key in row.keys() if key != "task_link"} for row in behavioral_rows]


    programming_language_rows = df_metadata__programming_language.loc[df_metadata__programming_language["task_link"] == task_link].to_dict(orient="records")
    row_main["programming_language_tags"] = [{key: row[key] for key in row.keys() if key != "task_link"} for row in programming_language_rows]

    dependency_rows = df_metadata__dependency.loc[df_metadata__dependency["task_link"] == task_link].to_dict(orient="records")
    row_main["dependency_tags"] = [{key: row[key] for key in row.keys() if key != "task_link"} for row in dependency_rows]    

    # Edit the original metadata
    row_main["topic_classication"] = row_main.pop("metadata__topic")

    row_main["area_of_focus_classification"] = {
        "top_level": row_main.pop("area_of_focus__top_level"),
        "sub_level": row_main.pop("area_of_focus__sub_level"),
        "detailed_level": row_main.pop("area_of_focus__detailed_level"),
    }
    row_main["domain_classification"] = {
        "top_level": row_main.pop("domain__top_level"),
        "sub_level": row_main.pop("domain__sub_level"),
        "detailed_level": row_main.pop("domain__detailed_level"),
    }
    rich_task_metadata.append(row_main)

In [None]:
valid_notebooks = []
for r, n in zip(results, notebooks):
    if r is None or r["n_messages"] == 0:
        continue
    n["task_link"] = r["task_link"]
    valid_notebooks.append(n)


parsed_jsons = []
for vn in valid_notebooks:
    for rtm in rich_task_metadata:
        if vn["task_link"] == rtm["task_link"]:
            vn["metadata"] = rtm
            parsed_jsons.append(vn)


for pj in parsed_jsons:
    pj["id"] = pj.pop("task_link").split("/")[-1]
    try:
        pj["metadata"].pop("duration_mins")
        pj["metadata"].pop("batch_id")
    except KeyError:
        pass


for i, conversation in enumerate(parsed_jsons):
    drive_id = conversation["id"] 
    with open(f"json_conversations/{BATCH_NAME}/{drive_id}.json", "w") as f:
        f.write(json.dumps(conversation))

## Upload JSONL

In [None]:
from src.gdrive_api import build_service
from src.gdrive_api.folder_upload import upload_folder

service = build_service(service_account_file)
uploaded_files = upload_folder(service, f'json_conversations/{BATCH_NAME}/', destination_folder_url, force_replace = True, is_url=True)
uploaded_files

------------------------------------------------------------
Processing directory .: 1 of 0 in total.
Uploading file 1 of 1060 in '.', 1 of 1060 in total.
Uploading new file '1WUFylpi4ykfVh6s9oOAUED43SUPmKKGk.json'.
File '1WUFylpi4ykfVh6s9oOAUED43SUPmKKGk.json' has been uploaded.
Uploaded '1WUFylpi4ykfVh6s9oOAUED43SUPmKKGk.json' to folder ID '1pEC7hlH3DTMUrkEHeDZduG7AyZf2lSRR'.
1WUFylpi4ykfVh6s9oOAUED43SUPmKKGk.json
Uploading file 2 of 1060 in '.', 2 of 1060 in total.
Uploading new file '1kXNPgDueZBr-4Uv-2MckDHCnuwUpmxxK.json'.
File '1kXNPgDueZBr-4Uv-2MckDHCnuwUpmxxK.json' has been uploaded.
Uploaded '1kXNPgDueZBr-4Uv-2MckDHCnuwUpmxxK.json' to folder ID '1pEC7hlH3DTMUrkEHeDZduG7AyZf2lSRR'.
1kXNPgDueZBr-4Uv-2MckDHCnuwUpmxxK.json
Uploading file 3 of 1060 in '.', 3 of 1060 in total.
Uploading new file '1qwc1IM4wfZW74nkWpfFccw16Me4alD0J.json'.
File '1qwc1IM4wfZW74nkWpfFccw16Me4alD0J.json' has been uploaded.
Uploaded '1qwc1IM4wfZW74nkWpfFccw16Me4alD0J.json' to folder ID '1pEC7hlH3DTMUrkEHeD

KeyboardInterrupt: 

In [66]:
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Function to check if a file is a folder
def is_folder(file):
    return file.get('mimeType') == 'application/vnd.google-apps.folder'

# Function to process files and folders
def process_files(service, folder_id, parent_folders=[]):
    query = f"'{folder_id}' in parents and trashed = false"
    page_token = None

    all_files = []
    while True:
        response = service.files().list(q=query,
                                        spaces='drive',
                                        fields='nextPageToken, files(id, name, mimeType, webViewLink)',
                                        pageToken=page_token).execute()

        for file in response.get('files', []):
            # Skip 'tool_data' folder
            if file.get('name') == 'tool_data' and is_folder(file):
                continue

            all_files.append(file)

            # Process the file or folder
            print('Processing:', '/'.join(parent_folders + [file.get('name')]))

            # If it's a folder, recursively process its contents
            if is_folder(file):
                children_files = process_files(service, file.get('id'), parent_folders + [file.get('name')])
                all_files.extend(children_files)

        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break

    return all_files

# Authenticate and create the service
SERVICE_ACCOUNT_FILE = service_account_file
SCOPES = ['https://www.googleapis.com/auth/drive']
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('drive', 'v3', credentials=credentials)

# Replace with your Google Drive folder ID
folder_id = destination_folder_url.split("/")[-1]

# Start processing from the specified folder
all_files = process_files(service, folder_id)

jsonl_df = pd.DataFrame(all_files)
jsonl_df = jsonl_df[["id", "name", "webViewLink"]]
jsonl_df

Processing: 1jbm83vIC1CGrMgkYIQj0pHC_7FN4zTX1.json
Processing: 15V0N5UP2ynf2GKCNdNS1YQrxsjNmmpAd.json
Processing: 1KXsoK4q7_IwFMtP7DzvfFnHIkI1pFBSP.json
Processing: 1v8JmOZp9QNWLxxnrPPVXcgzOuk-DoZi0.json
Processing: 1j2lEEIGjdR9SfXG0tvH3-H3YDC9tDClm.json
Processing: 1ms4UuR55ollJT4Oc2LYGK9b8SggiQvxM.json
Processing: 1FVLxbQs_H2qcSj1j2Qla40tqN87IQm7v.json
Processing: 1v2GJTZ7i1JpIgDIaQtucpF7Dy1as8Ooc.json
Processing: 1gx519taTTsaNTh2-zKDa7uWckl-Pckbi.json
Processing: 1kvN8pNE6GL3kWdWsMQfyzht5WcwZ7xA6.json
Processing: 1Nq2dy7IffJkUwpRzLEgcKRX3T3f953BH.json
Processing: 1UTMWverEFLkwTVwqkbQ0zspSan4e9ZM4.json
Processing: 1IDG28f3Vy5siSxNNrjwTCpBX5C1j4UCh.json
Processing: 1FKh_FNbZ0tDgyZ5Vm1sDnBuiWlmtNfUP.json
Processing: 11GJq_2NtiBVutIw-ymA5kRuPC8xttC1F.json
Processing: 1jSLEUGTmoq4ALh3jdDKGvFk5966V_eTI.json
Processing: 1OyK7dH9bqdD0wxk1Pb5r3Rzssmnd7YvM.json
Processing: 16UTka0tBw50Uhp50fL92hWnkEJZKXXsZ.json
Processing: 1sFadJ4cxjIluKFqToc6J6crNbsvP38AL.json
Processing: 1AO6XL_xTfKEEBtx1S8

Unnamed: 0,id,name,webViewLink
0,1GuL5UttXbiQXKnTkAFTvzd9sy3Q3FPSB,1jbm83vIC1CGrMgkYIQj0pHC_7FN4zTX1.json,https://drive.google.com/file/d/1GuL5UttXbiQXK...
1,1dOCtcAPGw8jADB4o41l8OPVCfV1CUh-H,15V0N5UP2ynf2GKCNdNS1YQrxsjNmmpAd.json,https://drive.google.com/file/d/1dOCtcAPGw8jAD...
2,1SzcwrnyB4N7HXnSYF8-JJzrlcUFaTqpN,1KXsoK4q7_IwFMtP7DzvfFnHIkI1pFBSP.json,https://drive.google.com/file/d/1SzcwrnyB4N7HX...
3,1BZrXRp1FMYT1DWmbraxvGq6MswVyCJlL,1v8JmOZp9QNWLxxnrPPVXcgzOuk-DoZi0.json,https://drive.google.com/file/d/1BZrXRp1FMYT1D...
4,1Om-LgMeIvkdUWDShYnVf-JguXC2cC5Sn,1j2lEEIGjdR9SfXG0tvH3-H3YDC9tDClm.json,https://drive.google.com/file/d/1Om-LgMeIvkdUW...
...,...,...,...
1055,1-M8MMnr2zEbzsAtkMMHHdxdU8tx68i6-,1Xnx1SuC1mg--SCm_VqWyHzFnXOAT6alP.json,https://drive.google.com/file/d/1-M8MMnr2zEbzs...
1056,1H-WWtDSRviikrClD34-_gXIxGsB6GoLa,1A4WAqpHRfaPQwyjFPSSIXtjxrBT0gkDP.json,https://drive.google.com/file/d/1H-WWtDSRviikr...
1057,1SfnnxXheEizCsqoGXf_FaWmo1jLDaTh2,1qwc1IM4wfZW74nkWpfFccw16Me4alD0J.json,https://drive.google.com/file/d/1SfnnxXheEizCs...
1058,1xDxLPp4TeRIomoKWITS1c9fRVZ37eZAx,1kXNPgDueZBr-4Uv-2MckDHCnuwUpmxxK.json,https://drive.google.com/file/d/1xDxLPp4TeRIom...


In [68]:
conversation.keys()

parsed_jsons_ref = [
    {
        "colab_id": pj["id"],
        "task_link": pj["metadata"]["task_link"],
        "number_of_turns": pj["metadata"]["number_of_turns"],
    }
    for pj
    in parsed_jsons
]
conversation_df = pd.DataFrame(parsed_jsons_ref)
conversation_df

Unnamed: 0,colab_id,task_link,number_of_turns
0,1IMCwRqYS6N68BY7_82VeK5Y3J88t9VKt,https://colab.research.google.com/drive/1IMCwR...,2
1,1d613Ifi3UsHlbhh5iBkeMjndcuchNnsH,https://colab.research.google.com/drive/1d613I...,2
2,1qJPPqc7d1M4ljB2fWss3XniuCXL_-J6w,https://colab.research.google.com/drive/1qJPPq...,1
3,1cP6qzbSAWYKL8QgTNPa9S85qApJejzde,https://colab.research.google.com/drive/1cP6qz...,1
4,1xw5RAWZpiWGN9j4i-SO2ZKQ41IeNciaG,https://colab.research.google.com/drive/1xw5RA...,3
...,...,...,...
1055,1QXk9FqJ7IMTlvrsfeOZfjW8kUdfGQ3Ak,https://colab.research.google.com/drive/1QXk9F...,2
1056,1lekWL0PjIsa3TdoJ-q47sANQ01sptkn3,https://colab.research.google.com/drive/1lekWL...,2
1057,1s_KMO8OdDyOPcmGKyy-nc8H91-AHdZ4N,https://colab.research.google.com/drive/1s_KMO...,1
1058,1PFqmHU1YtauKPd4fwD8zQI16JZwK4cIz,https://colab.research.google.com/drive/1PFqmH...,7


In [69]:
jsonl_df["colab_id"] = jsonl_df["name"].apply(lambda x: x.split(".")[0])


df_merged = conversation_df.merge(jsonl_df, on="colab_id", how="inner")
df_merged = df_merged[["task_link", "number_of_turns", "webViewLink"]]
df_merged = df_merged.rename(columns={"webViewLink": "jsonl_link"})
df_merged

Unnamed: 0,task_link,number_of_turns,jsonl_link
0,https://colab.research.google.com/drive/1IMCwR...,2,https://drive.google.com/file/d/10tqk4wcBJFbzb...
1,https://colab.research.google.com/drive/1d613I...,2,https://drive.google.com/file/d/135lxPXEWHb2Pi...
2,https://colab.research.google.com/drive/1qJPPq...,1,https://drive.google.com/file/d/17faYUO_xOnLan...
3,https://colab.research.google.com/drive/1cP6qz...,1,https://drive.google.com/file/d/12lm8x2QikaLaZ...
4,https://colab.research.google.com/drive/1xw5RA...,3,https://drive.google.com/file/d/1t9D3joY980t40...
...,...,...,...
1055,https://colab.research.google.com/drive/1QXk9F...,2,https://drive.google.com/file/d/1S4dpvl7lDK0Ix...
1056,https://colab.research.google.com/drive/1lekWL...,2,https://drive.google.com/file/d/1BPxrto_U7ZCiy...
1057,https://colab.research.google.com/drive/1s_KMO...,1,https://drive.google.com/file/d/1rBeAk3HCK_aVG...
1058,https://colab.research.google.com/drive/1PFqmH...,7,https://drive.google.com/file/d/15de0-zkqjJxqK...


## Upload Batch Sheet

In [70]:
from src.sheets_utils import upload_df_to_sheet

cols = ["task_link", "jsonl_link", "number_of_turns"]
upload_df_to_sheet(service_account_file, "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4", BATCH_NAME, df_merged[cols])