## Settings and pointers

In [4]:
service_account_file = '../../creds/google__sa.json'

tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"
included_sheet_names = [
    "Conversations_Batch_7",
    "Conversations_Batch_8",
    "Conversations_Batch_9",
]

jupyter_gdrive_folder_ids = [
    "1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9", # V0
    "1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb", # V1
    "1jV7WA5zB172DJUp7Z2XzHr62E6U6_NtY",
]

delivery_sheet_id = "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4"
delivery_jsonl_gdrive_folder_id = "1kVHndPCu_WRrgF5rwzUfY_9ZmMG_Jyeh"
destination_folder_url = f"https://drive.google.com/drive/folders/{delivery_jsonl_gdrive_folder_id}"
DELIVERY_BATCH_NAME = "Batch 8"

insights_sheet_id = "1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w"
INSIGHTS_VERSION_TAB = "v1 (Jan 25)"

## Source Code

In [2]:
import sys 
sys.path.append('../../')
import json
import io
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload, MediaIoBaseUpload



def crawl_gdrive_folder(folder_id, service_account_file, file_extension=None):
    """
    Crawls a Google Drive folder and returns its contents as a list of items. 
    Every item can either be a file or a folder. 
    Folder items are represented as dictionaries with the keys 'id', 'name' and 'children'.
    File items are represented as dictionaries with the keys 'id' and 'name'.

    Parameters
    folder_id: str
        The ID of the Google Drive folder to crawl
    service_account_file: str
        The path to the service account file
    file_extension: str
        The file extension to filter for. If None, all files are returned.

    Returns a list of dictionaries
    """
    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Crawl the folder
    items = []
    page_token = None
    while True:
        response = service.files().list(
            q=f"'{folder_id}' in parents",
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType)',
            pageToken=page_token
        ).execute()
        for file in response.get('files', []):
            if file.get('mimeType') == 'application/vnd.google-apps.folder':
                items.append({
                    'id': file['id'],
                    'name': file['name'],
                    'children': crawl_gdrive_folder(file['id'], service_account_file, file_extension)
                })
            elif file_extension is None or file['name'].endswith(file_extension):
                items.append({
                    'id': file['id'],
                    'name': file['name']
                })
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break
    return items



def download_gdrive_file(file_id, service_account_file):
    """
    Downloads a file from Google Drive, attempting to parse it as a JSON file, returning the parsed contents.

    Parameters
    file_id: str
        The ID of the file to download
    service_account_file: str
        The path to the service account file

    Returns the parsed contents of the file as a dict, or None if the file could not be downloaded
    """
    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Download the file
    try:
        request = service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
        
        # Load file contents
        fh.seek(0)
        return json.load(fh)
    
    except Exception as e:
        print(f"Error parsing file contents: {e}")
        return None



def reupload_gdrive_file(file_contents, file_id, service_account_file):
    """
    Re-uploads a dictionary from memory as a JSON file to Google Drive. 
    If the file already exists, it should present the same id, creating a new version of the file in drive.

    Parameters:
    - file_contents: dict
        The file contents to upload.
    - file_id: str
        The ID of the file to upload.
    - service_account_file: str
        The path to the service account file.

    Returns True if the file was successfully uploaded, False otherwise.
    """
    # Initialize Google Drive API
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    # Convert dictionary to bytes and prepare for upload
    file_stream = io.BytesIO(json.dumps(file_contents).encode())
    media = MediaIoBaseUpload(file_stream, mimetype='application/json', resumable=True)

    try:
        # Perform the update operation
        service.files().update(fileId=file_id, media_body=media).execute()
        return True
    except Exception as e:
        print(f"Error re-uploading file: {e}")
        return False


def parallel_execute_with_progress(function, arguments, max_workers=10):
    """
    Executes a function in parallel with multiple arguments displaying a tqdm progress bar.

    Parameters
    function: function
        The function to execute
    arguments: list
        A list of tuples, where each tuple contains the arguments to pass to the function
    max_workers: int
        The maximum number of workers to use

    Returns a list of results
    """
    # Create a thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Execute the function for each argument
        futures = [executor.submit(function, *args) for args in arguments]
        results = []
        for future in tqdm(as_completed(futures), total=len(futures)):
            results.append(future.result())
    return results



## Testing Playground

In [3]:
delivery_batches_parent_folder_id = "1AhR0jAYwgJM1iE_5XSoQvsgcQnKhX4Rc"
pointers = crawl_gdrive_folder(delivery_batches_parent_folder_id, service_account_file)

In [4]:
for p in pointers:
    if 'children' in p:
        print(f"Folder: {p['name']}, Children: {len(p['children'])}")

Folder: Batch 10, Children: 561
Folder: Batch 9, Children: 1305
Folder: Batch 8, Children: 559
Folder: Batch 7, Children: 492
Folder: Batch 6, Children: 482
Folder: Redo 1-5 (Jan 25 Feedback), Children: 2995
Folder: Batch 5, Children: 1060
Folder: Batch 4, Children: 512
Folder: Batch 3, Children: 701
Folder: Batch 2, Children: 446
Folder: Batch 1, Children: 300


In [5]:
test_contents = download_gdrive_file(pointers[0]['children'][1]['id'], service_account_file)
test_contents

{'metadata': {'task_link': 'https://colab.research.google.com/drive/1Q43jfkwuqyyG7TzaIooq7IDOy35dV0Pe',
  'number_of_turns': 2,
  'batch_id': '9',
  'domain': {'top_level': 'Web Development', 'sub_level': 'Flask'},
  'action': 'Write code in python',
  'use_case__summary': 'User seeks assistance for Flask app database connection and secure environment variable management.',
  'gpt_estimated_duration': 30},
 'messages': [{'role': 'User',
   'content': "I'm building a `Flask` app for internal use and need to connect to our `PostgreSQL` database. Can you help me write a Python code using `SQLAlchemy` to create a connection pool and retrieve the first 10 employee records from the `employees` table? I'm looking for a practical example that's easy to adapt.",
   'type': 'markdown'},
  {'role': 'Assistant',
   'content': "Here's a code snippet that accomplishes your request:",
   'type': 'markdown'},
  {'role': 'Assistant',
   'content': 'import os\nimport psycopg2\nfrom sqlalchemy import cre

In [9]:
reupload_gdrive_file(test_contents, pointers[0]['children'][0]['id'], service_account_file)

True

In [None]:
parallel_execute_with_progress(reupload_gdrive_file, [(test_contents, pointers[0]['children'][0]['id'], service_account_file)])

In [5]:
import os
from typing import List
from pydantic import BaseModel, Field
from llama_index.program import OpenAIPydanticProgram
from llama_index.llms.openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures

from tqdm import tqdm
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())
api_key = os.environ["OPENAI_API_KEY"]


class HierarchicalCategory(BaseModel):
    """Data model for hierarchical category classification."""
    top_level: str
    sub_level: str

def classify_conversation_by_domain(conversation: List[dict]) -> HierarchicalCategory:
    DOMAIN_CATEGORIES = """
        - Python basics & scripting
        - Problem Solving
        - Interview Prep
        - Web Development
        - Testing
        - Cloud Computing / Frameworks
        - Data Analysis
        - Machine Learning
        - Other languages
        - Other
    """

    prompt_template_str = """
    Categorize the theme of user requests in the following conversation by domain into one of the following top-level categories, then sub categories that you think is descriptive & appropriate:
    {categories}

    Conversation:
    {conversation}
    """

    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=HierarchicalCategory,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        categories=DOMAIN_CATEGORIES,
        conversation=conversation["messages"],
    )
    return output


def classify_conversation_by_action(
    conversation: List[dict]
) -> HierarchicalCategory:
    ACTION_CATEGORIES = """
    - Write code in python
    - Explain code
    - Fix / refactor / optimize code
    - Debug error trace
    - Write unit tests
    - Write CI/CD code
    - Do a code review
    - Write / modify / fix beam code
    - Write / modify / fix spark code
    - Write end to end ML training code
    - Help me take an interview
    - Answer ML research questions
    - Answer infra questions
    - Write / modify / fix SQL code
    - Write / modify / fix JavaScript code
    - Scrape a website
    """
    prompt_template_str = """
    Categorize the initial user request in the following conversation by requested action into one of the following top-level categories. Sub-level should be empty string always. In case there's no natural fit, use "Other" as the top-level category.
    
    Categories:
    {categories}

    Conversation:
    {conversation}

    """

    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=HierarchicalCategory,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        categories=ACTION_CATEGORIES,
        conversation=conversation["messages"],
    )
    return output


class SummaryResult(BaseModel):
    """Data model for the summary result."""
    summary: str = Field(
        description="A short summary containing 1 sentence, 15 words max, focused on the specific theme. [super concise language]"
    )

def exec_summary(conversation: List[List[dict]]):
    prompt_template_str = """
    Given the following conversation, please, generate an executive summary of the conversation.

    User Use Case, why user uses the Assistant in this conversation, in general terms, **for what** the User is using it. Not from a technical perspective, but from a daily life situation perspective. 
    Example: work, homework, exam, studying, inteview, debugging, etc...

    It should also contain a little bit of the context of the conversation, and the main goal of the conversation.

    Conversation:
    {conversation}
    """
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=SummaryResult,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        conversation=conversation["messages"]
    )
    return output


class GPTEstimationResult(BaseModel):
    """Data model for the GPT estimation result."""
    estimated_duration: int = Field(
        description="The estimated duration of demonstrating the conversation in minutes."
    )

def gpt_estimated_duration(conversation: dict) -> int:
    prompt_template_str = """
    Given the following conversation which has been demonstrated by a median skilled technical human playing both User and Assistant... He also is responsible for making sure the assistant responses are flawless...
    Estimate how many minutes it would take to Design, Research for, Write & Verify the code in this.
    You can consider the following factors to estimate (Conversation Length, Topic/Code Complexity).

    If you get this right, you will save my life.

    Conversation:
    {conversation}
    """
    program = OpenAIPydanticProgram.from_defaults(
        llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
        output_cls=GPTEstimationResult,
        prompt_template_str=prompt_template_str,
        verbose=False,
    )
    output = program(
        conversation=conversation["messages"]
    )
    return output


def process_conversation__metadata_extraction(conversation, task_link):
    domain = classify_conversation_by_domain(conversation)
    action = classify_conversation_by_action(conversation)
    summary = exec_summary(conversation)
    estimate_duration = gpt_estimated_duration(conversation)
    if "metadata" in conversation:
        conversation["metadata"].update({
            "domain": domain.model_dump(), 
            "action": action.model_dump()["top_level"], 
            "use_case_summary": summary.model_dump()["summary"],
            "gpt_estimated_duration": estimate_duration.model_dump()["estimated_duration"],
            "task_link": task_link
        })
    else:
        conversation["metadata"] = {
            "domain": domain.model_dump(), 
            "action": action.model_dump()["top_level"], 
            "use_case_summary": summary.model_dump()["summary"],
            "gpt_estimated_duration": estimate_duration.model_dump()["estimated_duration"],
            "task_link": task_link
        }
    return conversation
    

In [6]:
import os

from pydantic import BaseModel, Field
from llama_index.program import OpenAIPydanticProgram
from llama_index.llms.openai import OpenAI
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())
api_key = os.environ["OPENAI_API_KEY"]


def transform__concatenate_back_to_back_messages_from_same_role(messages):
    """
    Merge back-to-back user & ai messages into a single message, with the content of the messages concatenated together.
    """
    if len(messages) == 0:
        return []

    concatenated_messages = []
    current_concatenation = ""
    current_role = messages[0]['role']
    for message in messages:
        if message.get('role') == current_role:
            current_concatenation += message.get('content').strip() + "\n\n"
        else:
            concatenated_messages.append({
                'role': current_role,
                'content': current_concatenation.strip()
            })
            current_concatenation = message.get('content') + "\n\n"
            current_role = message.get('role')

    # Add the last concatenation
    concatenated_messages.append({
        'role': current_role,
        'content': current_concatenation.rstrip()
    })
    return concatenated_messages



def transform__code_blocks_to_syntax_highlighted_md(messages): 
    """
    Identify the language of each code block and transform it into markdown syntax highlighting.
    """
    transformed_messages = []

    # Isolate Code blocks
    for message in messages:

        if message.get("type") != "code":
            transformed_messages.append(message)
            continue

        message_copy = message.copy()

        # Identify the language of each code block
        class Language(BaseModel):
            """Data model for identifying the language of a code block for use in markdown syntax highlighting."""
            language: str

        prompt_template_str = """
        Given the following conversation, Identify the language of each code block.
        The output should be compatible with markdown syntax highlighting for triple backtick code blocks.

        Contents:
        {code}
        """
        program = OpenAIPydanticProgram.from_defaults(
            llm=OpenAI(api_key=api_key, model="gpt-4-1106-preview", temperature=0),
            output_cls=Language,
            prompt_template_str=prompt_template_str,
            verbose=False,
        )
        output = program(
            code=message_copy.get("content").strip()
        )
        language = output.language

        # Strip leading and trailing spaces/newlines from the code block
        message_copy["content"] = f"```{language}\n{message_copy.get('content').strip()}\n```"
        message_copy["type"] = "markdown"

        # Wrap in triple backticks and append to transformed messages
        transformed_messages.append(message_copy)

    return transformed_messages


def is_metadata_v1(file_contents):
    """
    Verify the schema.

    Includes metadata dict, which includes these keys
    - domain
    - action
    - use_case_summary
    - gpt_estimated_duration
    - task_link
    """
    if file_contents is None:
        return False
    
    if "metadata" in file_contents:
        if "domain" in file_contents["metadata"] and \
            "action" in file_contents["metadata"] and \
            "use_case_summary" in file_contents["metadata"] and \
            "gpt_estimated_duration" in file_contents["metadata"] and \
            "task_link" in file_contents["metadata"]:
            return True
    return False



def transform_pipeline(file_pointer, update_metadata=False):
    try:
        # Download the file
        file_contents = download_gdrive_file(file_pointer['id'], service_account_file)

        # Transform the file
        transformed_messages = transform__code_blocks_to_syntax_highlighted_md(file_contents['messages'])
        file_contents["messages"] = transform__concatenate_back_to_back_messages_from_same_role(transformed_messages)

        # Update the metadata if not v1 schema
        if not is_metadata_v1(file_contents) and update_metadata:
            colab_id = file_pointer['name'].split(".")[0]
            colab_link = f"https://colab.research.google.com/drive/{colab_id}"
            file_contents = process_conversation__metadata_extraction(file_contents, colab_link)

        # Re-upload the file
        reupload_gdrive_file(file_contents, file_pointer['id'], service_account_file)

        return file_contents, None
    except Exception as e:
        return False, e
    

def transform__fix_task_link(file_pointer):
    try:
        # Download the file
        file_contents = download_gdrive_file(file_pointer['id'], service_account_file)

        colab_id = file_pointer['name'].split(".")[0]
        colab_link = f"https://colab.research.google.com/drive/{colab_id}"
        file_contents["metadata"]["task_link"] = colab_link

        # Re-upload the file
        reupload_gdrive_file(file_contents, file_pointer['id'], service_account_file)

        return file_contents, None
    except Exception as e:
        return False, e


def transform__fix_use_case_summary_key(file_pointer):
    try:
        # Download the file
        file_contents = download_gdrive_file(file_pointer['id'], service_account_file)

        # Update the metadata if not v1 schema
        file_contents["metadata"]["use_case__summary"] = file_contents["metadata"].pop("use_case_summary")

        # Re-upload the file
        reupload_gdrive_file(file_contents, file_pointer['id'], service_account_file)

        return file_contents, None
    except Exception as e:
        return False, e


# Flatten the pointers recursively
def flatten_pointers(pointers, folders=None):
    flat_pointers = []
    for pointer in pointers:
        if folders is None or pointer['name'] in folders:
            if 'children' in pointer:
                flat_pointers.extend(flatten_pointers(pointer['children']))
            else:
                flat_pointers.append(pointer)
    return flat_pointers

flat_pointers = flatten_pointers(pointers, folders=["Batch 10"])
len(flat_pointers)

561

In [7]:
# Execute the pipeline in parallel on all pointers, Use chunks and Save/Load checkpoints
from datetime import datetime
import pickle


def save_checkpoint(checkpoint, filename):
    with open(filename, 'wb') as f:
        pickle.dump(checkpoint, f)


def load_checkpoint(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)
    

def execute_pipeline_on_pointers(pointers, chunk_size=1500, checkpoint_filename=None):
    # Load checkpoint if it exists
    if checkpoint_filename is not None and os.path.exists(checkpoint_filename):
        checkpoint = load_checkpoint(checkpoint_filename)
    else:
        checkpoint = {
            "completed": [],
            "failed": []
        }
        
    # Execute the pipeline in parallel
    for i in range(0, len(pointers), chunk_size):
        chunk = pointers[i:i + chunk_size]
        results = parallel_execute_with_progress(transform_pipeline, [(pointer,) for pointer in chunk], max_workers=15)
        for result, error in results:
            if error is not None:
                checkpoint["failed"].append(result)
            else:
                checkpoint["completed"].append(result)

        # Save checkpoint
        if checkpoint_filename is not None:
            save_checkpoint(checkpoint, checkpoint_filename)
    return checkpoint


checkpoint_filename = "checkpoint.pkl"
checkpoint = execute_pipeline_on_pointers(flat_pointers, checkpoint_filename=checkpoint_filename)

100%|██████████| 561/561 [03:41<00:00,  2.53it/s]


In [36]:
test_pointer = {'id': '1iD6sjEucJO09TBzUjBXoQX0FpbI1wuvL',
   'name': '1fpx6m5PT_j6-PsjZgmjMNvXucj9EOomw.json'}
transform_pipeline(test_pointer)

({'metadata': {'task_link': 'https://colab.research.google.com/drive/1fpx6m5PT',
   'number_of_turns': 3,
   'batch_id': '7',
   'domain': {'top_level': 'Python basics & scripting',
    'sub_level': 'Code maintainability and readability'},
   'action': 'Explain code',
   'use_case__summary': 'User seeks advice on commenting Python code for a digital library system to enhance maintainability and readability.',
   'gpt_estimated_duration': 10,
   'use_case_summary': 'User seeks advice on commenting Python code for a digital library system to enhance maintainability and readability.'},
  'messages': [{'role': 'User',
    'content': "I've been working on a Python script for a digital library management system. It handles book inventory and user interactions, but I'm concerned about the long-term maintenance and readability of the code. Can you suggest how to effectively add comments to my code?"},
   {'role': 'Assistant',
   {'role': 'User',
    'content': 'That makes sense. How detailed s

In [90]:
current_folder_names = [
    "Batch 7",
    "Batch 6",
    "Redo 1-5 (Jan 25 Feedback)"
]

# Download the files
def flatten_pointers(pointers, folders=None):
    flat_pointers = []
    for pointer in pointers:
        if folders is None or pointer['name'] in folders:
            if 'children' in pointer:
                flat_pointers.extend(flatten_pointers(pointer['children']))
            else:
                flat_pointers.append(pointer)
    return flat_pointers

# flat_filtered_pointers = flatten_pointers(pointers, current_folder_names)
# print(len(flat_filtered_pointers))
# results = parallel_execute_with_progress(download_gdrive_file, [(pointer['id'], service_account_file) for pointer in flat_filtered_pointers], max_workers=50)


# Download the delivery sheet
# from src.sheets_utils import download_sheet_as_df
# batch_to_links = {
#     "Batch 1": set(download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 1")["task_link"].to_list()),
#     "Batch 2": set(download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 2")["task_link"].to_list()),
#     "Batch 3": set(download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 3")["task_link"].to_list()),
#     "Batch 4": set(download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 4")["task_link"].to_list()),
#     "Batch 5": set(download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 5")["task_link"].to_list()),
#     "Batch 6": set(download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 6")["task_link"].to_list()),
#     "Batch 7": set(download_sheet_as_df(service_account_file, delivery_sheet_id, "Batch 7")["task_link"].to_list()),
# }


# Parse their metadata into a df corresponding to the links
import pandas as pd
insights_records = []
for r in results:
    if r is None:
        continue

    batch_id = None
    for batch, links in batch_to_links.items():
        if r["metadata"].get("task_link", None) in links:
            batch_id = batch.split()[1]
            break

    record = {
        "task_link": r["metadata"].get("task_link", None),
        "batch_id": batch_id,
        "number_of_turns": r["metadata"].get("number_of_turns", None),
        "gpt_estimated_duration": r["metadata"].get("gpt_estimated_duration", None),
        "action": r["metadata"].get("action", None),
        "domain__top_level": r["metadata"].get("domain", {}).get("top_level", None),
        "domain__sub_level": r["metadata"].get("domain", {}).get("sub_level", None),
        "use_case__summary": r["metadata"].get("use_case_summary", None)
    }
    insights_records.append(record)
insights_df = pd.DataFrame(insights_records)
insights_df = insights_df.fillna("")

# Upload the df to the insights sheet
# from src.sheets_utils import upload_df_to_sheet, GoogleSheetsService

# sheets_client = GoogleSheetsService(service_account_file, ['https://www.googleapis.com/auth/spreadsheets'])
# values = [insights_df.columns.tolist()] + insights_df.values.tolist()
# sheets_client.update_or_append_data_to_sheet(insights_sheet_id, INSIGHTS_VERSION_TAB, values)

In [91]:
results[0]

{'metadata': {'task_link': 'https://colab.research.google.com/drive/1ztZJScju_m-9MYF2v1fze2TVgjPadgNF',
  'number_of_turns': 4,
  'batch_id': '7',
  'domain': {'top_level': 'Machine Learning',
   'sub_level': 'Neural Network Design'},
  'action': 'Write end to end ML training code',
  'use_case__summary': 'The user is refining a fitness recommendation system, seeking to enhance it with historical workout data and feedback loops for personalized plans, and exploring strategies for scaling and integrating diverse datasets.',
  'gpt_estimated_duration': 60,
  'use_case_summary': 'The User seeks to enhance a fitness recommendation system with ML, focusing on scalability and integrating diverse data for holistic advice.'},
 'messages': [{'role': 'User',
   'content': "As we refine our ML-driven fitness recommendation system, we've focused on tailoring workout plans according to individual progress and feedback. I'm considering incorporating a user's historical workout data into the model. C

In [95]:
insights_df

Unnamed: 0,task_link,batch_id,number_of_turns,gpt_estimated_duration,action,domain__top_level,domain__sub_level,use_case__summary
0,https://colab.research.google.com/drive/1ztZJS...,7,4,60.0,Write end to end ML training code,Machine Learning,Neural Network Design,The User seeks to enhance a fitness recommenda...
1,https://colab.research.google.com/drive/1QBLfW...,7,4,60.0,Write end to end ML training code,Machine Learning,Neural Networks / Deep Learning,The User is using the Assistant to enhance a f...
2,https://colab.research.google.com/drive/1OiW7x...,,3,30.0,Do a code review,Python basics & scripting,Code Review and Best Practices,User seeks assistance with Python script revie...
3,https://colab.research.google.com/drive/1pzywt...,7,2,10.0,Debug error trace,Python basics & scripting,Error Handling,"User seeks assistance for a coding error, usin..."
4,https://colab.research.google.com/drive/1JJuMy...,7,4,30.0,Help me take an interview,Interview Prep,Mock Interview for Web Development,User seeks mock interview practice for buildin...
...,...,...,...,...,...,...,...,...
3963,https://colab.research.google.com/drive/1X-6J-...,2,2,10.0,Answer infra questions,Web Development,HTTPS Configuration,User seeks assistance for enabling HTTPS on a ...
3964,https://colab.research.google.com/drive/1cx0m_...,3,2,30.0,Write unit tests,Testing,Integration Testing,User seeks assistance with integration testing...
3965,https://colab.research.google.com/drive/1_HLk3...,4,3,10.0,Explain code,Python basics & scripting,Serialization,User seeks assistance with Python serializatio...
3966,https://colab.research.google.com/drive/1lTlIm...,4,3,10.0,Write code in python,Problem Solving,Algorithm Explanation,User seeks understanding of merge sort and its...


In [70]:
insights_df[insights_df["batch_id"]=="7"]["gpt_estimated_duration"].describe()

count    477.000000
mean      24.643606
std       16.493532
min        5.000000
25%       10.000000
50%       30.000000
75%       30.000000
max       60.000000
Name: gpt_estimated_duration, dtype: float64

In [83]:
# Is it safe to upload the files to the insights sheet? how many will we lose? yes


# Merge and upload the GPT estimated durations

# Download the insights sheet
# existing_insights_v1 = download_sheet_as_df(service_account_file, insights_sheet_id, INSIGHTS_VERSION_TAB)

# Merge the new insights with the existing ones
# merged_insights = existing_insights_v1.merge(insights_df[["task_link", "gpt_estimated_duration"]], on="task_link", how="left")


from src.sheets_utils import upload_df_to_sheet, GoogleSheetsService


sheets_client = GoogleSheetsService(service_account_file, ['https://www.googleapis.com/auth/spreadsheets'])
merged_insights = merged_insights.fillna("")
values = [merged_insights.columns.tolist()] + merged_insights.values.tolist()
sheets_client.update_or_append_data_to_sheet(insights_sheet_id, INSIGHTS_VERSION_TAB, values)

Updated or appended data to 'v1 (Jan 25)'


{'spreadsheetId': '1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w',
 'tableRange': "'v1 (Jan 25)'!A1:G975",
 'updates': {'spreadsheetId': '1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w',
  'updatedRange': "'v1 (Jan 25)'!A976:H1950",
  'updatedRows': 975,
  'updatedColumns': 8,
  'updatedCells': 7800}}

In [97]:
sheets_client = GoogleSheetsService(service_account_file, ['https://www.googleapis.com/auth/spreadsheets'])
insights_df = insights_df.fillna("")
insights_df_old = insights_df[~insights_df["batch_id"].isin(["7", "6"])]
values = [insights_df_old.columns.tolist()] + insights_df_old.values.tolist()
sheets_client.update_or_append_data_to_sheet(insights_sheet_id, INSIGHTS_VERSION_TAB, values)

Updated or appended data to 'v1 (Jan 25)'


{'spreadsheetId': '1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w',
 'tableRange': "'v1 (Jan 25)'!A1:H975",
 'updates': {'spreadsheetId': '1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w',
  'updatedRange': "'v1 (Jan 25)'!A976:H3985",
  'updatedRows': 3010,
  'updatedColumns': 8,
  'updatedCells': 24080}}

In [87]:
insights_df

Unnamed: 0,task_link,batch_id,number_of_turns,gpt_estimated_duration,action,domain__top_level,domain__sub_level
0,https://colab.research.google.com/drive/1ztZJS...,7,4,60.0,Write end to end ML training code,Machine Learning,Neural Network Design
1,https://colab.research.google.com/drive/1QBLfW...,7,4,60.0,Write end to end ML training code,Machine Learning,Neural Networks / Deep Learning
2,https://colab.research.google.com/drive/1OiW7x...,,3,30.0,Do a code review,Python basics & scripting,Code Review and Best Practices
3,https://colab.research.google.com/drive/1pzywt...,7,2,10.0,Debug error trace,Python basics & scripting,Error Handling
4,https://colab.research.google.com/drive/1JJuMy...,7,4,30.0,Help me take an interview,Interview Prep,Mock Interview for Web Development
...,...,...,...,...,...,...,...
3963,https://colab.research.google.com/drive/1X-6J-...,2,2,10.0,Answer infra questions,Web Development,HTTPS Configuration
3964,https://colab.research.google.com/drive/1cx0m_...,3,2,30.0,Write unit tests,Testing,Integration Testing
3965,https://colab.research.google.com/drive/1_HLk3...,4,3,10.0,Explain code,Python basics & scripting,Serialization
3966,https://colab.research.google.com/drive/1lTlIm...,4,3,10.0,Write code in python,Problem Solving,Algorithm Explanation


In [48]:

def is_metadata_v1(file_contents):
    """
    Verify the schema.

    Includes metadata dict, which includes these keys
    - domain
    - action
    - use_case_summary
    - gpt_estimated_duration
    - task_link
    """
    if file_contents is None:
        return False

    if "metadata" in file_contents.keys():
        if "domain" in file_contents["metadata"] and \
            "action" in file_contents["metadata"] and \
            "use_case_summary" in file_contents["metadata"] and \
            "gpt_estimated_duration" in file_contents["metadata"] and \
            "task_link" in file_contents["metadata"]:
            return True
    return False

# count how many elements of the results are v1 schema
count = 0
for r in results:
    if is_metadata_v1(r):
        count += 1

count, len(results)

(3921, 3969)

In [46]:
results[1000]

{'metadata': {'task_link': 'https://colab.research.google.com/drive/1J4ftUr1iSgQOXjaHehIsp9Sxs-0tF9qG.json',
  'number_of_turns': '2',
  'use_case__summary': 'The user is using the Assistant to enhance web development security practices.',
  'area_of_focus__detailed_level': 'secure cookies in flask',
  'domain__detailed_level': 'security and authentication',
  'behavioral_tags': [{'top_level': 'continuation follow up',
    'sub_level': 'request for clarification/elaboration',
    'custom_category': 'FALSE'},
   {'top_level': 'respond to assistant',
    'sub_level': 'answer a question',
    'custom_category': 'FALSE'},
   {'top_level': 'continuation follow up',
    'sub_level': 'incrementally build',
    'custom_category': 'FALSE'}],
  'programming_language_tags': [{'language': 'python', 'percentage': '1'}],
  'dependency_tags': [{'dependency': 'flask', 'percentage': '1'}],
  'topic_classication': 'web_development > web_security',
  'area_of_focus_classification': {'top_level': 'general

## Unpublish Polluted Projects / Actions

In [2]:
import sys 
sys.path.append('../../')
from src.sheets_utils import download_sheet_as_df

# Download the insights sheet
metadata_df = download_sheet_as_df(
    service_account_file,
    insights_sheet_id,
    "v1 (Jan 25)"
)


# Filter for subset (batch_id == 10 && action in ["scrape a website", "Help me take an Interview", "beam code", "spark code"])
to_exclude_tasks = metadata_df[
    (metadata_df["batch_id"] == "10") & 
    (metadata_df["action"].isin(["Scrape a website", "Help me take an interview", "Write / modify / fix beam code", "Write / modify / fix spark code"]))
]["task_link"].to_list()
print(f"Excluding {len(to_exclude_tasks)} tasks")

# Remove these task_links from delivery sheet, inisghts sheet, and gdrive folder
from src.sheets_utils import GoogleSheetsService

sheets_client = GoogleSheetsService(service_account_file, ['https://www.googleapis.com/auth/spreadsheets'])
sheets_client.delete_rows_in_list_of_values(delivery_sheet_id, INSIGHTS_VERSION_TAB, to_exclude_tasks)



Excluding 56 tasks


HttpError: <HttpError 400 when requesting https://sheets.googleapis.com/v4/spreadsheets/1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4/values/%27v1%20%28Jan%2025%29%27%21A%3AH?alt=json returned "Unable to parse range: 'v1 (Jan 25)'!A:H". Details: "Unable to parse range: 'v1 (Jan 25)'!A:H">

In [8]:
import gspread
from google.oauth2.service_account import Credentials

# Function to setup the gspread client
def google_sheet_auth(json_file_path):
    scope = ['https://www.googleapis.com/auth/spreadsheets']
    credentials = Credentials.from_service_account_file(json_file_path, scopes=scope)
    client = gspread.authorize(credentials)
    return client

# Main function to delete rows with a given list of values in a column
def delete_rows_from_google_sheet(json_file_path, sheet_url, tab_name, search_column_index, values_to_delete):
    # Authenticate with the Google Sheet
    gspread_client = google_sheet_auth(json_file_path)
    google_sheet = gspread_client.open_by_url(sheet_url)
    worksheet = google_sheet.worksheet(tab_name) 

    # Extracting all data from the target column
    all_values = worksheet.col_values(search_column_index)
    rows_to_delete = [i for i, item in enumerate(all_values, 1) if item in values_to_delete]
    
    # Gspread doesn't support direct deletion of multiple rows in arbitrary order.
    # The loop will delete in the reverse order to avoid index shifting problems.
    for i in sorted(rows_to_delete, reverse=True):
        worksheet.delete_rows(i)

# Example variables
sheet_url = 'https://docs.google.com/spreadsheets/d/1v_O33STdi_h7taPd3MkD0fiqRx7rqr_aAQWGnlOfr_w/edit#gid=1753316694'
tab_name = 'v1 (Jan 25)'
search_column_index = 1  # Considering first column as an example

# Delete operation
delete_rows_from_google_sheet(service_account_file, sheet_url, tab_name, search_column_index, to_exclude_tasks)

In [9]:
# Example variables
sheet_url = 'https://docs.google.com/spreadsheets/d/1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4/edit#gid=281973205'
tab_name = 'Batch 10'
search_column_index = 1  # Considering first column as an example

# Delete operation
delete_rows_from_google_sheet(service_account_file, sheet_url, tab_name, search_column_index, to_exclude_tasks)

In [4]:
%pip install gspread

Collecting gspread
  Downloading gspread-6.0.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 KB[0m [31m830.3 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting StrEnum==0.4.15
  Using cached StrEnum-0.4.15-py3-none-any.whl (8.9 kB)
Collecting google-auth-oauthlib>=0.4.1
  Downloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl (24 kB)
Collecting requests-oauthlib>=0.7.0
  Using cached requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Collecting oauthlib>=3.0.0
  Using cached oauthlib-3.2.2-py3-none-any.whl (151 kB)
Installing collected packages: StrEnum, oauthlib, requests-oauthlib, google-auth-oauthlib, gspread
Successfully installed StrEnum-0.4.15 google-auth-oauthlib-1.2.0 gspread-6.0.2 oauthlib-3.2.2 requests-oauthlib-1.3.1
You should consider upgrading via the '/home/joe96/projects/turing/character.ai/character_tasks/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart 

In [10]:
to_exclude_tasks

['https://colab.research.google.com/drive/1ISirhbmPOg8hMIJuvIHVtv-MymPC13cO',
 'https://colab.research.google.com/drive/1ca91-ZE2u1uigCJgT-da-fypBEtzYNvB',
 'https://colab.research.google.com/drive/1uU28jLtLHaZM63YQf2HAUqlHm76Y84Uu',
 'https://colab.research.google.com/drive/1_slXeCkFcpl5tT_bh1SnFzIz3qPSNQjW',
 'https://colab.research.google.com/drive/1F8pDEDA3UzOzRmt2fK3DTUBXQWFeMNrv',
 'https://colab.research.google.com/drive/1kv9kfPyGyjvY1O1UH_01iLctxfS1UGk9',
 'https://colab.research.google.com/drive/1zffjzl7-PSmyfljmyWzp98JHSuvbf1Mz',
 'https://colab.research.google.com/drive/10MGY0KFzDDatqxqAAnb_Qr3QERkrIt76',
 'https://colab.research.google.com/drive/1hBGIIO1iAs_TOcl-D22I48dmXTYs36zs',
 'https://colab.research.google.com/drive/1ogdDTsrSk7Puy8HvD5HwTN4Lx2rxRYDx',
 'https://colab.research.google.com/drive/1xtWX10-Qg6e0EHglgMpDWp0NARwIisYH',
 'https://colab.research.google.com/drive/1zD0DbXGuzDmuuKWFKurLk5hp4-YHgzJt',
 'https://colab.research.google.com/drive/1lBqPQrCdJma67bajak4aD

In [2]:
# Create a new histroical redo sheet


to_exclude_tasks = ['https://colab.research.google.com/drive/1ISirhbmPOg8hMIJuvIHVtv-MymPC13cO',
 'https://colab.research.google.com/drive/1ca91-ZE2u1uigCJgT-da-fypBEtzYNvB',
 'https://colab.research.google.com/drive/1uU28jLtLHaZM63YQf2HAUqlHm76Y84Uu',
 'https://colab.research.google.com/drive/1_slXeCkFcpl5tT_bh1SnFzIz3qPSNQjW',
 'https://colab.research.google.com/drive/1F8pDEDA3UzOzRmt2fK3DTUBXQWFeMNrv',
 'https://colab.research.google.com/drive/1kv9kfPyGyjvY1O1UH_01iLctxfS1UGk9',
 'https://colab.research.google.com/drive/1zffjzl7-PSmyfljmyWzp98JHSuvbf1Mz',
 'https://colab.research.google.com/drive/10MGY0KFzDDatqxqAAnb_Qr3QERkrIt76',
 'https://colab.research.google.com/drive/1hBGIIO1iAs_TOcl-D22I48dmXTYs36zs',
 'https://colab.research.google.com/drive/1ogdDTsrSk7Puy8HvD5HwTN4Lx2rxRYDx',
 'https://colab.research.google.com/drive/1xtWX10-Qg6e0EHglgMpDWp0NARwIisYH',
 'https://colab.research.google.com/drive/1zD0DbXGuzDmuuKWFKurLk5hp4-YHgzJt',
 'https://colab.research.google.com/drive/1lBqPQrCdJma67bajak4aDA4ig5AHprL4',
 'https://colab.research.google.com/drive/14VKzIcDY0jPyZj5q8U-3ji5Uc5JsqFfh',
 'https://colab.research.google.com/drive/1GmwZRUz9Owkf2TclEkiiq3s58YCekLnd',
 'https://colab.research.google.com/drive/1-lWGDxx41n9sg87yIsqVTur1ZVpR0nSO',
 'https://colab.research.google.com/drive/1AM3BNP3qoqo_nHZMl0RZKQffIi62wBPQ',
 'https://colab.research.google.com/drive/1e6ORvNVBNteC_5q2IhnfdReEdsn_bb_-',
 'https://colab.research.google.com/drive/1j1MYrC5QM0lMjKqyCdEzyZZEEotFjl-f',
 'https://colab.research.google.com/drive/1Jv96_tXn4GhS1SzDFGSoil7hydrHiIvj',
 'https://colab.research.google.com/drive/1BR_cMikfmC48VdgJCGTX_-Wtl6LsJWUv',
 'https://colab.research.google.com/drive/19bAuEuQhKrVwuCYprslbAPITShNN5A24',
 'https://colab.research.google.com/drive/1RYoBPEVpVFvUO-uLDTWx2vckSq73Ihp-',
 'https://colab.research.google.com/drive/1xGEHflfPWwOrHG16aSmca7I77zOtsCwr',
 'https://colab.research.google.com/drive/1FAzFYv-fOH5v-IrxH2nn_f9n2PrfpNjl',
 'https://colab.research.google.com/drive/1Qam-Yq2JidOmOm9pvQRR-d62w5IHLlkZ',
 'https://colab.research.google.com/drive/1wx_UisDlDKa4LQKr99z2X0yDtFBRHs_F',
 'https://colab.research.google.com/drive/1SbSrys_ssUfWkIAjva6dTHWHfKtKnEHh',
 'https://colab.research.google.com/drive/1h7qcHGK0u13_ZSo5C-Zf6DWfDkkIAUCB',
 'https://colab.research.google.com/drive/15Yz7fwUf4EuKwh_Pi6ugC6lSNEZwDRmV',
 'https://colab.research.google.com/drive/1L-Vx6dHcuOZDgtTOrk9k9AvOgYCum6dQ',
 'https://colab.research.google.com/drive/1KeDmjC7XFRK5mX9OXHTjGMbVzViRUOEG',
 'https://colab.research.google.com/drive/1oru8cEwM3EEZJqafKnSp0-FY-OG16C7_',
 'https://colab.research.google.com/drive/12xQXXUXkApDNqvwXZKGJGPsCNbxnbvcR',
 'https://colab.research.google.com/drive/1my4v1sJ954TLesO7CesaeQ-43Y9iizqB',
 'https://colab.research.google.com/drive/19HmT3nrmY465GRcrME91k550Km9qpB9O',
 'https://colab.research.google.com/drive/1KGK4DybzhOU3abJofcNOjT3nTh0ezzPy',
 'https://colab.research.google.com/drive/1HuIQZClE7GAA-Qx-bgfYJeD-cEz7x69Z',
 'https://colab.research.google.com/drive/15eYkWuWKMfAJF5yntXj7FB-0fVdjes6s',
 'https://colab.research.google.com/drive/1OXvNXnZoRS1Px4NCsSULspE5hdlHmD81',
 'https://colab.research.google.com/drive/1LEJxDd8mDp5qgM7fU4HN4OlUZc3LPOLg',
 'https://colab.research.google.com/drive/1LLfohSnpQkrwWmnWn5Z870--iAEfNi8q',
 'https://colab.research.google.com/drive/12Hy8zETbqkppKYsjGx4unURVJzT5yOre',
 'https://colab.research.google.com/drive/1UsbsUJfmYjAYMT0gx6Vf31E-rHgdfN6v',
 'https://colab.research.google.com/drive/1THjPNauR474s_9_KKCLNLJMCxdjazStr',
 'https://colab.research.google.com/drive/1zTNyAMz3-MEL-v22KhoIPrh-fkr8QO4c',
 'https://colab.research.google.com/drive/11j9ogx3XDIe_oHPNSrXLdo-ZRd9gTBj_',
 'https://colab.research.google.com/drive/19cEqh_5Qs5XhqBpMWB7NUlK6xdJzz0Nl',
 'https://colab.research.google.com/drive/1H1PuQtV_iMis0A2JYyGkhDK_yO86NWf9',
 'https://colab.research.google.com/drive/1HK6UAoy6OVsvVRuuVR6g2Q2naeSfu2so',
 'https://colab.research.google.com/drive/1osS7hdnErgBpMhVOo_pPYA23KheFjp3-',
 'https://colab.research.google.com/drive/19rNQcINThr752-wOdnLfYMG_GZI-G1WA',
 'https://colab.research.google.com/drive/1rFq481ihhQ9wM2w-VVrj6pfvIah9oZFL',
 'https://colab.research.google.com/drive/1yrFXe0ydN1jIrxFUF8ABBHFq18kAb3wU',
 'https://colab.research.google.com/drive/1Pp8oadtFk89wvdZFFtFyHr6OejVExjXE',
 'https://colab.research.google.com/drive/1o7aEZrKPFuFCYhI4b2cNMjjiYveqXJ_9']

In [None]:
progress_batches = []
for sheet_name in included_sheet_names:
    print(sheet_name)
    bdf = download_sheet_as_df(service_account_file, tracking_sheet_id, sheet_name)
    progress_batches.append(bdf)

df = pd.concat(progress_batches, ignore_index=True)


df = df[df["task_link"].isin(to_exclude_tasks)]