In [4]:
import sys 
# Add the directory two levels up to the import path; later cells import
# `src.sheets_utils`, so this is presumably the repository root — TODO confirm.
sys.path.append('../../')

# Service-account JSON key; passed to Credentials.from_service_account_file
# and to the sheet download/upload helpers below.
service_account_file = '../../creds/google__sa.json'

# Spreadsheet that holds the per-batch task tracking tabs. The
# "historical__personal_corrections" tab is also uploaded to this spreadsheet.
tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"
# Tabs of the tracking spreadsheet that are downloaded and concatenated.
included_sheet_names = [
    "Conversations_Batch_1", # alternative tab name noted by the author: "Conversations_Batch_1 (Responses)"
    "Conversations_Batch_2",
    "Conversations_Batch_3",
    "Conversations_Batch_4",
    "Conversations_Batch_5",
]

# Google Drive folder ids; not referenced in the visible part of the notebook.
jupyter_gdrive_folder_ids = [
    "1Z1bdYMe2Qmo_vs-OaKDaYIiV3rIqLJH9", # V0
    "1sfPFHkXYpKyY41V0pfz3Qw3k4VLy5Hvb", # V1
    "1jV7WA5zB172DJUp7Z2XzHr62E6U6_NtY", # NOTE(review): unlabeled — presumably V2, confirm
]

# Delivery spreadsheet / JSONL folder ids; not referenced in the visible cells.
delivery_sheet_id = "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4"
delivery_jsonl_gdrive_folder_id = "1b3UuMfgwxpOsW0GnsdsrEBWdjUvg8Ub7"

# Relative path to a CSV of GPT reviews; not referenced in the visible cells.
gpt_reviews_path = "gpt_reviews.csv"


from concurrent.futures import ThreadPoolExecutor, as_completed
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from tqdm import tqdm
import pandas as pd

def get_file_name_from_colab_link(colab_link, service_account_file):
    """Look up the Google Drive file name behind a Colab notebook link.

    Args:
        colab_link: URL containing ``/drive/<file_id>``. Anything following
            the id (query string such as ``?usp=sharing``, a ``#scrollTo=...``
            fragment, or an extra path segment) is ignored.
        service_account_file: Path to the service-account JSON key used to
            authenticate against the Drive v3 API.

    Returns:
        The ``name`` field Drive reports for the file, or ``None`` when the
        link is not a string, carries no file id, or the Drive lookup fails.
    """
    # Sheet cells can be empty / NaN; anything that is not text has no id.
    if not isinstance(colab_link, str):
        return None

    try:
        file_id = colab_link.split('/drive/')[1]
    except IndexError:
        return None

    # Keep only the id itself: drop query string, fragment and trailing path,
    # which would otherwise be sent to Drive as part of the fileId.
    for separator in ('?', '#', '/'):
        file_id = file_id.split(separator, 1)[0]
    file_id = file_id.strip()
    if not file_id:
        return None

    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = Credentials.from_service_account_file(service_account_file, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    try:
        file = service.files().get(fileId=file_id).execute()
        return file.get('name')
    except Exception:
        # Best-effort lookup: an inaccessible or deleted file yields None
        # rather than aborting a bulk fetch.
        return None


def fetch_file_names_parallel(links, service_account_file, max_workers=100):
    """Resolve many Colab links to Drive file names using a thread pool.

    Args:
        links: Iterable of Colab links to resolve.
        service_account_file: Path to the service-account JSON key handed to
            each individual lookup.
        max_workers: Upper bound on concurrently running lookups.

    Returns:
        Dict mapping each link to its file name, or to ``None`` when the
        lookup returned nothing or raised.
    """
    names_by_link = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which link each submitted job belongs to.
        link_for_job = {}
        for link in links:
            job = pool.submit(get_file_name_from_colab_link, link, service_account_file)
            link_for_job[job] = link

        finished_jobs = tqdm(
            as_completed(link_for_job),
            total=len(link_for_job),
            desc="Fetching File Names",
        )
        for job in finished_jobs:
            try:
                resolved_name = job.result()
            except Exception:
                # A failed lookup is recorded as "no name" instead of propagating.
                resolved_name = None
            names_by_link[link_for_job[job]] = resolved_name
    return names_by_link


In [8]:
from src.sheets_utils import download_sheet_as_df

def _load_batch(sheet_name):
    """Download one tracking tab, echoing its name and resulting shape."""
    print(sheet_name)
    batch = download_sheet_as_df(service_account_file, tracking_sheet_id, sheet_name)
    print(batch.shape)
    return batch


# One frame per configured tab, in configuration order.
progress_batches = [_load_batch(name) for name in included_sheet_names]

# Stack all batches, then keep only the rows marked as "Done".
df = pd.concat(progress_batches, ignore_index=True)
df = df.loc[df["completion_status"].eq("Done")]
df.loc[:, ["task_link", "assigned_to_email"]]

Conversations_Batch_1
(300, 13)
Conversations_Batch_2
(308, 13)
Conversations_Batch_3
(336, 12)
Conversations_Batch_4
(734, 12)
Conversations_Batch_5
(1878, 12)


Unnamed: 0,task_link,assigned_to_email
0,https://colab.research.google.com/drive/1_2Z17...,stefano.b@turing.com
1,https://colab.research.google.com/drive/1_arMF...,yuzhao.ni@turing.com
2,https://colab.research.google.com/drive/1_Flx9...,visalakshi.garimella@turing.com
3,https://colab.research.google.com/drive/1_lsGG...,riya.dhar@turing.com
4,https://colab.research.google.com/drive/1_xYmo...,chirag.rade-c@turing.com
...,...,...
3493,https://colab.research.google.com/drive/1PFqmH...,zubair.m@turing.com
3494,https://colab.research.google.com/drive/1sAiD4...,zubair.m@turing.com
3495,https://colab.research.google.com/drive/1EJY1Q...,zubair.m@turing.com
3496,https://colab.research.google.com/drive/1ChXQJ...,zubair.m@turing.com


## Personal Historical Mistakes

In [14]:
def _completion_date_sort_key(column):
    """Sort key for ``sort_values``: order ``completion_date`` chronologically.

    The dates arrive as ``M/D/YYYY`` text, so a plain string sort would put
    "1/1/2024" ahead of "12/20/2023". Values that do not parse become NaT and
    therefore sort last within their author. Other columns pass through as-is.
    """
    if column.name == "completion_date":
        return pd.to_datetime(column, format="%m/%d/%Y", errors="coerce")
    return column


# One row per distinct (task, author, completion date), with empty
# resolution-tracking columns, ordered by author and then by completion date.
# Built as a single chain (no in-place mutation) so re-running the cell always
# yields the same frame.
df_personal_historical_corrections = (
    df[["task_link", "assigned_to_email", "completion_date"]]
    .drop_duplicates()
    .rename(columns={"assigned_to_email": "author"})
    .assign(
        resolved_by="",
        resolution_duration="",
        completion_status="Unclaimed",
        corrections="",
    )
    .sort_values(by=["author", "completion_date"], key=_completion_date_sort_key)
)

df_personal_historical_corrections

Unnamed: 0,task_link,author,completion_date,resolved_by,resolution_duration,completion_status,corrections
1220,https://colab.research.google.com/drive/1fKN7Z...,,,,,Unclaimed,
1551,https://colab.research.google.com/drive/1KTtZ0...,,,,,Unclaimed,
56,https://colab.research.google.com/drive/1622YW...,James.oladimeji-c@turing.com,12/20/2023,,,Unclaimed,
1297,https://colab.research.google.com/drive/1HPSON...,aarunik.g@turing.com,1/1/2024,,,Unclaimed,
1491,https://colab.research.google.com/drive/1aiZ-t...,aarunik.g@turing.com,1/1/2024,,,Unclaimed,
...,...,...,...,...,...,...,...
1467,https://colab.research.google.com/drive/10y4g5...,zubair.m@turing.com,12/29/2023,,,Unclaimed,
1506,https://colab.research.google.com/drive/1NCPpG...,zubair.m@turing.com,12/29/2023,,,Unclaimed,
1510,https://colab.research.google.com/drive/1W3FYa...,zubair.m@turing.com,12/29/2023,,,Unclaimed,
1519,https://colab.research.google.com/drive/1JxEMu...,zubair.m@turing.com,12/29/2023,,,Unclaimed,


In [13]:
from src.sheets_utils import upload_df_to_sheet

# Push the per-author corrections frame to its tab in the tracking spreadsheet.
# The cell that builds df_personal_historical_corrections must have run first.
historical_corrections_tab = "historical__personal_corrections"
upload_df_to_sheet(
    service_account_file,
    tracking_sheet_id,
    historical_corrections_tab,
    df_personal_historical_corrections,
)

## Assistant Goes First Not Realistic

## Missing Messages
A message is skipped when its header is missing.

## Code Cells should be wrapped in backticks

## Quality Red Flags