In [None]:
from utils import service_account_path
from datetime import datetime, timezone
import pandas as pd
from src.gdrive_api.auth import build_service



In [None]:
# Show every revision of one sample file. get_all_revisions takes the Drive
# client as its first argument, so one is built here for the call.
get_all_revisions(build_service(service_account_path), '1a0XS1q3IKk1GGWvMoItNinJ1O_RuOOOa')

In [None]:
# Build a Drive client from the service-account file and list the revisions
# of one sample file.
service = build_service(service_account_path)
revisions = get_all_revisions(service, '1a0XS1q3IKk1GGWvMoItNinJ1O_RuOOOa')
# Bare expression so the notebook displays the result of the call above.
revisions

In [None]:
from datetime import timedelta
import traceback
from collections import defaultdict

def get_all_revisions(service, file_id, fields='id,modifiedTime,keepForever,lastModifyingUser'):
    """Return every revision of a Drive file, following all result pages.

    Args:
        service: Drive API client exposing ``revisions().list(...).execute()``.
        file_id: ID of the file whose revisions are listed.
        fields: Comma-separated revision fields to request for each revision.

    Returns:
        A list of revision dicts (empty when the response carries none), or
        ``None`` when a request fails. Failures are printed with a traceback
        rather than raised.
    """
    list_kwargs = {
        'fileId': file_id,
        'pageSize': 1000,
        # With an explicit field mask the page token has to be requested too,
        # otherwise there is no way to reach the pages after the first one.
        'fields': f'nextPageToken,revisions({fields})',
    }
    collected = []
    try:
        while True:
            response = service.revisions().list(**list_kwargs).execute()
            collected.extend(response.get('revisions', []))
            next_token = response.get('nextPageToken')
            if not next_token:
                return collected
            list_kwargs['pageToken'] = next_token
    except Exception:
        # Best-effort by design: callers treat None as "nothing to process".
        print(f"Error for file: {file_id}")
        traceback.print_exc()
        return None


def batch_callback_wrapper(file_id, failed_revision_updates):
    """Build a batch callback that records failed requests under ``file_id``.

    Args:
        file_id: Key under which failures are stored.
        failed_revision_updates: Mapping of file ID to a list of failure
            records. May be a ``defaultdict(list)``, a plain dict, or ``None``
            (failures are then only printed, not recorded).

    Returns:
        A ``callback(request_id, response, exception)`` function. Successful
        responses are ignored.
    """
    def batch_callback(request_id, response, exception):
        if exception is None:
            return
        print(f"An error occurred: {exception}")
        if failed_revision_updates is None:
            # Nothing to record into; the printed message is the only trace.
            return
        # setdefault works for plain dicts as well as defaultdicts.
        failed_revision_updates.setdefault(file_id, []).append(
            {'request_id': request_id, 'response': response, 'exception': exception}
        )
        print('Total files with some failed revision updates:', len(failed_revision_updates))
    return batch_callback

def batch_update_revisions_keepForever(service, file_id, revisions, failed_revision_updates, dry_run=False, keep_limit=200, batch_size=50):
    """Pin the newest revisions with ``keepForever`` and unpin the rest.

    Revisions are ordered newest first by ``modifiedTime``. The first
    ``keep_limit`` should end up with ``keepForever=True`` and all later ones
    with ``keepForever=False``. Only revisions whose current flag differs from
    the target are updated.

    Args:
        service: Drive API client; not used when ``dry_run`` is true.
        file_id: ID of the file owning the revisions.
        revisions: Revision dicts with ``id`` and ``modifiedTime`` and,
            optionally, ``keepForever``.
        failed_revision_updates: Mapping handed to the batch callback so that
            failed requests are recorded per file.
        dry_run: When true, only count what would change; no batch is created
            or executed.
        keep_limit: Number of newest revisions to pin. The default of 200
            presumably mirrors the Drive per-file keepForever cap — confirm
            against the API documentation before raising it.
        batch_size: Update requests per batch. Kept at 50 by default to stay
            well under the batch size and URL length limits.

    Returns:
        ``(keep_forever_count_set, total_updated_count)``: how many revisions
        were switched to ``keepForever=True`` and how many were changed in
        total (counted in dry runs as well).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def new_batch():
        # One callback per batch, all recording into the same failure mapping.
        return service.new_batch_http_request(
            callback=batch_callback_wrapper(file_id, failed_revision_updates)
        )

    def flush(pending_batch):
        # A failing batch is reported but must not stop the remaining batches.
        try:
            pending_batch.execute()
        except Exception as e:
            print(f"Failed to update revisions for file {file_id}: {e}")

    newest_first = sorted(revisions, key=lambda r: r['modifiedTime'], reverse=True)

    batch = None
    pending = 0
    keep_forever_count_set = 0
    total_updated_count = 0

    for index, revision in enumerate(newest_first):
        keep_forever_status = index < keep_limit
        if revision.get('keepForever') == keep_forever_status:
            continue

        total_updated_count += 1
        if keep_forever_status:
            keep_forever_count_set += 1
        if dry_run:
            continue

        if batch is None:
            batch = new_batch()
        batch.add(service.revisions().update(
            fileId=file_id,
            revisionId=revision['id'],
            body={'keepForever': keep_forever_status},
            fields='id, keepForever'
        ), request_id=revision['id'])
        pending += 1

        if pending >= batch_size:
            flush(batch)
            batch, pending = None, 0

    # Send whatever is left in the last, partially filled batch.
    if batch is not None:
        flush(batch)

    return keep_forever_count_set, total_updated_count

def group_revisions_by_author_and_time(revisions, max_interval=timedelta(hours=1)):
    """Group revisions into editing sessions, newest first.

    Revisions are sorted by descending ``modifiedTime``. A revision joins the
    group of the revision just before it when both have the same known author
    and are at most ``max_interval`` apart. Because the comparison is between
    neighbours, a group can span more than ``max_interval`` in total.

    Args:
        revisions: Revision dicts with ``modifiedTime`` (ISO string, optional
            trailing ``Z``) and, optionally, ``lastModifyingUser``.
        max_interval: Largest gap between neighbouring revisions of one group.

    Returns:
        A list of groups; each group is a list of revisions, newest first.
        Empty input yields an empty list.
    """
    def author_key(revision):
        # The user block or its e-mail may be absent; fall back to the
        # permission ID and report None when the author cannot be identified.
        user = revision.get('lastModifyingUser') or {}
        for field in ('emailAddress', 'permissionId'):
            if user.get(field):
                return (field, user[field])
        return None

    def modified_at(revision):
        return datetime.fromisoformat(revision['modifiedTime'].rstrip('Z'))

    newest_first = sorted(revisions, key=lambda r: r['modifiedTime'], reverse=True)

    grouped_revisions = []
    previous_time = None
    previous_author = None
    for revision in newest_first:
        current_time = modified_at(revision)
        current_author = author_key(revision)
        # Revisions without an identifiable author never merge, so they are
        # kept as separate groups rather than being folded into a neighbour.
        same_session = (
            bool(grouped_revisions)
            and current_author is not None
            and current_author == previous_author
            and previous_time - current_time <= max_interval
        )
        if same_session:
            grouped_revisions[-1].append(revision)
        else:
            grouped_revisions.append([revision])
        previous_time, previous_author = current_time, current_author

    return grouped_revisions

def get_most_recent_revisions_from_groups(grouped_revisions):
    """Return the first revision of every group, in group order.

    Groups produced by ``group_revisions_by_author_and_time`` are ordered
    newest first, so the first element is the most recent one of its group.
    """
    heads = []
    for bucket in grouped_revisions:
        heads.append(bucket[0])
    return heads

def update_recent_revisions_to_keep_forever(service_account_path, file_id, dry_run, failed_revision_updates, max_grouping_interval=timedelta(hours=1)):
    """Pin one representative revision per editing session of a file.

    Lists the file's revisions, groups them by author and time, keeps the
    first (newest) revision of each group and passes those to
    ``batch_update_revisions_keepForever``.

    Args:
        service_account_path: Credentials path passed to ``build_service``.
        file_id: ID of the file to process.
        dry_run: When true, changes are counted but not sent.
        failed_revision_updates: Mapping of file ID to failure records, or
            ``None``. Per-request failures are recorded by the batch callback.
            File-level failures (listing failed, unexpected exception) are
            recorded here with ``request_id`` and ``response`` set to ``None``
            so that a rerun over the mapping's keys includes them.
        max_grouping_interval: Largest gap between neighbouring revisions of
            the same group.

    Returns:
        A dict with ``total_revisions_count``, ``total_revision_groups_count``,
        ``keep_forever_count_set`` and ``total_updated_count``. All values are
        zero when the file has no revisions or processing failed.
    """
    def empty_result():
        return {'total_revisions_count': 0, 'total_revision_groups_count': 0, 'keep_forever_count_set': 0, 'total_updated_count': 0}

    def record_file_failure(exception):
        if failed_revision_updates is not None:
            failed_revision_updates.setdefault(file_id, []).append(
                {'request_id': None, 'response': None, 'exception': exception}
            )

    # Built outside the try block on purpose: a credentials problem should
    # surface to the caller instead of being reported once per file.
    service = build_service(service_account_path)
    try:
        revisions = get_all_revisions(service, file_id)
        if not revisions:
            if revisions is None:
                # get_all_revisions returns None only after a failed request.
                record_file_failure('Listing revisions failed')
            print(f"No revisions found for file {file_id}")
            return empty_result()

        result = empty_result()
        result['total_revisions_count'] = len(revisions)

        grouped_revisions = group_revisions_by_author_and_time(revisions, max_interval=max_grouping_interval)
        representatives = get_most_recent_revisions_from_groups(grouped_revisions)
        result['total_revision_groups_count'] = len(representatives)

        keep_forever_count_set, total_updated_count = batch_update_revisions_keepForever(
            service, file_id, representatives, failed_revision_updates, dry_run=dry_run
        )
        result['keep_forever_count_set'] = keep_forever_count_set
        result['total_updated_count'] = total_updated_count
        print(f'Set keepForever={keep_forever_count_set}/{len(representatives)} revisions, updated {total_updated_count} total revisions for file: {file_id}')
        return result
    except Exception as e:
        print(f"Failed to update revisions for {file_id}: {e}")
        record_file_failure(e)
        return empty_result()


In [None]:
# Try the per-file update on one sample file. The function requires the
# credentials path, a dry_run flag and a failure mapping in addition to the
# file ID; dry_run=True keeps this exploratory cell from changing anything.
update_recent_revisions_to_keep_forever(
    service_account_path,
    '10P6VHfZQ9FmoW4nrJ41VYVH9NeuCnxws',
    dry_run=True,
    failed_revision_updates=defaultdict(list),
)

In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm
import pandas as pd

def extract_id_from_url(url):
    """
    Extract the file ID from a Colab or Google Drive URL.

    Supported shapes are ``.../drive/<id>`` (Colab notebook links) and
    ``.../d/<id>/...`` (Drive ``file/d/<id>/view`` style links). Any
    ``#fragment`` or ``?query`` attached to the ID is removed.

    Returns None when ``url`` is not a string (for example a NaN cell from a
    DataFrame) or when no non-empty ID follows one of the known markers.
    """
    if not isinstance(url, str):
        return None

    parts = url.strip().split('/')
    # 'drive' is checked first so Colab links behave exactly as before.
    for marker in ('drive', 'd'):
        if marker not in parts:
            continue
        position = parts.index(marker) + 1
        if position >= len(parts):
            continue
        candidate = parts[position].split('#')[0].split('?')[0]
        if candidate:
            return candidate
    return None

def update_files_in_parallel(service_account_path, file_ids, failed_revision_updates=None, max_workers=10, dry_run=False):
    """Run ``update_recent_revisions_to_keep_forever`` for many files on a thread pool.

    Args:
        service_account_path: Credentials path forwarded to every worker call.
        file_ids: Iterable of file IDs to process.
        failed_revision_updates: Mapping of file ID to failure records, shared
            by all workers. When omitted, a private ``defaultdict(list)`` is
            used so that failure callbacks still have somewhere to write; pass
            your own mapping to inspect failures afterwards.
        max_workers: Size of the thread pool.
        dry_run: Forwarded to every worker call.

    Returns:
        The per-file result dicts, in the order of ``file_ids``.
    """
    from collections import defaultdict  # local import keeps this cell self-contained

    if failed_revision_updates is None:
        failed_revision_updates = defaultdict(list)
    # Materialise once so that generators work and tqdm gets a length.
    file_ids = list(file_ids)

    def process(file_id):
        return update_recent_revisions_to_keep_forever(service_account_path, file_id, dry_run, failed_revision_updates)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(executor.map(process, file_ids), total=len(file_ids)))
    return results

def summarize_stats(results):
    """Aggregate per-file result dicts and print the summary.

    Args:
        results: Iterable of dicts that each contain
            ``total_revisions_count``, ``total_revision_groups_count``,
            ``keep_forever_count_set`` and ``total_updated_count``.

    Returns:
        A dict with the keys ``sum``, ``max``, ``min`` and ``average``, each
        mapping every metric name to its aggregate. For empty ``results`` the
        sums are 0 and ``max``, ``min`` and ``average`` are ``None``.
    """
    metric_names = (
        'total_revisions_count',
        'total_revision_groups_count',
        'keep_forever_count_set',
        'total_updated_count',
    )
    results = list(results)
    stats = {name: [result[name] for result in results] for name in metric_names}

    stats_summary = {
        'sum': {key: sum(values) for key, values in stats.items()},
        # default=None keeps an empty run from raising ValueError.
        'max': {key: max(values, default=None) for key, values in stats.items()},
        'min': {key: min(values, default=None) for key, values in stats.items()},
        'average': {key: pd.Series(values).mean() if values else None for key, values in stats.items()}
    }

    print("Stats Summary:")
    for stat_type, summary in stats_summary.items():
        print(f"{stat_type.capitalize()}:")
        for key, value in summary.items():
            print(f"  {key}: {value}")

    return stats_summary

In [None]:


# Disabled scratch code: the grouping experiment below is wrapped in a string
# literal, so nothing in it is executed.
"""
grouped_revisions = group_revisions_by_author_and_time(revisions, max_interval=timedelta(hours=1))

most_recent_revisions_from_groups = get_most_recent_revisions_from_groups(grouped_revisions)

print(grouped_revisions)
most_recent_revisions_from_groups
"""


In [None]:
# Dry run over three sample files with two worker threads. With dry_run=True
# no revision updates are queued; only the counts are computed.
file_ids = ['1a0XS1q3IKk1GGWvMoItNinJ1O_RuOOOa', '1RNw9fAbgXXvovrEfuia-jEtNkd0VOBiH', '10P6VHfZQ9FmoW4nrJ41VYVH9NeuCnxws']
# Shared mapping that the batch callbacks fill with failed requests per file.
failed_revision_updates = defaultdict(list)
stats = update_files_in_parallel(service_account_path, file_ids, max_workers=2, dry_run=True, failed_revision_updates=failed_revision_updates)
# summarize_stats prints the summary itself; this also prints the returned dict.
print(summarize_stats(stats))

In [None]:
from utils import get_delivered_df

# NOTE(review): presumably loads the delivered tasks of batch 1 — confirm
# the meaning of the argument in utils.get_delivered_df.
df = get_delivered_df([1])

In [None]:


# Turn every task link into a file ID. extract_id_from_url returns None for
# links it does not recognise, so the list can contain None entries.
file_ids = df['task_link'].apply(extract_id_from_url).to_list()
# Bare expression so the notebook displays the number of IDs.
len(file_ids)


In [None]:
# Dry run over the file IDs collected above, using ten worker threads.
# No failure mapping is passed, so failed_revision_updates stays at its None default.
stats1 = update_files_in_parallel(service_account_path, file_ids, max_workers=10, dry_run=True)
print(summarize_stats(stats1))

In [None]:
# Same dry run as above for the data returned by get_delivered_df([5]).
# NOTE(review): presumably [5] selects batch 5 — confirm in utils.
df = get_delivered_df([5])
file_ids = df['task_link'].apply(extract_id_from_url).to_list()
print(len(file_ids))
stats5 = update_files_in_parallel(service_account_path, file_ids, max_workers=10, dry_run=True)
print(summarize_stats(stats5))

# RUN ALL

In [None]:
from src.sheets_utils import download_sheet_as_df
# Spreadsheet ID that get_current_batches_df passes to download_sheet_as_df.
tracking_sheet_id = '1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4'


def get_current_batches_df(batch_ids=(1, 2, 3, 4, 5, 6)):
    """Download the given batch tabs and merge them into one DataFrame.

    For every batch ID the tab named ``Conversations_Batch_<id>`` is fetched
    from the tracking spreadsheet with ``download_sheet_as_df``. The frames are
    concatenated in the order of ``batch_ids``, a ``batch_id`` column is added,
    and a ``file_id`` column is derived from ``task_link``.

    Args:
        batch_ids: Iterable of batch IDs to download. The default is a tuple
            so that no mutable object is shared between calls.

    Returns:
        A DataFrame with one row per distinct ``file_id``. When the same file
        appears more than once, the last occurrence (the latest batch in
        ``batch_ids`` order) is kept.
    """
    current_work_dfs = []
    for batch_id in batch_ids:
        print(f"Downloading batch {batch_id}...")
        df = download_sheet_as_df(
            service_account_path, tracking_sheet_id, f"Conversations_Batch_{batch_id}"
        )
        print(f"Batch {batch_id} downloaded.")
        current_work_dfs.append(df.assign(batch_id=batch_id))
    current_work_df = pd.concat(current_work_dfs, ignore_index=True)
    # Deduplicate on the extracted file ID rather than the raw link, so links
    # that differ only in fragment or query collapse into one row.
    current_work_df['file_id'] = current_work_df['task_link'].apply(extract_id_from_url)
    current_work_df = current_work_df.drop_duplicates(subset='file_id', keep='last')
    print("Done.")
    return current_work_df


# Load batches 1-6 and keep the file IDs of rows marked as 'Done'.
df = get_current_batches_df([1, 2, 3, 4, 5, 6])
# NOTE(review): file_id is None for links extract_id_from_url cannot parse;
# such an entry would reach the Drive calls below — consider dropping them.
file_ids = df[df['completion_status'] == 'Done']['file_id'].to_list()
print(len(file_ids))

In [None]:
# Two file IDs shown as the cell output. The list is not bound to a name,
# so no other cell reads it.
['1sI5fewjKVoEWwO3OVuVzOBvVVER0ZYso',
'15AXjZoIhlO9GZl9BTKxSh9JHJkIl_Qh2']

In [None]:
# Real run: dry_run=False, so keepForever update requests are sent for every
# file in file_ids. Failed requests are collected per file in failed_revision_updates.
failed_revision_updates = defaultdict(list)
stats_all = update_files_in_parallel(service_account_path, file_ids, failed_revision_updates=failed_revision_updates, max_workers=10, dry_run=False)
print(summarize_stats(stats_all))

In [None]:
# Retry only the files that had failures in the previous run. The earlier
# failures stay available as first_run_failures, and a fresh mapping is used
# for the retry so that afterwards failed_revision_updates lists only the
# files that are still failing.
first_run_failures = failed_revision_updates
retry_file_ids = list(first_run_failures)
print('Rerun for', len(retry_file_ids))
failed_revision_updates = defaultdict(list)
second_run_stats_all = update_files_in_parallel(service_account_path, retry_file_ids, failed_revision_updates=failed_revision_updates, max_workers=10, dry_run=False)
print(summarize_stats(second_run_stats_all))

In [None]:
# Show the first file ID that has recorded failures, or None when every
# update succeeded (indexing an empty list would raise IndexError).
next(iter(failed_revision_updates), None)

In [None]:
# Show the failure records of one file. .get() is used because indexing a
# defaultdict with a missing key would insert an empty entry and make the
# file look like it had failures.
failed_revision_updates.get('1XJeEyRu7hsqFakQUtwPBfHzgmpuA8inW', [])

In [None]:
# Number of file IDs that currently have an entry in the failure mapping.
len(failed_revision_updates)

In [None]:
# Show the failure records of one file. .get() is used because indexing a
# defaultdict with a missing key would insert an empty entry and make the
# file look like it had failures.
failed_revision_updates.get('1RBuZglp5fQFRfmKaAfGRzvzHOPygz-1s', [])

In [None]:


def get_revision_before_timestamp(file_id, timestamp):
    """Return the latest revision modified at or before ``timestamp``.

    Relies on two names that are not defined in this cell: ``drive_service``
    (the Drive API client) and ``gmt_plus_2h_timezone`` (the timezone assumed
    for naive timestamps).

    Args:
        file_id: ID of the file whose revisions are searched.
        timestamp: Anything ``pd.to_datetime`` accepts. Naive values are
            localised to ``gmt_plus_2h_timezone``; timezone-aware values are
            used as they are.

    Returns:
        The matching revision dict, or ``None`` when listing the revisions
        failed or no revision is old enough. If several revisions share the
        latest qualifying time, the first one listed is returned.
    """
    revisions = []
    list_kwargs = {'fileId': file_id}
    try:
        # Follow nextPageToken so files with many revisions are fully covered.
        while True:
            response = drive_service.revisions().list(**list_kwargs).execute()
            revisions.extend(response.get('revisions', []))
            next_token = response.get('nextPageToken')
            if not next_token:
                break
            list_kwargs['pageToken'] = next_token
    except Exception as e:
        # Any API error ends up here, not only a missing file, so report the
        # actual cause.
        print(f"Could not list revisions for file {file_id}: {e}")
        return None

    cutoff = pd.to_datetime(timestamp)
    if cutoff.tzinfo is None:
        # tz_localize raises on values that already carry a timezone.
        cutoff = cutoff.tz_localize(gmt_plus_2h_timezone)

    # Parse each modifiedTime once and keep only revisions up to the cutoff.
    eligible = []
    for revision in revisions:
        modified_time = pd.to_datetime(revision['modifiedTime'])
        if modified_time <= cutoff:
            eligible.append((modified_time, revision))

    if not eligible:
        return None
    # max() keeps the first of equally late revisions, like a strict '>' scan.
    return max(eligible, key=lambda pair: pair[0])[1]



def get_file_id_from_task_link(task_link):
    """Return the last path segment of ``task_link`` as the file ID.

    The ``#fragment`` and ``?query`` parts are removed before the path is
    split, so a ``/`` inside them cannot be mistaken for a path separator. A
    trailing ``/`` is ignored.

    Returns None, after printing the offending value, when ``task_link`` is
    not a string.
    """
    if not isinstance(task_link, str):
        print('ERROR' + '='*60)
        print(task_link)
        return None
    path = task_link.split('#')[0].split('?')[0].rstrip('/')
    return path.split("/")[-1]

## Add a new column to the DataFrame for the file IDs
#selected_rows_df['file_id'] = selected_rows_df['Task Link [Google Colab]'].apply(get_file_id_from_task_link)

# Apply the function to each row in the DataFrame
#selected_rows_df['revision'] = selected_rows_df.apply(lambda row: get_revision_before_timestamp(row['file_id'], row['Timestamp']) if row['file_id'] is not None else None, axis=1)