<a href="https://colab.research.google.com/github/ipeirotis-org/scholar_v2/blob/main/notebooks/refreshing_authors_Extracting_percentiles_for_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade google-cloud-tasks google-cloud-secret-manager google-auth



# Run Colab's user-authentication helper before any Google Cloud client is
# created in the cells below.
from google.colab import auth
auth.authenticate_user()

from tqdm import tqdm
import pandas as pd
from google.cloud import firestore



In [None]:
# @title Refresh publications using Tasks Queue

from google.cloud import tasks_v2
import google.auth

credentials, _ = google.auth.default()
db = firestore.Client(credentials=credentials, project="scholar-version2")

# Create a client for Cloud Tasks
client = tasks_v2.CloudTasksClient()

# Your Google Cloud project ID and location
project = 'scholar-version2'
location = 'northamerica-northeast1'
queue = 'process-authors'

# Construct the fully qualified queue names
authors_queue = client.queue_path(project, location, queue)
pubs_queue = client.queue_path(project, location, 'process-pubs')


def _enqueue_get(queue_path, url):
    """Add an HTTP GET task for `url` to the Cloud Tasks queue `queue_path`.

    Returns the value returned by `client.create_task`.
    """
    task = {
        'http_request': {
            'http_method': tasks_v2.HttpMethod.GET,
            'url': url,
        }
    }
    return client.create_task(request={"parent": queue_path, "task": task})


PAGE_SIZE = 20
collection_ref = db.collection('scholar_raw_author')
# Only the first PAGE_SIZE documents in 'timestamp' order are processed per run.
query = collection_ref.order_by('timestamp').limit(PAGE_SIZE)
for doc in query.stream():
    author = doc.to_dict().get('data', None)
    if not author:
        continue

    author_id = author.get('scholar_id')
    if not author_id:
        # Without an id the URLs below would end in ".../author/None".
        print(f'Skipping document {doc.id}: no scholar_id')
        continue

    # A record without a publication list still gets its author refreshed.
    publications = author.get('publications') or []

    print(f'{author_id}: {len(publications)} publications')
    url = f'https://scholar.ipeirotis.org/api/author/{author_id}?no_cache=true'
    response = _enqueue_get(authors_queue, url)

    for pub in tqdm(publications):
        pub_id = pub.get('author_pub_id')
        if not pub_id:
            # Nothing to address this publication by; skip it instead of
            # aborting the whole run with a KeyError.
            continue
        url = f'https://scholar.ipeirotis.org/api/author/{author_id}/publication/{pub_id}'
        response = _enqueue_get(pubs_queue, url)

In [None]:
# @title Create a table with the analytics of the authors

# Initialize Firestore (ensure you have your credentials configured)
db = firestore.Client(credentials=credentials, project="scholar-version2")

# One dict per author document; turned into a DataFrame afterwards.
entries = []

PAGE_SIZE = 1000
collection_ref = db.collection('author_stats')
# Ordering by 'timestamp' gives the pagination cursor a stable sort key.
base_query = collection_ref.order_by('timestamp')

# Snapshot of the last document read so far; None until the first page is read.
last_document_snapshot = None

while True:
    # Resume right after the last document of the previous page, if any.
    query = base_query
    if last_document_snapshot is not None:
        query = query.start_after(last_document_snapshot)
    query = query.limit(PAGE_SIZE)

    page_count = 0
    for doc in tqdm(query.stream()):
        # `or {}` also covers keys that are present but hold None, which
        # `.get(key, {})` alone would pass through and crash on.
        record = doc.to_dict() or {}
        data = record.get('data') or {}
        stats = data.get('stats') or {}
        entries.append({
            "scholar_id": data.get('scholar_id'),
            "hindex": data.get('hindex'),
            "years_active": stats.get('years_active'),
            "pip_auc": stats.get('pip_auc'),
            "pip_auc_percentile": stats.get('pip_auc_percentile'),
            "total_publications": stats.get('total_publications'),
            "total_publications_percentile": stats.get('total_publications_percentile'),
            "citedby": data.get('citedby'),
        })
        # Only the most recent snapshot is needed as the next cursor, so the
        # page's documents are not kept in memory.
        last_document_snapshot = doc
        page_count += 1

    if page_count < PAGE_SIZE:
        # A short (or empty) page means the collection is exhausted; stopping
        # here avoids one extra query that would return nothing.
        break

# `entries` is converted to a DataFrame in the next cell.



In [None]:
# Build a DataFrame from the collected `entries` (one dict per row).
df = pd.DataFrame(entries)

In [None]:
# Show the dtype of each column of `df`.
df.dtypes

In [None]:
# Convert both columns to a numeric dtype; with errors='coerce', values that
# cannot be parsed as numbers become NaN instead of raising.
df['years_active'] = pd.to_numeric(df['years_active'], errors='coerce')
df['total_publications_percentile'] = pd.to_numeric(
    df['total_publications_percentile'],
    errors='coerce',
)

In [None]:
# Display the DataFrame as the cell output.
df

In [None]:
%config InlineBackend.figure_format='retina'

In [None]:
import numpy as np

# Define a custom aggregation function
def percentile(n):
    """Build an aggregation callable that computes the n-th percentile.

    Args:
        n: Percentile to compute, passed to ``np.percentile`` as ``q``.

    Returns:
        A one-argument function that returns ``np.percentile(values, n)``.
        Its ``__name__`` is set to ``percentile_<n>``, presumably so that each
        aggregate gets a distinct label when several are used together.
    """
    label = f'percentile_{n}'

    def percentile_(values):
        return np.percentile(values, q=n)

    percentile_.__name__ = label
    return percentile_

# Percentiles (0, 10, 25, 50, 75, 90) of total_publications for each value of
# years_active, restricted to rows with years_active <= 50.
pvt = df.query('years_active<=50').pivot_table(
    index='years_active',
    values='total_publications',
    aggfunc=[percentile(q) for q in (0, 10, 25, 50, 75, 90)],
)

In [None]:
# Plot the percentile pivot table with grid lines enabled.
pvt.plot(grid=True)

In [None]:
# Median h-index for each value of years_active (rows with years_active <= 50).
# The column is stored under the key 'hindex' in `entries`; 'h_index' does not
# exist and made pivot_table raise a KeyError. `df` is used rather than a
# fresh DataFrame because its years_active column has already been coerced to
# numeric, which the `years_active<=50` comparison relies on.
df.query('years_active<=50').pivot_table(
    values='hindex',
    index='years_active',
    aggfunc='median',
).plot(grid=True)