In [None]:
import requests
import json
from google.colab import userdata
import sys
import logging


In [18]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)

if logger.hasHandlers():
    logger.handlers.clear()

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

file_handler = logging.FileHandler("api_extraction.log")
file_handler.setFormatter(formatter)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [19]:
try:
  GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
except userdata.SecretNotFoundError:
  print('Secret not found. Add the token to Colab Secrets.')

if not GITHUB_TOKEN:
  sys.exit('GITHUB TOKEN not found')

headers ={
    'Authorization': f'token {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github+json',
    'X-GitHub-Api-Version': '2022-11-28'
}


In [22]:
def check_rate_limit(response):
  limit = response.headers.get('X-RateLimit-Limit')
  remaining_limit= response.headers.get('X-RateLimit-Remaining')
  # logging.info(f'Remaining limit: {remaining_limit}/{limit}')

def search_repository(query, max_pages = 3):
  # logging.info(f'Searching for {query}')
  search_url = f'https://api.github.com/search/repositories'
  results = []
  page = 1

  while page <= max_pages:
    params = {
        'q': query,
        'per_page': 100,
        'page': page
    }
    try:
      response = requests.get(search_url, headers= headers, params=params)
      response.raise_for_status()
      check_rate_limit(response)
      data = response.json()
      for item in data['items']:
        results.append({
            'id': item['id'],
            'full_name': item['full_name'],
            'url': item['html_url'],
            'stars': item['stargazers_count']
        })
      logging.info(f'Page {page} processed correctly, total results {len(results)}')
      page += 1

    except requests.exceptions.RequestException as e:
      logging.error(f'Error while searching repositories. (Query: {query}) : {e}')
    except requests.exceptions.HTTPError as e:
      logging.error(f'HTTP Error. (Query: {query}) : {e}, Response text: {e.response.text}')
      break

    except Exception as e:
      logging.critical(f'Critical error while searching repositories. (Query : {query}) : {e}')
      break

  return results

In [21]:
search_results = search_repository("python data analysis", max_pages=2)
print(json.dumps(search_results[:7], indent=2))

2025-06-30 04:16:18,761 - INFO - Searching for python data analysis
2025-06-30 04:16:21,005 - INFO - Page 1 processed correctly, total results 100


Remaining limit: 29/30


2025-06-30 04:16:23,183 - INFO - Page 2 processed correctly, total results 200


Remaining limit: 28/30
[
  {
    "id": 85111422,
    "full_name": "WillKoehrsen/Data-Analysis",
    "url": "https://github.com/WillKoehrsen/Data-Analysis",
    "stars": 5345
  },
  {
    "id": 43759462,
    "full_name": "ujjwalkarn/DataSciencePython",
    "url": "https://github.com/ujjwalkarn/DataSciencePython",
    "stars": 5458
  },
  {
    "id": 138245018,
    "full_name": "iamseancheney/python_for_data_analysis_2nd_chinese_version",
    "url": "https://github.com/iamseancheney/python_for_data_analysis_2nd_chinese_version",
    "stars": 8389
  },
  {
    "id": 10648016,
    "full_name": "fonnesbeck/statistical-analysis-python-tutorial",
    "url": "https://github.com/fonnesbeck/statistical-analysis-python-tutorial",
    "stars": 1611
  },
  {
    "id": 108740787,
    "full_name": "apachecn/python_data_analysis_and_mining_action",
    "url": "https://github.com/apachecn/python_data_analysis_and_mining_action",
    "stars": 1767
  },
  {
    "id": 30730072,
    "full_name": "rhiever/D

In [24]:
def get_repository_commits(owner, repo, max_pages = 5):
  # logging.info(f'Getting commits for {owner}/{repo}')
  url = f'https://api.github.com/repos/{owner}/{repo}/commits'
  commits= []
  page_counter = 1

  while url and page_counter <= max_pages:
    try:
        response = requests.get(url, headers=headers, params={'per_page': 100})
        response.raise_for_status()
        commit = response.json()

        for item in commit:
          commits.append({
              'sha': item['sha'],
              'author': item['commit']['author']['name'],
              'date': item['commit']['author']['date'],
              'message': item['commit']['message']
          })

          logging.info(f'Page {page_counter} processed correctly, total commits {len(commits)}')

          if 'next' in response.links:
            url = response.links['next']['url']
          else:
            url = None
          page_counter += 1

    except requests.exceptions.RequestException as e:
      logging.error(f'Error while getting commits for {owner}/{repo}. (Owner: {owner}, Repo: {repo}) : {e}')

  return commits

In [25]:
owner = 'pandas-dev'
repo_name = 'pandas'
commits_data = get_repository_commits(owner, repo_name, max_pages=2)
print(json.dumps(commits_data[:2], indent=2))

2025-06-30 04:27:20,435 - INFO - Getting commits for pandas-dev/pandas
2025-06-30 04:27:21,133 - INFO - Page 1 processed correctly, total commits 1
2025-06-30 04:27:21,134 - INFO - Page 2 processed correctly, total commits 2
2025-06-30 04:27:21,135 - INFO - Page 3 processed correctly, total commits 3
2025-06-30 04:27:21,136 - INFO - Page 4 processed correctly, total commits 4
2025-06-30 04:27:21,137 - INFO - Page 5 processed correctly, total commits 5
2025-06-30 04:27:21,138 - INFO - Page 6 processed correctly, total commits 6
2025-06-30 04:27:21,140 - INFO - Page 7 processed correctly, total commits 7
2025-06-30 04:27:21,141 - INFO - Page 8 processed correctly, total commits 8
2025-06-30 04:27:21,142 - INFO - Page 9 processed correctly, total commits 9
2025-06-30 04:27:21,143 - INFO - Page 10 processed correctly, total commits 10
2025-06-30 04:27:21,144 - INFO - Page 11 processed correctly, total commits 11
2025-06-30 04:27:21,145 - INFO - Page 12 processed correctly, total commits 12

[
  {
    "sha": "35b0d1dcadf9d60722c055ee37442dc76a29e64c",
    "author": "Leo Gordon",
    "date": "2025-06-25T15:57:56Z",
    "message": "TST: Increase test coverage for pandas.io.formats.excel.py (#61697)\n\n* Added coverage 314-349 and change in excel.py\n\n* Adding further coverage for excel.py\n\n* Whitespace changes (whoops)\n\n* Whitespace changes (whoops)\n\n* Update .gitignore\n\n* Update test_common.py"
  },
  {
    "sha": "09f7cc0a3c7805a036777456e87fdfe73129b6fc",
    "author": "Niruta Talwekar",
    "date": "2025-06-25T11:23:02Z",
    "message": "DOC: update Slack invite link in community dos (#61704)"
  }
]


In [34]:
def get_repo_contents(owner, repo, path=""):
  url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
  try:
      response = requests.get(url, headers=headers)
      response.raise_for_status()
      contents = response.json()
      if isinstance(contents, list):
        contents_list = []
        for item in contents:
          contents_list.append({
              'name': item['name'],
              'type': item['type'],
              'download_url': item.get('download_url'),
              'size': item.get('size', 0)
          })
        return contents_list
      else:
        return {
                'name': contents.get('name'),
                'type': contents.get('type'),
                'download_url': contents.get('download_url'),
                'size': contents.get('size', 0),
                'encoding': contents.get('encoding')
            }
  except requests.exceptions.HTTPError as e:
    logging.error(f"Error fetching repository contents: {e}")
    return {"error": e.response.json()}
  except Exception as e:
    logging.error(f"Error fetching repository contents: {e}")
    return {"error": str(e)}


In [42]:
owner = 'google'
repo = 'generative-ai-python'
path = ''
repo_contents = get_repo_contents(owner, repo, path)
print(json.dumps(repo_contents[:2], indent=2))

[
  {
    "name": ".editorconfig",
    "type": "file",
    "download_url": "https://raw.githubusercontent.com/google-gemini/deprecated-generative-ai-python/main/.editorconfig",
    "size": 222
  },
  {
    "name": ".github",
    "type": "dir",
    "download_url": null,
    "size": 0
  }
]
