<a href="https://colab.research.google.com/github/hliuson/gh-pr-discussions/blob/feature%2Fgithub-data-pipeline/GitHubDataPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os
import re
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from google.colab import files

## Setup

In [None]:
# GitHub personal access token. It is read from the GITHUB_TOKEN environment
# variable (e.g. set it with `%env GITHUB_TOKEN=...` in a cell that is not
# committed) so the secret never has to be pasted into the notebook itself.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")

# Headers sent with every GitHub REST API request made by this notebook.
HEADERS = {
    'Accept': 'application/vnd.github+json'
}

# Only authenticate when a token is actually configured, so an empty
# credential is never sent to the API.
if GITHUB_TOKEN:
    HEADERS['Authorization'] = f'token {GITHUB_TOKEN}'

REQUEST_DELAY = 2       # seconds to sleep before each PR search / comment request
MAX_REPOS = 100         # page size requested from the repository search
MAX_PRS_PER_REPO = 10   # intended cap on PRs processed per repository

## Discover the Repositories

In [None]:
def searchRepos(language='python', min_stars=1000, min_forks=100,
                pushed_after='2024-01-01'):
  """Search GitHub for popular, recently pushed, non-archived repositories.

  Args:
    language: value for the `language:` search qualifier.
    min_stars: repositories must have more than this many stars.
    min_forks: repositories must have more than this many forks.
    pushed_after: date string (YYYY-MM-DD) for the `pushed:>` qualifier.

  Returns:
    The list under the `items` key of the search response (one dict per
    repository), or an empty list when the request fails or the API answers
    with a non-200 status.
  """

  search_url = "https://api.github.com/search/repositories"

  queries = [
      f"stars:>{min_stars}",
      f"forks:>{min_forks}",
      f"language:{language}",
      f"pushed:>{pushed_after}",
      "archived:false"
  ]

  params = {
      'q': ' '.join(queries),
      'sort': 'stars',
      'order': 'desc',
      # The REST API accepts at most 100 results per page.
      'per_page': min(MAX_REPOS, 100)
  }

  print(f"Repo query: {params['q']}")

  try:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(search_url, headers=HEADERS, params=params, timeout=30)
  except requests.RequestException as exc:
    print(f"Error: request failed -  {exc}")
    return []

  if response.status_code != 200:
    print(f"Error: {response.status_code} -  {response.text}")
    return []

  return response.json().get('items', [])

In [None]:
def filterRepos(repos):
  """Select repositories that pass the quality heuristics.

  A repository is kept when it has at least 1000 stars and 100 forks, between
  6 and 499 open issues, a size above 100, a license, a description longer
  than 20 characters, and is neither archived nor disabled.

  Args:
    repos: repository dicts as returned by the GitHub search API.

  Returns:
    A list of flattened summary dicts, one per accepted repository, in the
    same order as the input.
  """

  def passes(repo):
    # Popularity thresholds.
    if repo['stargazers_count'] < 1000 or repo['forks_count'] < 100:
      return False
    # Some issue activity, but not an overwhelming backlog.
    if not 5 < repo['open_issues_count'] < 500:
      return False
    if repo['size'] <= 100:
      return False
    if not repo.get('license'):
      return False
    if not repo.get('description') or len(repo['description']) <= 20:
      return False
    return not (repo['archived'] or repo['disabled'])

  def summarize(repo):
    license_info = repo['license']
    return {
        'id': repo['id'],
        'name': repo['name'],
        'full_name': repo['full_name'],
        'description': repo['description'][:200],
        'stars': repo['stargazers_count'],
        'forks': repo['forks_count'],
        'language': repo['language'],
        'open_issues': repo['open_issues_count'],
        'updated_at': repo['updated_at'],
        'pushed_at': repo['pushed_at'],
        'license': license_info['name'] if license_info else 'Unknown',
        'size': repo['size'],
        'has_wiki': repo['has_wiki'],
        'has_pages': repo['has_pages'],
        'url': repo['html_url']
    }

  return [summarize(repo) for repo in repos if passes(repo)]

## Extract Pull Request Discussions

In [None]:
def searchPRsWithComments(repo_fullName, max_prs=50):
    """Find the most-commented pull requests of one repository.

    Args:
        repo_fullName: repository in `owner/name` form.
        max_prs: number of results to request (the API allows at most 100
            per page, so larger values are clamped).

    Returns:
        The list under the `items` key of the search response, with a
        `repository_full_name` key added to every item. An empty list is
        returned when the request fails or the API answers with a non-200
        status.
    """

    search_url = "https://api.github.com/search/issues"

    # Search for PRs with comments in this specific repo
    query = f"repo:{repo_fullName} type:pr comments:>0"

    params = {
        'q': query,
        'sort': 'comments',     # Sort by number of comments
        'order': 'desc',        # Most comments first
        'per_page': min(max_prs, 100)
    }

    print(f"Searching for PRs with comments in {repo_fullName}")
    time.sleep(REQUEST_DELAY)

    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(search_url, headers=HEADERS, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Error searching PRs: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error searching PRs: {response.status_code}")
        return []

    prs = response.json().get('items', [])

    # The search items carry no plain "owner/name" field, so record it here
    # for the later processing stages.
    for pr in prs:
        pr['repository_full_name'] = repo_fullName

    # Return the annotated list itself: calling response.json() again would
    # re-parse the payload and drop the annotation added above.
    return prs

In [None]:
def filterPRs(prs):
  """Keep pull requests that look worth mining for discussion.

  A PR is kept when it has at least one comment, is not a draft, and has
  either a title longer than 10 characters or a body longer than 30
  characters. Rejections for "no comments" (first three only) and "draft"
  are reported on stdout.

  Args:
    prs: pull request dicts (search API items).

  Returns:
    The accepted PR dicts, in input order.
  """
  print(f"    Filtering {len(prs)} PRs...")
  kept = []
  skipped_without_comments = 0

  for pr in prs:
    title_ok = len(pr.get('title', '')) > 10
    body_ok = pr.get('body') and len(pr['body']) > 30
    comment_total = pr.get('comments', 0)
    commented = comment_total > 0
    is_draft = pr.get('draft', False)

    if commented and not is_draft and (title_ok or body_ok):
      kept.append(pr)
      continue

    if comment_total == 0:
      skipped_without_comments += 1
      # Cap the log at three entries so large batches do not flood the output.
      if skipped_without_comments <= 3:
        print(f"    Filtered out PR #{pr.get('number', '?')}: No comments")
      continue

    if is_draft:
      print(f"    Filtered out PR #{pr.get('number', '?')}: Draft PR")

  return kept

In [None]:
def getComments(repo_fullName, pr_number):
  """Fetch all conversation comments of one pull request.

  Uses the issue-comments endpoint and follows the `next` relation of the
  response's Link header until every page has been read. Without this only
  the first page (the API's default page size) would be returned.

  Args:
    repo_fullName: repository in `owner/name` form.
    pr_number: number of the pull request.

  Returns:
    A list of comment dicts. If a request fails, the comments collected up to
    that point are returned, which is an empty list when the very first
    request fails.
  """

  url = f"https://api.github.com/repos/{repo_fullName}/issues/{pr_number}/comments"
  params = {'per_page': 100}  # largest page size, to keep the request count low
  comments = []

  while url:
    time.sleep(REQUEST_DELAY)

    try:
      # A timeout keeps a stalled connection from hanging the notebook forever.
      response = requests.get(url, headers=HEADERS, params=params, timeout=30)
    except requests.RequestException as exc:
      print(f"Error getting comments for PR #{pr_number}: {exc}")
      break

    if response.status_code != 200:
      print(f"Error getting comments for PR #{pr_number}: {response.status_code}")
      print(f"      Error details: {response.text}")
      break

    comments.extend(response.json())

    # The "next" URL already carries its own query string, so the explicit
    # params are only sent with the first request.
    url = response.links.get('next', {}).get('url')
    params = None

  return comments

In [None]:
def processComments(comments, pr_data):
  """Drop bot, trivial and boilerplate comments and flatten the rest.

  A comment is discarded when its author looks like a bot, when its text
  (ignoring surrounding whitespace) is shorter than 30 characters, when it
  matches one of the boilerplate patterns ("lgtm", "cc @...", ...), or when
  it starts with a minor-fix prefix such as "fix:" or "lint:". Comments whose
  `body` or `user` is missing/None are handled instead of raising.

  Args:
    comments: comment dicts as returned by the issue-comments endpoint.
    pr_data: the pull request dict the comments belong to.

  Returns:
    A list of flat dicts, one per kept comment, combining PR and comment
    fields.
  """

  bot_patterns = [
      'bot', 'Bot', '[bot]', 'github-actions', 'dependabot',
      'codecov', 'travis', 'circleci', 'sonarcloud'
  ]

  unwanted_patterns = [
      r'^lgtm$',
      r'^👍$',
      r'^thanks$',
      r'^ping @',
      r'^cc @',
      r'^\+1$',
      r'^approved$',
      r'^merge$'
  ]

  quality_comments = []

  for comment in comments:
    # Either field can be None in the payload; fall back to neutral values
    # rather than crashing in the middle of a long extraction run.
    user = comment.get('user') or {}
    username = user.get('login') or 'Unknown'
    body = comment.get('body') or ''

    # Every text check looks at the same whitespace-stripped text so padding
    # cannot defeat the length or prefix filters.
    text = body.strip()

    is_bot = (any(pattern in username for pattern in bot_patterns) or
              user.get('type') == 'Bot')

    is_too_short = len(text) < 30

    is_unwanted = any(re.match(pattern, text, re.IGNORECASE)
                      for pattern in unwanted_patterns)

    is_minor_fix = re.match(r'^(fix:|type:|lint:|format:)', text, re.IGNORECASE)

    if is_bot or is_too_short or is_unwanted or is_minor_fix:
      continue

    quality_comments.append({
        'pr_title': pr_data.get('title', 'Unknown'),
        'pr_body': (pr_data.get('body') or '')[:500],
        'pr_number': pr_data.get('number', 0),
        'comment_id': comment['id'],
        'author': username,
        'body': body,
        'created_at': comment['created_at'],
        'updated_at': comment['updated_at'],
        'repository': pr_data.get('repository_full_name', 'Unknown'),
        'comment_length': len(body)
    })

  return quality_comments

## Data Management

In [None]:
def saveJSON(data, filename):
    """Write `data` to `filename` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped.

    Returns:
        The `filename` that was written.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=2)

    print(f"Data saved to {filename}")
    return filename

def summaryDisplay(data, data_type="data"):
    """Print a short summary of a collection and hand it back unchanged.

    Always prints a header and the item count. When the first item is a dict,
    its keys and a preview of its first three fields are printed as well, with
    values longer than 100 characters cut off and suffixed with "...".

    Args:
        data: the collection to summarise.
        data_type: label used (upper-cased) in the header line.

    Returns:
        `data`, unmodified.
    """
    print(f"\n=== {data_type.upper()} SUMMARY ===")
    print(f"Total items: {len(data)}")

    # Nothing more to show for empty collections or non-dict items.
    if not (data and isinstance(data[0], dict)):
        return data

    first_item = data[0]
    print("Sample item keys:", list(first_item.keys()))
    print("First item preview:")
    for key, value in list(first_item.items())[:3]:
        text = str(value)
        if len(text) > 100:
            text = text[:100] + "..."
        print(f"  {key}: {text}")

    return data

## Main Pipeline

In [None]:
def repositoryDiscovery():
    """Run stage 1: search for repositories, filter them, and persist the result.

    Searches for Python repositories with more than 1000 stars, applies
    `filterRepos`, writes the survivors to 'high_quality_repos.json' and
    prints a summary.

    Returns:
        The filtered list of repository summaries.
    """
    print("=== STAGE 1: REPOSITORY DISCOVERY ===")

    # Raw search hits.
    candidates = searchRepos(language='python', min_stars=1000)
    print(f"Found {len(candidates)} repositories from search")

    # Quality filter.
    quality_repos = filterRepos(candidates)
    print(f"Filtered to {len(quality_repos)} high-quality repositories")

    # Persist and report.
    saveJSON(quality_repos, 'high_quality_repos.json')
    summaryDisplay(quality_repos, "repositories")

    return quality_repos

In [None]:
def prDiscussionExtraction(repos, max_repos=10, max_prs_per_repo=None):
    """Run stage 2: collect quality PR comments from a list of repositories.

    For each of the first `max_repos` repositories, searches its most
    commented PRs, filters them, downloads the comments of the first
    `max_prs_per_repo` accepted PRs and keeps the comments that pass
    `processComments`. A non-empty result is written to
    'pr_discussions_cleaned.json'.

    Args:
        repos: repository dicts; each must have a 'full_name' key.
        max_repos: how many repositories from the front of `repos` to process.
        max_prs_per_repo: how many filtered PRs to process per repository;
            defaults to the module-level MAX_PRS_PER_REPO.

    Returns:
        The list of cleaned comment dicts collected across all repositories.
    """
    # Resolved at call time so the configuration cell can be re-run with a
    # different value without redefining this function.
    if max_prs_per_repo is None:
        max_prs_per_repo = MAX_PRS_PER_REPO

    print("=== STAGE 2: PR DISCUSSION EXTRACTION ===")

    all_discussions = []

    for repo in repos[:max_repos]:  # Limit the number of repositories processed
        # Get pull requests
        prs = searchPRsWithComments(repo['full_name'], max_prs=100)
        print(f"  Total PRs found: {len(prs)}")

        # Filter substantial PRs
        substantial_prs = filterPRs(prs)
        print(f"Found {len(substantial_prs)} substantial PRs")

        # Process each PR
        for pr in substantial_prs[:max_prs_per_repo]:  # Limit PRs per repo
            print(f"  Processing PR #{pr['number']}: {pr['title'][:50]}...")

            # Get comments
            comments = getComments(repo['full_name'], pr['number'])
            print(f"  Raw comments retrieved: {len(comments)}")

            # Clean and filter comments
            quality_comments = processComments(comments, pr)
            print(f"  Quality comments after filtering: {len(quality_comments)}")

            all_discussions.extend(quality_comments)

    print(f"\nTotal quality discussions collected: {len(all_discussions)}")

    # Save to JSON (the file is only written when something was collected)
    if all_discussions:
        saveJSON(all_discussions, 'pr_discussions_cleaned.json')
        summaryDisplay(all_discussions, "discussions")

    return all_discussions

## Pipeline Execution

In [None]:
# Stand-alone repository record for ad-hoc testing; none of the pipeline cells
# shown in this notebook reference it.
test_repo = dict(
    full_name='microsoft/vscode',  # Very active with lots of discussions
    name='vscode',
)

In [None]:
# Stage 1: search GitHub for candidate repositories and keep those that pass
# the quality filter. `repos` is consumed by the stage 2 cell.
print("Start of GitHub Pipeline")

repos = repositoryDiscovery()

Start of GitHub Pipeline
=== STAGE 1: REPOSITORY DISCOVERY ===
Repo query: stars:>1000 forks:>100 language:python pushed:>2024-01-01 archived:false
Found 100 repositories from search
Filtered to 55 high-quality repositories
Data saved to high_quality_repos.json

=== REPOSITORIES SUMMARY ===
Total items: 55
Sample item keys: ['id', 'name', 'full_name', 'description', 'stars', 'forks', 'language', 'open_issues', 'updated_at', 'pushed_at', 'license', 'size', 'has_wiki', 'has_pages', 'url']
First item preview:
  id: 13491895
  name: free-programming-books
  full_name: EbookFoundation/free-programming-books


In [None]:
# Stage 2: mine PR discussions from the repositories found in stage 1, then
# print a closing report.
discussions = prDiscussionExtraction(repos)

print(
    "\n Pipeline completed!",
    f" Collected {len(repos)} repositories and {len(discussions)} discussions",
    "\n Output files:",
    "  - high_quality_repos.json",
    "  - pr_discussions_cleaned.json",
    sep="\n",
)

=== STAGE 2: PR DISCUSSION EXTRACTION ===
Searching for PRs with comments in EbookFoundation/free-programming-books
  Total PRs found: 100
    Filtering 100 PRs...
    Filtered out PR #2235: No comments
    Filtered out PR #6799: No comments
    Filtered out PR #6878: No comments
Found 94 substantial PRs
  Processing PR #6614: move the translated documentation files to a docs ...
  Raw comments retrieved: 6
  Quality comments after filtering: 1
  Processing PR #7818: Telugu courses added...
  Raw comments retrieved: 12
  Quality comments after filtering: 1
  Processing PR #10030: Translating ZH for Contributing document (zh revie...
  Raw comments retrieved: 8
  Quality comments after filtering: 0
  Processing PR #3166: Format desc base on CONTRIBUTING-zh and Optimize z...
  Raw comments retrieved: 8
  Quality comments after filtering: 1
  Processing PR #3050: 8 books...
  Raw comments retrieved: 6
  Quality comments after filtering: 1
  Processing PR #11781: Added Playground for diffe

In [None]:
# Download the stage 1 output via the google.colab `files` helper (Colab only).
files.download('high_quality_repos.json')

[{'id': 13491895, 'name': 'free-programming-books', 'full_name': 'EbookFoundation/free-programming-books', 'description': ':books: Freely available programming books', 'stars': 363434, 'forks': 63796, 'language': 'Python', 'open_issues': 37, 'updated_at': '2025-07-22T03:23:53Z', 'pushed_at': '2025-06-28T02:59:36Z', 'license': 'Creative Commons Attribution 4.0 International', 'size': 19604, 'has_wiki': False, 'has_pages': True, 'url': 'https://github.com/EbookFoundation/free-programming-books'}, {'id': 21289110, 'name': 'awesome-python', 'full_name': 'vinta/awesome-python', 'description': 'An opinionated list of awesome Python frameworks, libraries, software and resources.', 'stars': 251296, 'forks': 26012, 'language': 'Python', 'open_issues': 479, 'updated_at': '2025-07-22T03:38:59Z', 'pushed_at': '2025-07-17T16:35:51Z', 'license': 'Other', 'size': 6839, 'has_wiki': False, 'has_pages': True, 'url': 'https://github.com/vinta/awesome-python'}, {'id': 63476337, 'name': 'Python', 'full_nam

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Stage 2 only writes this file when at least one discussion was collected, so
# check for it first instead of letting the download fail on a missing file.
if os.path.exists('pr_discussions_cleaned.json'):
    files.download('pr_discussions_cleaned.json')
else:
    print("pr_discussions_cleaned.json was not created (no discussions collected)")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>