In [1]:
# import libraries 
import requests
from bs4 import BeautifulSoup
import csv
import json
import re
import os
from tqdm import tqdm

In [2]:
crew_issue_url = "https://github.com/crewAIInc/crewAI/issues"

In [13]:
# Function to scrape issues from a given page URL
def scrape_issues(page_url):
    response = requests.get(page_url)
    print(f"Requesting URL: {page_url} - Status Code: {response.status_code}")  # Debugging line
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all issue elements
        issues = soup.find_all("div", class_="js-issue-row")
        print(f"Found {len(issues)} issues on this page.")  # Debugging line

        # List to store scraped issues for this function call
        page_issues = []

        # Loop through the issues and extract details
        for issue in issues:
            title = issue.find("a", class_="Link--primary").text.strip()
            status = "Open" if "open" in issue["class"] else "Closed"  # Check if the issue is open or closed
            
            # Find the link to the issue to scrape details
            issue_link = "https://github.com" + issue.find("a", class_="Link--primary")["href"]
            issue_details = scrape_issue_details(issue_link)
            
            # Store the issue data in the list
            page_issues.append({
                "title": title,
                "status": status,
                "description": issue_details["description"],
                "labels": issue_details["labels"],
                "created_at": issue_details["created_at"]
            })
        
        return page_issues  # Return the list of issues found on this page
    else:
        print(f"Failed to retrieve issues. Status code: {response.status_code}")
        return []  # Return an empty list if no issues were found

# Function to scrape additional details from the individual issue page
def scrape_issue_details(issue_url):
    response = requests.get(issue_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract description
        description = soup.find("div", class_="js-comment")  # Find the first comment as the description
        description_text = description.get_text(strip=True) if description else "No description provided."

        # Extract labels
        labels = soup.find_all("span", class_="labels")
        label_texts = [label.get_text(strip=True) for label in labels]

        # Extract creation date
        created_at = soup.find("relative-time")["datetime"]

        return {
            "description": description_text,
            "labels": label_texts,
            "created_at": created_at
        }
    else:
        # Return a default dictionary with empty fields if the request fails
        return {
            "description": "Failed to retrieve issue details.",
            "labels": [],
            "created_at": "Unknown"
        }


# Function to scrape all issues from the repository
def scrape_all_issues(base_url):
    all_scraped_issues = []  # List to store all issues scraped across pages
    page_number = 1
    
    while True:
        print(f"Scraping page {page_number}...")
        page_url = f"{base_url}?page={page_number}&q=is%3Aissue"
        
        page_issues = scrape_issues(page_url)  # Get issues from the current page
        if not page_issues:
            break  # Stop if no issues were found on this page
        
        all_scraped_issues.extend(page_issues)  # Add the found issues to the total list
        page_number += 1  # Go to the next page

    return all_scraped_issues  # Return the complete list of scraped issues





In [4]:
# Start scraping all issues
scraped_issues = scrape_all_issues(crew_issue_url)

# Print the total number of scraped issues
print(f"Total issues scraped: {len(scraped_issues)}")

# Print or use the scraped issues with a progress bar
print("\nProcessing scraped issues:")
for issue in tqdm(scraped_issues, desc="Processing Issues", unit="issue"):
    # Print only a summary or limit the output for better visibility
    print(f"Title: {issue['title']}, Status: {issue['status']}")

Scraping page 1...
Requesting URL: https://github.com/crewAIInc/crewAI/issues?page=1&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 2...
Requesting URL: https://github.com/crewAIInc/crewAI/issues?page=2&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 3...
Requesting URL: https://github.com/crewAIInc/crewAI/issues?page=3&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 4...
Requesting URL: https://github.com/crewAIInc/crewAI/issues?page=4&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 5...
Requesting URL: https://github.com/crewAIInc/crewAI/issues?page=5&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 6...
Requesting URL: https://github.com/crewAIInc/crewAI/issues?page=6&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 7...
Requesting URL: https://github.com/crewAIInc/crewAI/issues?page=7&q=is%3Aissue - Status Code: 200
F

Processing Issues: 100%|██████████████████████████████████████████| 857/857 [00:00<00:00, 113003.19issue/s]

Title: [BUG] : CrewAI Doesn't seem to support VertexAI embedding models., Status: Closed
Title: [BUG] Can't invoke my llm in a crew?, Status: Closed
Title: [BUG] Chromadb failing to intialize, Status: Closed
Title: [BUG] CrewAI Tools -- DirectorySearchTool, FileReadTool --> Requires OPENAI-API-Key, Status: Closed
Title: [BUG] Error occurred after upgrading to the latest version, Status: Closed
Title: [BUG] 0.74.1 crewAI+ compatibility issue, Status: Closed
Title: [BUG] Hierarchical example from documentation doesn't work, Status: Closed
Title: [FEATURE] Construct Agents and Tasks fully from YAML config, Status: Closed
Title: Cannot import Crawl4AI AsyncWebCrawler within CrewAI, Status: Closed
Title: No module named 'tomllib' when updating crewai, Status: Closed
Title: [BUG] memory - azure openai embedder, Status: Closed
Title: [BUG] Setting up a manager agent using local Ollama LLMs., Status: Closed
Title: [BUG] Flow @listen with _and error., Status: Closed
Title: [BUG] Logs are not di




In [6]:
print(len(scraped_issues))

857


In [7]:
# Function to clean raw HTML and extract plain text
def clean_html(raw_html):
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text(strip=True)  # Get plain text and strip extra spaces

# Function to save the issues to a CSV file
def save_issues_to_csv(scraped_issues, folder="data", filename="github_issues.csv"):
    # Ensure the "data" folder exists
    os.makedirs(folder, exist_ok=True)
    
    # Full path to the CSV file inside the "data" folder
    file_path = os.path.join(folder, filename)
    
    # Open the CSV file for writing
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        
        # Write the header row
        writer.writerow(["Issue Title", "Issue Description"])
        
        # Iterate through each issue and write its details to the CSV
        for issue in scraped_issues:
            title = issue["title"]
            description_html = issue["description"]  # Assuming raw HTML is stored here
            
            # Clean the description from HTML
            clean_description = clean_html(description_html)
            
            # Write the title and cleaned description into the CSV
            writer.writerow([title, clean_description])




In [8]:
# Call the function to save the issues to a CSV
save_issues_to_csv(scraped_issues, filename="crew_ai_github_issues.csv")

  soup = BeautifulSoup(raw_html, "html.parser")


In [9]:
n8n = "https://github.com/n8n-io/n8n/issues" 
dify = "https://github.com/langgenius/dify/issues" 
volker = "https://github.com/strohne/volker/issues" 

In [14]:
scraped_issues = scrape_all_issues(n8n)
save_issues_to_csv(scraped_issues, filename="n8n_github_issues.csv")

scraped_issues = scrape_all_issues(dify)
save_issues_to_csv(scraped_issues, filename="dify_github_issues.csv")

scraped_issues = scrape_all_issues(volker)
save_issues_to_csv(scraped_issues, filename="volker_github_issues.csv")

Scraping page 1...
Requesting URL: https://github.com/n8n-io/n8n/issues?page=1&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 2...
Requesting URL: https://github.com/n8n-io/n8n/issues?page=2&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 3...
Requesting URL: https://github.com/n8n-io/n8n/issues?page=3&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 4...
Requesting URL: https://github.com/n8n-io/n8n/issues?page=4&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 5...
Requesting URL: https://github.com/n8n-io/n8n/issues?page=5&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 6...
Requesting URL: https://github.com/n8n-io/n8n/issues?page=6&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 7...
Requesting URL: https://github.com/n8n-io/n8n/issues?page=7&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page

  soup = BeautifulSoup(raw_html, "html.parser")
  k = self.parse_starttag(i)


Requesting URL: https://github.com/langgenius/dify/issues?page=1&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 2...
Requesting URL: https://github.com/langgenius/dify/issues?page=2&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 3...
Requesting URL: https://github.com/langgenius/dify/issues?page=3&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 4...
Requesting URL: https://github.com/langgenius/dify/issues?page=4&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 5...
Requesting URL: https://github.com/langgenius/dify/issues?page=5&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 6...
Requesting URL: https://github.com/langgenius/dify/issues?page=6&q=is%3Aissue - Status Code: 200
Found 25 issues on this page.
Scraping page 7...
Requesting URL: https://github.com/langgenius/dify/issues?page=7&q=is%3Aissue - Status Code: 200
Found 25 issues on this pag