In [1]:
pip install loguru

Defaulting to user installation because normal site-packages is not writeable
Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
     |████████████████████████████████| 61 kB 391 kB/s
Installing collected packages: loguru
Successfully installed loguru-0.7.3
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pymongo

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import shutil
import subprocess
import tempfile
import uuid
from typing import Dict
from loguru import logger
from pymongo import MongoClient, errors

In [4]:
# Connection settings for the MongoDB instance that stores scraped repositories.
MONGO_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "github_scraper"
COLLECTION_NAME = "repositories"

# Shared handles used by the cells below: client -> database -> collection.
client = MongoClient(MONGO_URI)
db = client.get_database(DATABASE_NAME)
collection = db.get_collection(COLLECTION_NAME)

In [None]:
class GithubCrawler:
    """Clones a GitHub repository and stores its file contents in MongoDB.

    Each entry of ``ignore`` is used in two ways:

    * a directory whose name equals an entry is skipped entirely, at any depth
      (so ``.git`` is skipped but ``.github`` is not);
    * a file whose name ends with an entry is skipped.
    """

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        # Stored as a tuple so it can be handed directly to ``str.endswith``.
        self._ignore = tuple(ignore)

    def extract(self, link: str, user: Dict) -> None:
        """Extracts content from a GitHub repository and saves it to MongoDB.

        Args:
            link: Clone URL of the repository. It is also the key used to detect
                repositories that were already stored.
            user: Author record; must provide the ``"id"`` and ``"full_name"`` keys.

        Clone failures and MongoDB errors are logged rather than raised. The
        temporary clone directory is always removed.
        """
        # Check if repository already exists
        if collection.find_one({"link": link}):
            logger.info(f"Repository already exists in the database: {link}")
            return

        logger.info(f"Starting to scrape GitHub repository: {link}")
        repo_name = link.rstrip("/").split("/")[-1]
        # Clone URLs may end in ".git"; that suffix is not part of the repository name.
        if repo_name.endswith(".git"):
            repo_name = repo_name[: -len(".git")]

        try:
            # The context manager removes the clone on every exit path.
            with tempfile.TemporaryDirectory() as local_temp:
                # "--" stops git from interpreting a link that starts with "-" as an option.
                subprocess.run(["git", "clone", "--", link], check=True, cwd=local_temp)

                # The clone is the only entry git created inside the temp directory.
                repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])
                tree = self._read_tree(repo_path)

            # save the repo data to mongodb
            repo_data = {
                "_id": str(uuid.uuid4()),
                "name": repo_name,
                "link": link,
                "content": tree,
                "platform": "github",
                "author_id": user["id"],
                "author_full_name": user["full_name"],
            }
            collection.insert_one(repo_data)
            logger.info(f"Repository {repo_name} saved successfully.")
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to clone repository: {e}")
        except errors.PyMongoError as e:
            logger.error(f"Failed to save data to MongoDB: {e}")

        logger.info(f"Finished scraping GitHub repository: {link}")

    def _read_tree(self, repo_path: str) -> Dict[str, str]:
        """Maps each non-ignored file's repo-relative path to its stripped text."""
        tree = {}
        for root, dirs, files in os.walk(repo_path):
            # Prune in place so os.walk never descends into ignored directories.
            dirs[:] = [d for d in dirs if d not in self._ignore]

            rel_dir = os.path.relpath(root, repo_path)
            # relpath yields "." for the repository root; files there get no prefix.
            parents = [] if rel_dir == os.curdir else rel_dir.split(os.sep)

            for file in files:
                if file.endswith(self._ignore):
                    continue
                # Keys always use "/" so stored paths do not depend on the host OS.
                file_path = "/".join(parents + [file])
                try:
                    # Undecodable bytes (binary files) are dropped rather than raising.
                    with open(
                        os.path.join(root, file), "r", encoding="utf-8", errors="ignore"
                    ) as f:
                        tree[file_path] = f.read().strip()
                except OSError as e:
                    logger.warning(f"Failed to read file {file_path}: {e}")
        return tree

In [None]:
# Run the crawler over a small set of ROS 2 repositories, attributed to a throwaway user.
crawler = GithubCrawler()
test_user = dict(id=str(uuid.uuid4()), full_name="Test User")

# Repositories to ingest; commented-out entries are currently disabled.
test_links = [
    "https://github.com/ros-controls/ros2_controllers",
    # "https://github.com/IFRA-Cranfield/ros2_SimRealRobotControl",
    "https://github.com/ros-controls/ros2_control_demos",
    "https://github.com/IntelligentSystemsLabUTV/ros2-examples",
    # "https://github.com/ros-navigation/navigation2"
]

# extract() skips links that are already stored, so re-running this cell is safe.
for link in test_links:
    crawler.extract(link=link, user=test_user)

2024-12-06 23:48:31.728 | INFO     | __main__:extract:9 - Repository already exists in the database: https://github.com/ros-controls/ros2_controllers
2024-12-06 23:48:31.748 | INFO     | __main__:extract:9 - Repository already exists in the database: https://github.com/ros-controls/ros2_control_demos
2024-12-06 23:48:31.753 | INFO     | __main__:extract:9 - Repository already exists in the database: https://github.com/IntelligentSystemsLabUTV/ros2-examples


In [None]:
# Reuse the `collection` handle created in the configuration cell instead of
# re-importing MongoClient and rebuilding the connection from duplicated literals.
# Counting on the server avoids loading every repository's full content into
# memory just to take its length.
print("Total documents:", collection.count_documents({}))

Total documents: 3


In [17]:
# Reuse the `collection` handle created in the configuration cell. The projection
# asks MongoDB for the `_id` field only, so the stored file contents are never
# transferred.
ids = [doc["_id"] for doc in collection.find({}, {"_id": 1})]

print(ids)

['d4b9ba47-79ef-4cac-ac5b-a00ecab94779', '16fbe7f7-f3f9-4afa-9b6c-65af435d70e1', '8dc64a03-a856-4109-b7f9-8e13c62d4b44']
