### Visualizar archivos BSON en Pandas (Vissoft14)

In [6]:
from pathlib import Path
import pandas as pd
from bson import decode_all

def read_bson_files_from_folder(folder_path):
    folder = Path(folder_path)
    bson_files = list(folder.glob("*.bson"))
    
    if not bson_files:
        print("No se encontraron archivos BSON en la carpeta.")
        return pd.DataFrame()
    
    dfs = []
    
    for bson_file in bson_files:
        with open(bson_file, "rb") as file:
            data = decode_all(file.read())
            dfs.append(pd.DataFrame(data))
    
    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

# Uso:
folder_path = "./datasets/vissoft14"
df = read_bson_files_from_folder(folder_path)
display(df)

Unnamed: 0,_id,committer,url,comments_url,commit,parents,author,files,sha,stats,...,following,public_gists,company,blog,location,email,hireable,bio,members_url,public_members_url
0,50ae31f1edecb54bbc00019c,{'url': 'https://api.github.com/users/trustin'...,https://api.github.com/repos/arya/netty/commit...,https://api.github.com/repos/arya/netty/commit...,{'url': 'https://api.github.com/repos/arya/net...,[{'url': 'https://api.github.com/repos/arya/ne...,{'url': 'https://api.github.com/users/trustin'...,[{'blob_url': 'https://github.com/arya/netty/b...,00056f585f5cb04285ebf110f83a6a26bbc4ff1a,"{'deletions': 4, 'total': 7, 'additions': 3}",...,,,,,,,,,,
1,50e52bb1edecb55805002b65,"{'type': 'User', 'login': 'normanmaurer', 'rep...",https://api.github.com/repos/netty/netty/commi...,https://api.github.com/repos/netty/netty/commi...,{'tree': {'url': 'https://api.github.com/repos...,[{'url': 'https://api.github.com/repos/netty/n...,"{'type': 'User', 'login': 'normanmaurer', 'rep...",[{'contents_url': 'https://api.github.com/repo...,000688fab0075eed850b686841343e3ef96ab905,"{'deletions': 4, 'additions': 5, 'total': 9}",...,,,,,,,,,,
2,50b5cc9aedecb52a5d000096,{'repos_url': 'https://api.github.com/users/tr...,https://api.github.com/repos/netty/netty/commi...,https://api.github.com/repos/netty/netty/commi...,"{'comment_count': 0, 'tree': {'sha': 'fbd17501...",[{'sha': 'b9b23663618bbf7eb27393a4f278f1ee0a0f...,{'repos_url': 'https://api.github.com/users/tr...,"[{'filename': 'pom.xml', 'contents_url': 'http...",0007e919233ff86a3fbf8080029f5fe2d8e4a02e,"{'additions': 709, 'deletions': 713, 'total': ...",...,,,,,,,,,,
3,522ced0dbd3543b496004ec2,"{'login': 'normanmaurer', 'id': 439362, 'avata...",https://api.github.com/repos/netty/netty/commi...,https://api.github.com/repos/netty/netty/commi...,"{'author': {'name': 'Norman Maurer', 'email': ...",[{'sha': '4a5c8402719586497a240e74a7893b3ccad9...,"{'login': 'normanmaurer', 'id': 439362, 'avata...",[{'sha': '0b3d2d11288a6d941cd2b8c8f2fb81c1e5b6...,0007fb81efa7e08c4b3f8c3a99a15cf4b0d5c1bc,"{'total': 50, 'additions': 50, 'deletions': 0}",...,,,,,,,,,,
4,50fe5c6ebd35439fb1000612,"{'type': 'User', 'url': 'https://api.github.co...",https://api.github.com/repos/netty/netty/commi...,https://api.github.com/repos/netty/netty/commi...,"{'comment_count': 0, 'url': 'https://api.githu...",[{'url': 'https://api.github.com/repos/netty/n...,"{'type': 'User', 'url': 'https://api.github.co...",[{'filename': 'codec-http/src/main/java/io/net...,00096ddb6207f097decd351fe8371a6cd3a6a807,"{'deletions': 6, 'total': 12, 'additions': 6}",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64681,53048bfebd3543f67700cb5c,,https://api.github.com/users/daami,,,,,,,,...,,,,,,,,,,
64682,530ab964bd3543abef037e89,,https://api.github.com/users/victorhjh,,,,,,,,...,,,,,,,,,,
64683,531149b8bd3543d5d70001ae,,https://api.github.com/users/echarles,,,,,,,,...,,,,,,,,,,
64684,52a9cb55bd3543c0f1000306,,https://api.github.com/users/treejames,,,,,,,,...,,,,,,,,,,


### Convertir archivo BSON a JSON (Vissoft14)

In [1]:
import json
from pathlib import Path
from bson import decode_all
from bson.json_util import dumps

def bson_to_json(input_folder, output_folder):
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    output_path.mkdir(exist_ok=True)

    for bson_file in input_path.glob("*.bson"):
        try:
            with open(bson_file, "rb") as f:
                bson_data = decode_all(f.read())
            
            json_file = output_path / f"{bson_file.stem}.json"
            
            with open(json_file, "w", encoding="utf-8") as f:
                f.write(dumps(bson_data, indent=4))
            
            print(f"Convertido: {bson_file.name} -> {json_file.name}")
        except Exception as e:
            print(f"Error al procesar {bson_file.name}: {e}")

bson_to_json("./datasets/vissoft14/", "./datasets2/vissoft14")

Convertido: commits.bson -> commits.json
Convertido: commit_comments.bson -> commit_comments.json
Convertido: followers.bson -> followers.json
Convertido: forks.bson -> forks.json
Convertido: issues.bson -> issues.json
Convertido: issue_comments.bson -> issue_comments.json
Convertido: issue_events.bson -> issue_events.json
Convertido: org_members.bson -> org_members.json
Convertido: pull_requests.bson -> pull_requests.json
Convertido: pull_request_comments.bson -> pull_request_comments.json
Convertido: repos.bson -> repos.json
Convertido: repo_collaborators.bson -> repo_collaborators.json
Convertido: repo_labels.bson -> repo_labels.json
Convertido: users.bson -> users.json
Convertido: watchers.bson -> watchers.json
