## Import ASFI Datasets

In [23]:
import pandas as pd
import json

# Dataset and Target Variables file paths
repositories = './datasets/1-apache_project_status.json'
datasetPath = './datasets/2-clean-apache-network-data.csv'

## Metrics Dataset

In [24]:
dfMetrics = pd.read_csv(datasetPath)
with open(repositories, 'r') as f:
    reposAll = json.load(f)

print("Metrics Dataset:")
display(dfMetrics.head())

Metrics Dataset:


Unnamed: 0,s_num_nodes,s_weighted_mean_degree,s_num_component,s_avg_clustering_coef,s_largest_component,s_graph_density,t_num_dev_nodes,t_num_file_nodes,t_num_dev_per_file,t_num_file_per_dev,t_graph_density,proj_name,month,st_num_dev,t_net_overlap,s_net_overlap
0,13,74.153846,1,0.687463,13,0.384615,2,201,1.059701,106.5,0.529851,abdera,0,1,0.0,0.0
1,15,34.133333,1,0.392751,15,0.247619,3,218,1.252294,91.0,0.417431,abdera,1,2,0.191358,0.196429
2,18,22.0,2,0.399824,14,0.156863,3,171,1.140351,65.0,0.380117,abdera,2,2,0.147436,0.14
3,15,22.666667,1,0.449899,15,0.228571,1,195,1.0,195.0,1.0,abdera,3,0,0.235897,0.1875
4,16,19.0,2,0.163095,14,0.141667,2,72,1.069444,38.5,0.534722,abdera,4,1,0.139706,0.170732


## ASFI REPOS

In [30]:
print("Repos Dataset:")
display(reposAll)

print(f"Total No. of ASFI Repos: ", len(reposAll.get("graduated", []))+len(reposAll.get("incubating", []))+len(reposAll.get("retired", [])))

Repos Dataset:


{'graduated': ['abdera',
  'accumulo',
  'ace',
  'activemq',
  'airavata',
  'airflow',
  'allura',
  'ambari',
  'oltu',
  'any23',
  'apex',
  'apisix',
  'apollo',
  'aries',
  'asterixdb',
  'atlas',
  'aurora',
  'batchee',
  'beam',
  'bval',
  'beehive',
  'bigtop',
  'bloodhound',
  'brooklyn',
  'buildr',
  'calcite',
  'carbondata',
  'cassandra',
  'cayenne',
  'celix',
  'chemistry',
  'chukwa',
  'clerezza',
  'click',
  'cloudstack',
  'commonsrdf',
  'cordova',
  'couchdb',
  'crunch',
  'ctakes',
  'curator',
  'cxf',
  'daffodil',
  'datafu',
  'datasketches',
  'deltacloud',
  'deltaspike',
  'derby',
  'devicemap',
  'directmemory',
  'directory',
  'bookkeeper',
  'dolphinscheduler',
  'drill',
  'druid',
  'dubbo',
  'eagle',
  'ant',
  'echarts',
  'empire',
  'esme',
  'etch',
  'falcon',
  'felix',
  'fineract',
  'flex',
  'flink',
  'flume',
  'fluo',
  'freemarker',
  'ftpserver',
  'geode',
  'geronimo',
  'giraph',
  'gobblin',
  'gora',
  'griffin',
  'gr

Total No. of ASFI Repos:  330


# Graduated and Retired Repos

In [12]:
reposGraduatedAndRetired = reposAll["graduated"] + reposAll["retired"]
display(reposGraduatedAndRetired)
print("Total Graduated and Retired Projects = ", len(reposGraduatedAndRetired))

['abdera',
 'accumulo',
 'ace',
 'activemq',
 'airavata',
 'airflow',
 'allura',
 'ambari',
 'oltu',
 'any23',
 'apex',
 'apisix',
 'apollo',
 'aries',
 'asterixdb',
 'atlas',
 'aurora',
 'batchee',
 'beam',
 'bval',
 'beehive',
 'bigtop',
 'bloodhound',
 'brooklyn',
 'buildr',
 'calcite',
 'carbondata',
 'cassandra',
 'cayenne',
 'celix',
 'chemistry',
 'chukwa',
 'clerezza',
 'click',
 'cloudstack',
 'commonsrdf',
 'cordova',
 'couchdb',
 'crunch',
 'ctakes',
 'curator',
 'cxf',
 'daffodil',
 'datafu',
 'datasketches',
 'deltacloud',
 'deltaspike',
 'derby',
 'devicemap',
 'directmemory',
 'directory',
 'bookkeeper',
 'dolphinscheduler',
 'drill',
 'druid',
 'dubbo',
 'eagle',
 'ant',
 'echarts',
 'empire',
 'esme',
 'etch',
 'falcon',
 'felix',
 'fineract',
 'flex',
 'flink',
 'flume',
 'fluo',
 'freemarker',
 'ftpserver',
 'geode',
 'geronimo',
 'giraph',
 'gobblin',
 'gora',
 'griffin',
 'groovy',
 'guacamole',
 'hama',
 'harmony',
 'hawq',
 'hcatalog',
 'helix',
 'pubscribe',
 'h

Total Graduated and Retired Projects =  293


## Forks, Stars and PRs Scraping For Graduated and Retired Repos 

In [14]:
# Graduated and Retired

import requests
import json
from config import GITHUB_TOKEN

GITHUB_API_URL = "https://api.github.com/repos/apache/{}"
headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}

repo_data = []
failed_repos = []

for repo in reposGraduatedAndRetired:
    try:
        response = requests.get(GITHUB_API_URL.format(repo), headers=headers)
        response.raise_for_status()
        data = response.json()
        stars = data.get("stargazers_count", 0)
        forks = data.get("forks_count", 0)
        pr_response = requests.get(GITHUB_API_URL.format(repo) + "/pulls?state=all", headers=headers)
        pr_response.raise_for_status()
        pr_count = len(pr_response.json())
        repo_data.append({
            "repo": repo,
            "stars": stars,
            "forks": forks,
            "pull_requests": pr_count
        })
        print(f"Repo: {repo}, Stars: {stars}, Forks: {forks}, PRs: {pr_count}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {repo}: {e}")
        failed_repos.append({"repo": repo, "error": str(e)})

with open("./datasets/3-repo_data.json", "w") as f:
    json.dump(repo_data, f, indent=4)

with open("./datasets/4-failed_repos.json", "w") as f:
    json.dump(failed_repos, f, indent=4)

Repo: abdera, Stars: 18, Forks: 24, PRs: 4
Repo: accumulo, Stars: 1091, Forks: 455, PRs: 30
Repo: ace, Stars: 27, Forks: 23, PRs: 16
Repo: activemq, Stars: 2345, Forks: 1455, PRs: 30
Repo: airavata, Stars: 119, Forks: 126, PRs: 30
Repo: airflow, Stars: 39190, Forks: 14797, PRs: 30
Repo: allura, Stars: 133, Forks: 35, PRs: 8
Repo: ambari, Stars: 2182, Forks: 1698, PRs: 30
Repo: oltu, Stars: 163, Forks: 120, PRs: 17
Repo: any23, Stars: 96, Forks: 55, PRs: 30
Error fetching data for apex: 404 Client Error: Not Found for url: https://api.github.com/repos/apache/apex
Repo: apisix, Stars: 14893, Forks: 2561, PRs: 30
Error fetching data for apollo: 404 Client Error: Not Found for url: https://api.github.com/repos/apache/apollo
Repo: aries, Stars: 113, Forks: 158, PRs: 30
Repo: asterixdb, Stars: 287, Forks: 143, PRs: 30
Repo: atlas, Stars: 1908, Forks: 873, PRs: 30
Repo: aurora, Stars: 634, Forks: 231, PRs: 30
Error fetching data for batchee: 404 Client Error: Not Found for url: https://api.gi

In [None]:
import json
def count_json_objects(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
        if isinstance(data, list):
            return len(data)
x_count = count_json_objects("./datasets/3-repo_data.json")
y_count = count_json_objects("./datasets/4-failed_repos.json")

print(f"Total Scraped Graduated and Retired Projects: {x_count}")
print(f"Number of Repositories Failed to be Scraped: {y_count}")

Total Scraped Graduated and Retired Projects: 192
Number of Repositories Failed to be Scraped: 101


## Handle failed Graduated and Retired Repos

In [None]:
import requests
import json
import os
from config import GITHUB_TOKEN

custom_repos = {
    "Ibatis": "https://github.com/mybatis/mybatis-3",
    "Jaxme": "https://github.com/jax-ml/jax",
    "Avalon": "https://github.com/avalonphp/avalon",
    "Juice": "https://github.com/fff-rs/juice",
    "Kabuki": "https://github.com/carllerche/kabuki",
    "Beehive": "https://github.com/moparisthebest/beehive"
}

apache_repos = {
    "Apex": "apex-core",
    "Apollo": "activemq-apollo",
    "Batchee": "incubator-batchee",
    "Commonsrdf": "commons-rdf",
    "Devicemap": "devicemap-browsermap",
    "Empire": "empire-db",
    "Flex": "flex-sdk",
    "Guacamole": "guacamole-client",
    "Pubscribe": "infrastructure-pypubsub",
    "Ivy": "ant-ivy",
    "Jdo": "db-jdo",
    "Log4cxx": "logging-log4cxx",
    "Log4net": "logging-log4net",
    "Log4php": "logging-log4php",
    "Mynewt": "mynewt-core",
    "Commons": "commons-lang",
    "Olingo": "olingo-odata4",
    "Omid": "incubator-omid",
    "Parquet": "parquet-mr",
    "Rat": "creadur-rat",
    "Servicecomb": "servicecomb-java-chassis",
    "Tapestry": "tapestry-5",
    "Tephra": "incubator-tephra",
    "Tuscany": "tuscany-sca-cpp",
    "Uima": "uima-uimaj",
    "Amaterasu": "incubator-retired-amaterasu-site",
    "Ariatosca": "incubator-ariatosca",
    "Axion": "ws-axiom",
    "Blur": "incubator-blur",
    "Cmda": "incubator-cmda",
    "Composer": "openwhisk-composer",
    "Concerted": "incubator-concerted",
    "Corinthia": "incubator-corinthia",
    "Cotton": "incubator-cotton",
    "Edgent": "incubator-edgent",
    "Gearpump": "incubator-gearpump",
    "Gossip": "incubator-gossip",
    "Hdt": "incubator-hdt",
    "Horn": "incubator-horn",
    "Htrace": "incubator-retired-htrace",
    "Iota": "incubator-iota",
    "Mrql": "incubator-retired-mrql",
    "Myriad": "incubator-myriad",
    "Nmaven": "incubator-myriad",
    "Openaz": "incubator-retired-openaz",
    "Pirk": "incubator-retired-pirk",
    "Provisionr": "incubator-retired-provisionr",
    "Quickstep": "incubator-retired-quickstep",
    "Ripple": "incubator-retired-ripple",
    "S2graph": "incubator-s2graph",
    "S4": "incubator-retired-s4",
    "Samoa": "incubator-samoa",
    "Slider": "incubator-retired-slider",
    "Tamaya": "incubator-retired-tamaya",
    "Taverna": "incubator-taverna-engine",
    "Warble": "incubator-warble-website",
    "Wave": "incubator-retired-wave",
    "Weex": "incubator-weex",
    "Xmlbeans": "xmlbeans",
    "Yoko": "geronimo-yoko",
    "Zeta": "zetacomponents",
    "Wsrp4j": "ws-wss4j",
    "Httpd-cli": "httpd",
    "Log4cxx2": "logging-log4cxx",
}

GITHUB_API_URL = "https://api.github.com/repos/apache/{}"
headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}

repo_data = {}

if os.path.exists("./datasets/3-repo_data.json"):
    with open("./datasets/3-repo_data.json", "r") as f:
        try:
            existing_data = json.load(f)
            if isinstance(existing_data, list):
                repo_data = {item["repo"]: item for item in existing_data}
            elif isinstance(existing_data, dict):
                repo_data = existing_data
        except json.JSONDecodeError:
            repo_data = {}
failed_repos = []
for key, repo in apache_repos.items():
    if repo in repo_data:
        print(f"Skipping {key}, already in data.")
        continue
    try:
        response = requests.get(GITHUB_API_URL.format(repo), headers=headers)
        response.raise_for_status()
        data = response.json()
        stars = data.get("stargazers_count", 0)
        forks = data.get("forks_count", 0)
        pr_response = requests.get(GITHUB_API_URL.format(repo) + "/pulls?state=all", headers=headers)
        pr_response.raise_for_status()
        pr_count = len(pr_response.json())
        repo_data[key] = {
            "repo": key,
            "stars": stars,
            "forks": forks,
            "pull_requests": pr_count
        }
        print(f"Repo: {key}, Stars: {stars}, Forks: {forks}, PRs: {pr_count}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {key}: {e}")
        failed_repos.append({"repo": key, "error": str(e)})
for key, url in custom_repos.items():
    if url in repo_data:
        print(f"Skipping {key}, already in data.")
        continue
    try:
        response = requests.get(f"https://api.github.com/repos/{url.replace('https://github.com/', '')}", headers=headers)
        response.raise_for_status()
        data = response.json()
        stars = data.get("stargazers_count", 0)
        forks = data.get("forks_count", 0)
        pr_response = requests.get(f"https://api.github.com/repos/{url.replace('https://github.com/', '')}/pulls?state=all", headers=headers)
        pr_response.raise_for_status()
        pr_count = len(pr_response.json())
        repo_data[key] = {
            "repo": key,
            "stars": stars,
            "forks": forks,
            "pull_requests": pr_count
        }
        print(f"Repo: {key}, Stars: {stars}, Forks: {forks}, PRs: {pr_count}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {key}: {e}")
        failed_repos.append({"repo": key, "error": str(e)})

with open("./datasets/3-repo_data.json", "w") as f:
    json.dump(repo_data, f, indent=4)

with open("./datasets/4-failed_repos.json", "w") as f:
    json.dump(failed_repos, f, indent=4)

Repo: Apex, Stars: 349, Forks: 173, PRs: 30
Repo: Apollo, Stars: 153, Forks: 70, PRs: 10
Repo: Batchee, Stars: 14, Forks: 17, PRs: 20
Repo: Commonsrdf, Stars: 47, Forks: 42, PRs: 30
Repo: Devicemap, Stars: 9, Forks: 5, PRs: 0
Repo: Empire, Stars: 83, Forks: 23, PRs: 16
Repo: Flex, Stars: 355, Forks: 144, PRs: 30
Repo: Guacamole, Stars: 1477, Forks: 737, PRs: 30
Repo: Pubscribe, Stars: 4, Forks: 3, PRs: 9
Repo: Ivy, Stars: 69, Forks: 112, PRs: 30
Repo: Jdo, Stars: 31, Forks: 18, PRs: 30
Repo: Log4cxx, Stars: 282, Forks: 124, PRs: 30
Repo: Log4net, Stars: 877, Forks: 334, PRs: 30
Repo: Log4php, Stars: 100, Forks: 81, PRs: 22
Repo: Mynewt, Stars: 857, Forks: 370, PRs: 30
Repo: Commons, Stars: 2777, Forks: 1625, PRs: 30
Repo: Olingo, Stars: 169, Forks: 191, PRs: 30
Repo: Omid, Stars: 88, Forks: 60, PRs: 30
Repo: Parquet, Stars: 2754, Forks: 1453, PRs: 30
Repo: Rat, Stars: 30, Forks: 48, PRs: 30
Repo: Servicecomb, Stars: 1921, Forks: 824, PRs: 30
Repo: Tapestry, Stars: 122, Forks: 95, PRs: 

In [None]:
import json

def count_json_objects(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
        if isinstance(data, dict):
            return len(data.keys())
        elif isinstance(data, list):
            return len(data)
        else:
            return 0

x_count = count_json_objects("./datasets/3-repo_data.json")
y_count = count_json_objects("./datasets/4-failed_repos.json")
print(f"Final Scraped Graduated and Retired Projects: {x_count}")
print(f"Number of Graduated and Retired Projects Failed to be Scraped: {y_count}")

Final Scraped Graduated and Retired Projects: 260
Number of Graduated and Retired Projects Failed to be Scraped: 0


## Incubating Projects

In [34]:
reposIncubating = reposAll["incubating"]
display(reposIncubating)
print("Total Projects in Incubating Phase = ", len(reposIncubating))

['age',
 'annotator',
 'bluemarlin',
 'brpc',
 'crail',
 'datalab',
 'doris',
 'eventmesh',
 'flagon',
 'heron',
 'hivemall',
 'hop',
 'inlong',
 'kyuubi',
 'liminal',
 'linkis',
 'livy',
 'marvin',
 'milagro',
 'mxnet',
 'nemo',
 'nlpcraft',
 'nuttx',
 'pagespeed',
 'pegasus',
 'ponymail',
 'sdap',
 'sedona',
 'shenyu',
 'spot',
 'streampipes',
 'teaclave',
 'toree',
 'training',
 'tuweni',
 'wayang',
 'yunikorn']

Total Projects in Incubating Phase =  37


## Forks, Stars and PRs Scraping For Incubating Repositories 

In [35]:
import requests
import json
from config import GITHUB_TOKEN

GITHUB_API_URL = "https://api.github.com/repos/apache/{}"
headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}

repo_data = []
failed_repos = []

for repo in reposIncubating:
    try:
        response = requests.get(GITHUB_API_URL.format(repo), headers=headers)
        response.raise_for_status()
        data = response.json()
        stars = data.get("stargazers_count", 0)
        forks = data.get("forks_count", 0)
        pr_response = requests.get(GITHUB_API_URL.format(repo) + "/pulls?state=all", headers=headers)
        pr_response.raise_for_status()
        pr_count = len(pr_response.json())
        repo_data.append({
            "repo": repo,
            "stars": stars,
            "forks": forks,
            "pull_requests": pr_count
        })
        print(f"Repo: {repo}, Stars: {stars}, Forks: {forks}, PRs: {pr_count}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {repo}: {e}")
        failed_repos.append({"repo": repo, "error": str(e)})

with open("./datasets/5-repo_data_incubating.json", "w") as f:
    json.dump(repo_data, f, indent=4)
with open("./datasets/6-failed_repos_incubating.json", "w") as f:
    json.dump(failed_repos, f, indent=4)

Repo: age, Stars: 3398, Forks: 424, PRs: 30
Error fetching data for annotator: 404 Client Error: Not Found for url: https://api.github.com/repos/apache/annotator
Error fetching data for bluemarlin: 404 Client Error: Not Found for url: https://api.github.com/repos/apache/bluemarlin
Repo: brpc, Stars: 16848, Forks: 4016, PRs: 30
Error fetching data for crail: 404 Client Error: Not Found for url: https://api.github.com/repos/apache/crail
Error fetching data for datalab: 404 Client Error: Not Found for url: https://api.github.com/repos/apache/datalab
Repo: doris, Stars: 13330, Forks: 3426, PRs: 30
Repo: eventmesh, Stars: 1639, Forks: 640, PRs: 30
Repo: flagon, Stars: 25, Forks: 15, PRs: 30
Error fetching data for heron: 404 Client Error: Not Found for url: https://api.github.com/repos/apache/heron
Error fetching data for hivemall: 404 Client Error: Not Found for url: https://api.github.com/repos/apache/hivemall
Repo: hop, Stars: 1095, Forks: 361, PRs: 30
Repo: inlong, Stars: 1423, Forks: 5

In [36]:
import json
def count_json_objects(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
        if isinstance(data, list):
            return len(data)
x_count = count_json_objects("./datasets/5-repo_data_incubating.json")
y_count = count_json_objects("./datasets/6-failed_repos_incubating.json")

print(f"Total Scraped Incubating Projects: {x_count}")
print(f"Number of Incubating Repos Failed to be Scraped: {y_count}")

Total Scraped Incubating Projects: 16
Number of Incubating Repos Failed to be Scraped: 21


## Handle failed Incubating Repos

In [37]:
import requests
import json
import os
from config import GITHUB_TOKEN

reposIncubating = {
    "Annotator": "incubator-annotator",
    "Bluemarlin": "incubator-bluemarlin",
    "Crail": "incubator-crail",
    "Datalab": "incubator-datalab",
    "Heron": "incubator-heron",
    "Hivemall": "incubator-hivemall",
    "Liminal": "incubator-liminal",
    "Livy": "incubator-livy",
    "Marvin": "incubator-marvin",
    "Milagro": "incubator-milagro",
    "Nemo": "incubator-nemo",
    "NLPCraft": "incubator-nlpcraft",
    "Spot": "incubator-spot",
    "Teaclave": "incubator-teaclave",
    "Toree": "incubator-toree",
    "Training": "incubator-training",
    "Tuweni": "incubator-tuweni",
    "Wayang": "incubator-wayang",
    "Pagespeed": "incubator-pagespeed-mod",
    "SDAP": "sdap-in-situ-data-services",
    "Yunikorn": "yunikorn-core"
}

GITHUB_API_URL = "https://api.github.com/repos/apache/{}"
headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
repo_data = {}

if os.path.exists("repo_data_incubating.json"):
    with open("repo_data_incubating.json", "r") as f:
        try:
            existing_data = json.load(f)
            if isinstance(existing_data, list):
                repo_data = {item["repo"]: item for item in existing_data}
            elif isinstance(existing_data, dict):
                repo_data = existing_data
        except json.JSONDecodeError:
            repo_data = {}

failed_repos = []
for key, repo in reposIncubating.items():
    print(key, repo)
    if repo in repo_data:
        print(f"Skipping {key}, already in data.")
        continue
    try:
        response = requests.get(GITHUB_API_URL.format(repo), headers=headers)
        response.raise_for_status()
        data = response.json()
        stars = data.get("stargazers_count", 0)
        forks = data.get("forks_count", 0)
        pr_response = requests.get(GITHUB_API_URL.format(repo) + "/pulls?state=all", headers=headers)
        pr_response.raise_for_status()
        pr_count = len(pr_response.json())
        repo_data[key] = {
            "repo": key,
            "stars": stars,
            "forks": forks,
            "pull_requests": pr_count
        }
        print(f"Repo: {key}, Stars: {stars}, Forks: {forks}, PRs: {pr_count}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {key}: {e}")
        failed_repos.append({"repo": key, "error": str(e)})
with open("./datasets/5-repo_data_incubating.json", "w") as f:
    json.dump(repo_data, f, indent=4)
with open("./datasets/6-failed_repos_incubating.json", "w") as f:
    json.dump(failed_repos, f, indent=4)

Annotator incubator-annotator
Repo: Annotator, Stars: 231, Forks: 41, PRs: 30
Bluemarlin incubator-bluemarlin
Repo: Bluemarlin, Stars: 2, Forks: 7, PRs: 30
Crail incubator-crail
Repo: Crail, Stars: 149, Forks: 46, PRs: 30
Datalab incubator-datalab
Repo: Datalab, Stars: 153, Forks: 57, PRs: 30
Heron incubator-heron
Repo: Heron, Stars: 3630, Forks: 593, PRs: 30
Hivemall incubator-hivemall
Repo: Hivemall, Stars: 311, Forks: 117, PRs: 30
Liminal incubator-liminal
Repo: Liminal, Stars: 144, Forks: 42, PRs: 30
Livy incubator-livy
Repo: Livy, Stars: 904, Forks: 606, PRs: 30
Marvin incubator-marvin
Repo: Marvin, Stars: 101, Forks: 34, PRs: 30
Milagro incubator-milagro
Repo: Milagro, Stars: 42, Forks: 13, PRs: 30
Nemo incubator-nemo
Repo: Nemo, Stars: 112, Forks: 64, PRs: 30
NLPCraft incubator-nlpcraft
Repo: NLPCraft, Stars: 79, Forks: 25, PRs: 30
Spot incubator-spot
Repo: Spot, Stars: 350, Forks: 227, PRs: 30
Teaclave incubator-teaclave
Repo: Teaclave, Stars: 773, Forks: 160, PRs: 30
Toree inc

In [38]:
import json

def count_json_objects(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
        if isinstance(data, dict):
            return len(data.keys())
        elif isinstance(data, list):
            return len(data)
        else:
            return 0

x_count = count_json_objects("./datasets/5-repo_data_incubating.json")
y_count = count_json_objects("./datasets/6-failed_repos_incubating.json")
print(f"Final Scraped Incubating Projects: {x_count}")
print(f"Number of Incubating Projects Failed to be Scraped: {y_count}")

Final Scraped Incubating Projects: 37
Number of Incubating Projects Failed to be Scraped: 0


> Now, the final scraped graduated and retired projects are **260** which are stored in `3-repo_data.json` and total scraped incubating projects are **37** which are stored in `5-repo_data_incubating.json`!

Total ASFI Repos = 297

## Calculate PScore and Define Popularity (Target Variable-Popular)

In [57]:
import json
import pandas as pd

# Calculate pScore
def load_and_process(filename):
    with open(filename, "r") as file:
        try:
            data = json.load(file)
            if isinstance(data, dict):
                data = list(data.values())
            elif not isinstance(data, list):
                raise ValueError(f"Unexpected format in {filename}: {type(data)}")
        except json.JSONDecodeError:
            raise ValueError(f"Error loading {filename}: Invalid JSON format")
    for repo in data:
        repo["pScore"] = repo["stars"] + repo["forks"] + (repo["pull_requests"] ** 2)
    return data

repo_data_1 = load_and_process("./datasets/3-repo_data.json")
repo_data_2 = load_and_process("./datasets/5-repo_data_incubating.json")
merged_data = repo_data_1 + repo_data_2

df = pd.DataFrame(merged_data)
df_sorted = df.sort_values(by="pScore", ascending=False)
display(df_sorted)

Unnamed: 0,repo,stars,forks,pull_requests,pScore
50,echarts,62125,19696,30,82721
160,superset,64987,14678,30,80565
152,spark,40744,28550,30,70194
47,dubbo,40819,26504,30,68223
5,airflow,39190,14797,30,54887
...,...,...,...,...,...
108,nuvem,2,3,2,9
246,Warble,1,2,1,4
220,Cmda,1,3,0,4
216,Amaterasu,0,1,0,1


In [None]:
# Calculate pScore_normalized and popularity
min_score = df_sorted['pScore'].min()
max_score = df_sorted['pScore'].max()
df_sorted_norm = df_sorted
df_sorted_norm['pScore_normalized'] = (df_sorted['pScore'] - min_score) / (max_score - min_score)
# df_sorted_norm['popular'] = df_sorted['pScore_normalized'].apply(lambda x: 1 if x >= 0.5 else 0)
threshold = df_sorted_norm['pScore_normalized'].quantile(0.30)  # Adjust this value as per requirement
df_sorted_norm['popular'] = df_sorted_norm['pScore_normalized'].apply(lambda x: 1 if x >= threshold else 0)
display(df_sorted_norm)

Unnamed: 0,repo,stars,forks,pull_requests,pScore,pScore_normalized,popular
50,echarts,62125,19696,30,82721,1.000000,1
160,superset,64987,14678,30,80565,0.973936,1
152,spark,40744,28550,30,70194,0.848563,1
47,dubbo,40819,26504,30,68223,0.824736,1
5,airflow,39190,14797,30,54887,0.663520,1
...,...,...,...,...,...,...,...
108,nuvem,2,3,2,9,0.000109,0
246,Warble,1,2,1,4,0.000048,0
220,Cmda,1,3,0,4,0.000048,0
216,Amaterasu,0,1,0,1,0.000012,0


In [61]:
popular_counts = df_sorted_norm['popular'].value_counts()
print("Popularity counts (1-Popular, 0-Not Popular):")
print(popular_counts)

Popularity counts (1-Popular, 0-Not Popular):
popular
1    208
0     89
Name: count, dtype: int64


In [63]:
df_sorted_norm.to_csv('./datasets/final-popularity-dataset.csv', index=False)
print("Popularity Dataset is stored as final-popularity-dataset.csv!")

Popularity Dataset is stored as final-popularity-dataset.csv!


In [None]:
import pandas as pd

df_popularity = pd.read_csv('./datasets/final-popularity-dataset.csv')
df_apache = pd.read_csv('./datasets/2-clean-apache-network-data.csv')
df_popularity['repo'] = df_popularity['repo'].astype(str).str.lower().str.strip()
df_apache['proj_name'] = df_apache['proj_name'].astype(str).str.lower().str.strip()
merged_df = pd.merge(df_popularity, df_apache, left_on='repo', right_on='proj_name', how='inner')
merged_df.to_csv('./datasets/final-dataset.csv', index=False)

display(merged_df.head())
print("Datasets merged successfully and saved as 'final-dataset.csv'!")

Unnamed: 0,repo,stars,forks,pull_requests,pScore,pScore_normalized,popular,s_num_nodes,s_weighted_mean_degree,s_num_component,...,t_num_dev_nodes,t_num_file_nodes,t_num_dev_per_file,t_num_file_per_dev,t_graph_density,proj_name,month,st_num_dev,t_net_overlap,s_net_overlap
0,echarts,62125,19696,30,82721,1.0,1,4,5.0,1,...,4,535,1.220561,163.25,0.30514,echarts,0,0,0.0,0.0
1,echarts,62125,19696,30,82721,1.0,1,5,4.8,2,...,4,414,1.881643,194.75,0.470411,echarts,1,0,0.064156,0.0
2,echarts,62125,19696,30,82721,1.0,1,6,9.333333,1,...,3,606,1.110561,224.333333,0.370187,echarts,2,0,0.017894,0.222222
3,echarts,62125,19696,30,82721,1.0,1,8,28.0,1,...,2,510,1.039216,265.0,0.519608,echarts,3,0,0.025705,0.25
4,echarts,62125,19696,30,82721,1.0,1,7,11.428571,1,...,3,60,1.5,30.0,0.5,echarts,4,0,0.057878,0.291667


Datasets merged successfully and saved as 'final-dataset.csv'!
