# Get full graph

Here we build the full AI ecosystem graph: a directed networkx graph (`nx.DiGraph`) in which every node is a model in the AI ecosystem and every edge points from a parent model to a model derived from it. All types of relations are included -- finetunes, quantizations, merges, and adapters.

We define all attributes on the nodes, except for attributes pertaining to the model cards. The graph is then pickled so that it can be reused in later analyses.

In [1]:
import ast

import networkx as nx
import numpy as np
import pandas as pd

# Load the expanded AI ecosystem dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/ai_ecosystem.csv")


In [2]:
# Create the directed graph and add one node per dataset row.
# Only nodes are created here; edges are added in a separate pass afterwards.
G = nx.DiGraph()

# Dataset columns that are copied verbatim onto each node as attributes.
NODE_ATTRIBUTES = (
    'likes',
    'downloads',
    'pipeline_tag',
    'library_name',
    'createdAt',
    'licenses',
    'datasets',
    'languages',
)

for _, row in df.iterrows():
    # add_node creates the node (or updates it if the model_id was already
    # seen) and sets all attributes in one call.
    G.add_node(row['model_id'], **{name: row[name] for name in NODE_ATTRIBUTES})

print("Nodes added.")


Nodes added.


In [3]:
# Add edges to the graph: one parent -> child edge for every listed parent
# that is itself a node of G.

# (edge label, dataset column) pairs, in the order in which they are checked.
# The order matters: when a parent appears in several typed columns, the
# scalar 'edge_type' attribute keeps the LAST matching label, while
# 'edge_types' keeps all of them.
TYPED_PARENT_COLUMNS = (
    ('finetune', 'finetune_parent'),
    ('quantized', 'quantized_parent'),
    ('adapter', 'adapter_parent'),
    ('merge', 'merge_parent'),
)


def _parse_parent_list(cell):
    """Return the parent model ids stored in one dataset cell.

    Missing cells (NaN/None) yield an empty list. Non-missing cells are
    expected to be strings containing a Python list literal.

    NOTE(security): this used to call ``eval`` on the raw cell, which would
    execute arbitrary code embedded in the CSV. ``ast.literal_eval`` only
    accepts literals and raises ``ValueError``/``SyntaxError`` on anything else.
    """
    return ast.literal_eval(cell) if pd.notna(cell) else []


for _, row in df.iterrows():
    model_id = row['model_id']

    # Parents from the generic column carry no type information of their own.
    untyped_parents = _parse_parent_list(row['parent_model'])
    typed_parents = {
        label: _parse_parent_list(row[column])
        for label, column in TYPED_PARENT_COLUMNS
    }

    candidates = list(untyped_parents)
    for parents in typed_parents.values():
        candidates.extend(parents)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # edge insertion order is reproducible across kernel restarts (iterating a
    # set of strings is not).
    for parent_model in dict.fromkeys(candidates):
        # Some models list parents that are not in the graph. Skip these as we do not have data on them.
        if parent_model not in G:
            continue

        edge_types = [
            label
            for label, parents in typed_parents.items()
            if parent_model in parents
        ]
        # 'edge_type' is None for parents that only appear in 'parent_model'.
        G.add_edge(
            parent_model,
            model_id,
            edge_types=edge_types,
            edge_type=edge_types[-1] if edge_types else None,
        )

print("Edges added.")

print(G.number_of_nodes())
print(G.number_of_edges())

Edges added.
1860411
573653


In [4]:
import datetime

# Layout of the 'createdAt' strings stored on the nodes.
created_at_format = '%Y-%m-%dT%H:%M:%S.%fZ'

# Annotate every parent -> child edge with the differences between the two
# endpoints: absolute and relative change in likes and downloads, and the
# whole number of days between their creation timestamps.
for parent_model, model_id in G.edges():
    parent_attrs = G.nodes[parent_model]
    child_attrs = G.nodes[model_id]
    edge_attrs = G.edges[parent_model, model_id]

    likes_delta = child_attrs['likes'] - parent_attrs['likes']
    edge_attrs['change_in_likes'] = likes_delta
    # The relative change is undefined when the parent has zero likes.
    edge_attrs['percentage_change_in_likes'] = (
        likes_delta / parent_attrs['likes'] if parent_attrs['likes'] != 0 else np.nan
    )

    downloads_delta = child_attrs['downloads'] - parent_attrs['downloads']
    edge_attrs['change_in_downloads'] = downloads_delta
    # Same convention for a parent with zero downloads.
    edge_attrs['percentage_change_in_downloads'] = (
        downloads_delta / parent_attrs['downloads'] if parent_attrs['downloads'] != 0 else np.nan
    )

    child_created = datetime.datetime.strptime(child_attrs['createdAt'], created_at_format)
    parent_created = datetime.datetime.strptime(parent_attrs['createdAt'], created_at_format)
    # Negative when the child was created before its listed parent.
    edge_attrs['change_in_createdAt_days'] = (child_created - parent_created).days
            

In [5]:
# Report the size of the full graph.
print("Number of nodes in G:", G.number_of_nodes())
print("Number of edges in G:", G.number_of_edges())


Number of nodes in G: 1860411
Number of edges in G: 573653


In [6]:
# Serialize the full graph (nodes, edges and all their attributes) to disk.
import pickle

with open('data/ai_ecosystem_graph.pkl', 'wb') as graph_file:
    pickle.dump(G, graph_file)

## Fine-tuning Tree Graph 

In [7]:
# Build and save the fine-tuning graph: a copy of G that keeps every node but
# only the edges whose 'edge_type' attribute is 'finetune'. Every other edge
# (merge, quantized, adapter, or untyped) is removed.
G_finetune = G.copy()

# Collect the edges first, then delete them, so the edge view is not modified
# while it is being iterated.
non_finetune_edges = [
    (parent, child)
    for parent, child, attrs in G_finetune.edges(data=True)
    if attrs['edge_type'] != 'finetune'
]
G_finetune.remove_edges_from(non_finetune_edges)

# Persist the filtered graph.
with open('data/ai_ecosystem_graph_finetune.pkl', 'wb') as graph_file:
    pickle.dump(G_finetune, graph_file)

In [8]:
# Node and edge counts of the fine-tuning graph.
print(G_finetune.number_of_nodes())
print(G_finetune.number_of_edges())

1860411
189391


## No-Merges Tree Graph

In [10]:
# Build and save the no-merges graph: a copy of G with every edge whose
# 'edge_type' attribute is 'merge' removed. All nodes are kept.
G_nomerges = G.copy()

# Collect the edges first, then delete them, so the edge view is not modified
# while it is being iterated.
merge_edges = [
    (parent, child)
    for parent, child, attrs in G_nomerges.edges(data=True)
    if attrs['edge_type'] == 'merge'
]
G_nomerges.remove_edges_from(merge_edges)

# Persist the filtered graph.
with open('data/ai_ecosystem_graph_nomerges.pkl', 'wb') as graph_file:
    pickle.dump(G_nomerges, graph_file)

In [11]:
# Create a new graph with all merges removed, and with fulljson on each node.
#G_nomerges_fulljson = G_nomerges.copy()

# Add the full json to each node
#for node in G_nomerges_fulljson.nodes():
#    G_nomerges_fulljson.nodes[node]['full_json'] = G.nodes[node]

# Save the graph
#with open('data/ai_ecosystem_graph_nomerges_fulljson.pkl', 'wb') as f:
#    pickle.dump(G_nomerges_fulljson, f)