# Numerical drifts

Here we read in the network just to do a few, quick calculations of numerical drifts and mutations between parents and children. 

For example, we'd like to know: For parent-child pairs that report languages, what is the average percentage change in the number of languages reported? This will provide evidence of language specialization.

We'd also like to know: What is the characteristic change in length of model card between pareret and child? What is the characteristic frequency of auto-generated model cards?

In [None]:
import numpy as np
import pandas as pd
import pickle
import networkx as nx
import ast
from itertools import combinations
import matplotlib.pyplot as plt
from ordering_helpers import get_trait_list, get_trait_counter, get_trait_graph, append_total_appearances, get_trait_ratios, get_oriented_trait_graph, get_top_trait_graph, solve_max_compatible_ordering, solve_weighted_compatible_ordering, get_violating_edges, get_compatible_and_total_traffic
import random
import ast

#with open('data/ai_ecosystem_graph_finetune_modelcards.pkl', 'rb') as f:
#    G = pickle.load(f)

with open('data/ai_ecosystem_graph_modelcards.pkl', 'rb') as f:
    G = pickle.load(f)

In [None]:
# Define has_model_card and number_of_languages
for node in G.nodes():
    G.nodes[node]['has_model_card'] = False
    if G.nodes[node]['model_card_length_characters']>3:
        G.nodes[node]['has_model_card'] = True
    G.nodes[node]['number_of_languages'] = len(get_trait_list(G.nodes[node], 'languages'))



# Get the  

{'likes': 0, 'downloads': 1, 'pipeline_tag': nan, 'library_name': nan, 'createdAt': '2024-09-02T20:04:31.000Z', 'licenses': "['other']", 'datasets': '[]', 'languages': '[]', 'modelCard': '---\nlicense: other\nlicense_name: flux-1-dev-non-commercial-license\nlicense_link: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/LICENSE.md\n---\n', 'model_card_length_characters': 158, 'model_card_length_words': 8, 'model_card_automatically_generated': False}
[]


In [None]:
edge_list = []
parent_has_model_card = []
child_has_model_card = []
parent_has_auto_generated_model_card = []
child_has_auto_generated_model_card = []
parent_bespoke_card_length = []
child_bespoke_card_length = []
parent_has_languages = []
child_has_languages = []

count_missing_languages = 0
counter = 0
for edge in G.edges():
    counter += 1
    if counter % 100000 == 0: 
        print(counter)
    edge_list.append(str(edge))
    parent_has_model_card.append(G.nodes[edge[0]]['has_model_card'])
    child_has_model_card.append(G.nodes[edge[1]]['has_model_card'])
    parent_has_auto_generated_model_card.append(G.nodes[edge[0]]['model_card_automatically_generated'])
    child_has_auto_generated_model_card.append(G.nodes[edge[1]]['model_card_automatically_generated'])
    #parent_bespoke_card_length.append(None)
    #child_bespoke_card_length.append(None)
    if G.nodes[edge[0]]['model_card_automatically_generated']==False and G.nodes[edge[1]]['model_card_automatically_generated']==False and G.nodes[edge[0]]['has_model_card']==True and G.nodes[edge[1]]['has_model_card']==True:
        parent_bespoke_card_length.append(G.nodes[edge[0]]['model_card_length_characters'])
        child_bespoke_card_length.append(G.nodes[edge[1]]['model_card_length_characters'])
    else:
        parent_bespoke_card_length.append(None)
        child_bespoke_card_length.append(None)

    parent_has_languages.append(len(get_trait_list(G.nodes[edge[0]], 'languages'))>0)
    child_has_languages.append(len(get_trait_list(G.nodes[edge[1]], 'languages'))>0)

result_df = pd.DataFrame(columns = ['index'
                                    , 'edge'
                                    , 'parent_has_model_card'
                                    , 'child_has_model_card'
                                    , 'parent_has_auto_generated_model_card'
                                    , 'child_has_auto_generated_model_card'
                                    , 'parent_bespoke_card_length'
                                    , 'child_bespoke_card_length'
                                    , 'parent_has_languages'
                                    , 'child_has_languages'
                                    ])
result_df['index'] = range(len(G.edges()))

# Append the lists to the dataframe
result_df['edge'] = edge_list
result_df['parent_has_model_card'] = parent_has_model_card
result_df['child_has_model_card'] = child_has_model_card
result_df['parent_has_auto_generated_model_card'] = parent_has_auto_generated_model_card
result_df['child_has_auto_generated_model_card'] = child_has_auto_generated_model_card
result_df['parent_bespoke_card_length'] = parent_bespoke_card_length
result_df['child_bespoke_card_length'] = child_bespoke_card_length
result_df['parent_has_languages'] = parent_has_languages
result_df['child_has_languages'] = child_has_languages


100000
200000
300000
400000
500000


In [None]:
result_df

In [28]:
print("Change in probability of having a model card: ")
print(np.mean(result_df['parent_has_model_card'].astype(int))-np.mean(result_df['child_has_model_card'].astype(int)))
print()

print("Change in probability of having an auto-generated model card | parent and child have model card: ")
print(np.mean(result_df[(result_df['parent_has_model_card']==1)&(result_df['child_has_model_card']==1)]['parent_has_auto_generated_model_card'].astype(int))-np.mean(result_df[(result_df['parent_has_model_card']==1)&(result_df['child_has_model_card']==1)]['child_has_auto_generated_model_card'].astype(int)))
print()

#print("Change in probability of having a bespoke model card: ")
#print(np.mean(result_df['parent_bespoke_card_length'].astype(int))-np.mean(result_df['child_bespoke_card_length'].astype(int)))
#print()




Change in probability of having a model card: 
-0.017376358181688145

Change in probability of having an auto-generated model card | parent and child have model card: 
-0.24868054288280356



In [18]:
# Filter out rows where parent_bespoke_card_length or child_bespoke_card_length is None
valid_rows = result_df.dropna(subset=['parent_bespoke_card_length', 'child_bespoke_card_length'])

# Calculate the percentage change
percent_changes = ((valid_rows['child_bespoke_card_length'] - valid_rows['parent_bespoke_card_length']) / valid_rows['parent_bespoke_card_length']) * 100

# Calculate the mean percentage change
mean_percent_change = percent_changes.mean()

print("Mean percentage change from parent to child bespoke card length:", mean_percent_change)


Mean percentage change from parent to child bespoke card length: 93.25761509667045


In [12]:
print("Average percent change in languages: ", np.mean(percent_change_in_languages))
print("Average percent change in model card characters: ", np.mean(percent_change_in_model_card_characters))
print("Average percent change in model card words: ", np.mean(percent_change_in_model_card_words))
print("Average percent change in model card automatically generated: ", np.mean(percent_change_in_model_card_automatically_generated))
print("Average percent change in has model card: ", np.mean(percent_change_in_has_model_card))
print("Count missing languages: ", count_missing_languages)

Average percent change in languages:  -0.03584345502478219
Average percent change in model card characters:  0.9325761509667044
Average percent change in model card words:  0.6478000865822037
Average percent change in model card automatically generated:  0.24868054288280358
Average percent change in has model card:  0.017376358181688232
Count missing languages:  457993


In [19]:
number_of_languages = []

for node in G.nodes():
    number_of_languages.append(len(get_trait_list(G.nodes[node], 'languages')))

# Value counts of number of languages
print(pd.Series(number_of_languages).value_counts())

0      1601473
1       229135
2        18510
8         2579
4         1429
        ...   
137          1
54           1
33           1
100          1
46           1
Name: count, Length: 105, dtype: int64
