# Get Expanded Dataset

In this notebook, we take the JSON-only dataset of models on HuggingFace (stored in `data/ai_ecosystem_jsons.csv`) and produce an expanded CSV dataset in which the JSON elements are expanded into fields. This gives us the tabular dataset that we post online *without model cards*.

In [1]:
import numpy as np
import pandas as pd
import pycountry

# Extract parent information from tags
def extract_parents(tags):
    """Collect the parent-model references encoded in a model's tags.

    Tags of the form ``base_model:<id>`` (exactly one colon) are treated as
    generic parents. Tags of the form ``base_model:<relation>:<id>`` are grouped
    by relation (finetune, quantized, adapter, merge).

    Args:
        tags: Iterable of tag strings.

    Returns:
        A 5-tuple of lists, each in tag order: (generic parents, finetune
        parents, quantized parents, adapter parents, merge parents).
    """
    generic_prefix = "base_model:"
    # Relation-specific prefixes, listed in the order their buckets are returned.
    relation_prefixes = (
        "base_model:finetune:",
        "base_model:quantized:",
        "base_model:adapter:",
        "base_model:merge:",
    )
    generic = []
    by_relation = {prefix: [] for prefix in relation_prefixes}

    for tag in tags:
        # Exactly one colon means nothing but the id follows "base_model:".
        if tag.startswith(generic_prefix) and tag.count(":") == 1:
            generic.append(tag[len(generic_prefix):])
        # A tag can start with at most one relation prefix, so stop at the first hit.
        for prefix in relation_prefixes:
            if tag.startswith(prefix):
                by_relation[prefix].append(tag[len(prefix):])
                break

    return (generic, *(by_relation[prefix] for prefix in relation_prefixes))

# Build a lookup from every language code pycountry exposes (both the 2-letter
# and the 3-letter form) to that language's full name.
language_dict = {}
for lang in pycountry.languages:
    # The hasattr guard skips whichever code form a language entry does not carry.
    for code_attr in ('alpha_2', 'alpha_3'):
        if hasattr(lang, code_attr):
            language_dict[getattr(lang, code_attr)] = lang.name

# Extract language names from tags
def extract_languages(tags):
    """Return the full language names for the 2-character language-code tags.

    Args:
        tags: Iterable of tags.

    Returns:
        List of names looked up in the module-level ``language_dict``, one per
        matching tag, in tag order (duplicates are kept).
    """
    # Only tags whose string form is exactly two characters long are considered,
    # so the 3-letter codes stored in ``language_dict`` never match here.
    return [
        language_dict[tag]
        for tag in tags
        if len(str(tag)) == 2 and tag in language_dict
    ]

In [2]:
# Read the raw data
raw_df = pd.read_csv("data/ai_ecosystem_jsons.csv")

# Parse each fullJson string into a Python object and flatten the records into a
# dataframe with one column per field. The result holds only the expanded fields;
# the original fullJson strings remain available in raw_df.
# NOTE(review): eval() executes whatever expression the CSV cell contains, so this is
# only safe for a trusted file. If the strings are plain Python literals,
# ast.literal_eval would be a safer replacement — TODO confirm against the data.
processed_df = pd.json_normalize(raw_df['fullJson'].apply(eval))

In [3]:
# We'd like region, base_model, license, arxiv, dataset.
# The region, license, arxiv and dataset tags all have the shape "<prefix><value>",
# so one helper extracts each of them into its own list-valued column.
def extract_tag_values(tags, prefix):
    """Return the values of all tags that start with ``prefix``.

    Only the leading ``prefix`` is stripped (by slicing), so a value that happens
    to contain the prefix text again is left intact.

    Args:
        tags: Iterable of tag strings.
        prefix: Literal prefix to match, including the trailing colon (e.g. 'region:').

    Returns:
        List of tag remainders, in tag order.
    """
    return [tag[len(prefix):] for tag in tags if tag.startswith(prefix)]


for column, prefix in [
    ('regions', 'region:'),
    ('licenses', 'license:'),
    ('arxiv_papers', 'arxiv:'),
    ('datasets', 'dataset:'),
]:
    # Series.apply forwards the extra keyword argument to the helper.
    processed_df[column] = processed_df['tags'].apply(extract_tag_values, prefix=prefix)



In [4]:
# Append parent information to the dataset. extract_parents returns one 5-tuple of
# lists per model; the tuples are expanded into five columns aligned on the index.
processed_df[['parent_model','finetune_parent', 'quantized_parent', 'adapter_parent', 'merge_parent']] = pd.DataFrame(
    processed_df['tags'].apply(extract_parents).tolist(), index=processed_df.index
)

# Add the languages information
processed_df['languages'] = processed_df['tags'].apply(extract_languages)

# Drop the columns "_id" and "modelId" (the former is unneeded, the latter is redundant)
# and rename the column "id" to "model_id".
# The frame is rebound rather than mutated in place, and errors='ignore' tolerates
# columns that are already gone, so re-running this cell no longer raises a KeyError.
processed_df = (
    processed_df
    .drop(columns=['_id', 'modelId'], errors='ignore')
    .rename(columns={'id': 'model_id'})
)



In [5]:
#processed_df.to_csv("data/ai_ecosystem.csv", index=False)

# Adding model cards

Here we add the model cards to the dataset and save a new version with model cards.

In [6]:
# Load the model-card table. header=None makes pandas read the first line as data,
# so the column names are supplied explicitly.
final_dataset_unprocessed = pd.read_csv(
    'data/model_cards_jul21.csv',
    names=['index', 'modelId', 'fullJson', 'modelCard', 'ratelimit_retries', 'exception_raised'],
    header=None,
)


In [7]:
# Left-join the card columns onto the expanded dataset, matching processed_df's
# 'model_id' against the card table's 'modelId'. Being a left join, every row of
# processed_df is retained whether or not a card row matches.
final_dataset_processed = pd.merge(
    processed_df,
    final_dataset_unprocessed[['modelId', 'modelCard','ratelimit_retries','exception_raised']],
    how='left',
    left_on='model_id',
    right_on='modelId',
)

# Rows whose 'modelId' is null after the join found no match in the card table;
# record that fact in 'exception_raised'.
final_dataset_processed.loc[
    final_dataset_processed['modelId'].isnull(), 'exception_raised'
] = 'Model is missing in the final dataset after card scraping procedure. This may mean the model card is missing in our data but possibly exists.'

# 'modelId' was only needed as the join key and match marker; 'model_id' remains.
final_dataset_processed.drop('modelId', axis=1, inplace=True)

# Show all columns whenever a dataframe is displayed.
pd.options.display.max_columns = None


In [8]:
# Preview the merged dataset; the rendered output elides the middle rows.
final_dataset_processed

Unnamed: 0,model_id,likes,trendingScore,private,downloads,tags,pipeline_tag,library_name,createdAt,regions,licenses,arxiv_papers,datasets,parent_model,finetune_parent,quantized_parent,adapter_parent,merge_parent,languages,modelCard,ratelimit_retries,exception_raised
0,moonshotai/Kimi-K2-Instruct,479,479.0,False,13356,"[transformers, safetensors, kimi_k2, text-gene...",text-generation,transformers,2025-07-11T00:55:12.000Z,[us],[other],[],[],[],[],[],[],[],[],---\nlicense: other\nlicense_name: modified-mi...,0.0,
1,THUDM/GLM-4.1V-9B-Thinking,569,367.0,False,33839,"[transformers, safetensors, glm4v, image-text-...",image-text-to-text,transformers,2025-06-28T14:24:10.000Z,[us],[mit],[2507.01006],[],[THUDM/GLM-4-9B-0414],[THUDM/GLM-4-9B-0414],[],[],[],"[English, Chinese]",---\r\nlicense: mit\r\nlanguage:\r\n- en\r\n- ...,0.0,
2,HuggingFaceTB/SmolLM3-3B,351,351.0,False,21863,"[transformers, safetensors, smollm3, text-gene...",text-generation,transformers,2025-07-08T10:11:45.000Z,[us],[apache-2.0],[],[],[],[],[],[],[],"[English, French, Spanish, Italian, Portuguese...",---\nlibrary_name: transformers\nlicense: apac...,0.0,
3,black-forest-labs/FLUX.1-Kontext-dev,1568,247.0,False,230863,"[diffusers, safetensors, image-generation, flu...",image-to-image,diffusers,2025-05-28T22:23:43.000Z,[us],[other],[2506.15742],[],[],[],[],[],[],[English],---\nlanguage:\n- en\nlicense: other\nlicense_...,0.0,
4,mistralai/Devstral-Small-2507,155,155.0,False,5090,"[vllm, safetensors, mistral, text2text-generat...",text-generation,vllm,2025-07-04T14:23:44.000Z,[us],[apache-2.0],[],[],[mistralai/Mistral-Small-3.1-24B-Instruct-2503],[mistralai/Mistral-Small-3.1-24B-Instruct-2503],[],[],[],"[English, French, German, Spanish, Portuguese,...",---\nlanguage:\n- en\n- fr\n- de\n- es\n- pt\n...,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860406,Amal17/NusaBERT-concate-BiGRU-NusaParagraph-emot,0,0.0,False,0,"[license:apache-2.0, region:us]",,,2025-07-12T17:13:42.000Z,[us],[apache-2.0],[],[],[],[],[],[],[],[],---\r\nlicense: apache-2.0\r\n---\r\n,0.0,
1860407,jackrvn/bidirectional-dialect-translator,0,0.0,False,0,"[transformers, safetensors, t5, text2text-gene...",text-generation,transformers,2025-07-12T17:13:59.000Z,[us],[],[1910.09700],[],[],[],[],[],[],[],---\nlibrary_name: transformers\ntags: []\n---...,0.0,
1860408,Amal17/NusaBERT-concate-BiGRU-NusaParagraph-topic,0,0.0,False,0,"[license:apache-2.0, region:us]",,,2025-07-12T17:14:00.000Z,[us],[apache-2.0],[],[],[],[],[],[],[],[],---\r\nlicense: apache-2.0\r\n---\r\n,0.0,
1860409,ond-ai/ond-agent-1.3-8b-ckpt-1,0,0.0,False,0,[region:us],,,2025-07-12T17:14:07.000Z,[us],[],[],[],[],[],[],[],[],[],---\ntags:\n- text-generation\n---\n,0.0,


In [9]:
# Save the merged dataset (including the model cards) to CSV.
# NOTE(review): index=False is not passed, so to_csv also writes the row index as an
# unnamed first column — confirm that downstream readers expect it.
final_dataset_processed.to_csv('data/ai_ecosystem_withmodelcards.csv')


In [10]:
# Missing model cards are NaN, and astype(str) alone would turn them into the literal
# string 'nan' (3 characters, 1 word). Blank them first so a missing card counts as
# 0 characters and 0 words.
card_text = final_dataset_processed['modelCard'].fillna('').astype(str)

# Define a new field, 'model_card_length_characters', that is the length of the model card in number of characters
final_dataset_processed['model_card_length_characters'] = card_text.str.len()

# Define a new field, 'model_card_length_words', that is the number of whitespace-separated words in the model card
final_dataset_processed['model_card_length_words'] = card_text.apply(lambda text: len(text.split()))


In [11]:
def _is_auto_generated(card_text):
    """Return True if the card text mentions automatic generation.

    The check is case-insensitive and looks for either 'automatically generated'
    or 'generated automatically' anywhere in the text.
    """
    lowered = card_text.lower()
    return any(
        phrase in lowered
        for phrase in ('automatically generated', 'generated automatically')
    )


# Flag the model cards that contain one of the two phrases. The column is cast to
# str first, so every value (including non-string ones) is checked as text.
final_dataset_processed['model_card_automatically_generated'] = (
    final_dataset_processed['modelCard'].astype(str).map(_is_auto_generated)
)

final_dataset_processed

Unnamed: 0,model_id,likes,trendingScore,private,downloads,tags,pipeline_tag,library_name,createdAt,regions,licenses,arxiv_papers,datasets,parent_model,finetune_parent,quantized_parent,adapter_parent,merge_parent,languages,modelCard,ratelimit_retries,exception_raised,model_card_length_characters,model_card_length_words,model_card_automatically_generated
0,moonshotai/Kimi-K2-Instruct,479,479.0,False,13356,"[transformers, safetensors, kimi_k2, text-gene...",text-generation,transformers,2025-07-11T00:55:12.000Z,[us],[other],[],[],[],[],[],[],[],[],---\nlicense: other\nlicense_name: modified-mi...,0.0,,25090,2120,False
1,THUDM/GLM-4.1V-9B-Thinking,569,367.0,False,33839,"[transformers, safetensors, glm4v, image-text-...",image-text-to-text,transformers,2025-06-28T14:24:10.000Z,[us],[mit],[2507.01006],[],[THUDM/GLM-4-9B-0414],[THUDM/GLM-4-9B-0414],[],[],[],"[English, Chinese]",---\r\nlicense: mit\r\nlanguage:\r\n- en\r\n- ...,0.0,,4721,422,False
2,HuggingFaceTB/SmolLM3-3B,351,351.0,False,21863,"[transformers, safetensors, smollm3, text-gene...",text-generation,transformers,2025-07-08T10:11:45.000Z,[us],[apache-2.0],[],[],[],[],[],[],[],"[English, French, Spanish, Italian, Portuguese...",---\nlibrary_name: transformers\nlicense: apac...,0.0,,15929,2282,False
3,black-forest-labs/FLUX.1-Kontext-dev,1568,247.0,False,230863,"[diffusers, safetensors, image-generation, flu...",image-to-image,diffusers,2025-05-28T22:23:43.000Z,[us],[other],[2506.15742],[],[],[],[],[],[],[English],---\nlanguage:\n- en\nlicense: other\nlicense_...,0.0,,9621,1158,False
4,mistralai/Devstral-Small-2507,155,155.0,False,5090,"[vllm, safetensors, mistral, text2text-generat...",text-generation,vllm,2025-07-04T14:23:44.000Z,[us],[apache-2.0],[],[],[mistralai/Mistral-Small-3.1-24B-Instruct-2503],[mistralai/Mistral-Small-3.1-24B-Instruct-2503],[],[],[],"[English, French, German, Spanish, Portuguese,...",---\nlanguage:\n- en\n- fr\n- de\n- es\n- pt\n...,0.0,,18761,2110,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860406,Amal17/NusaBERT-concate-BiGRU-NusaParagraph-emot,0,0.0,False,0,"[license:apache-2.0, region:us]",,,2025-07-12T17:13:42.000Z,[us],[apache-2.0],[],[],[],[],[],[],[],[],---\r\nlicense: apache-2.0\r\n---\r\n,0.0,,31,4,False
1860407,jackrvn/bidirectional-dialect-translator,0,0.0,False,0,"[transformers, safetensors, t5, text2text-gene...",text-generation,transformers,2025-07-12T17:13:59.000Z,[us],[],[1910.09700],[],[],[],[],[],[],[],---\nlibrary_name: transformers\ntags: []\n---...,0.0,,5171,741,True
1860408,Amal17/NusaBERT-concate-BiGRU-NusaParagraph-topic,0,0.0,False,0,"[license:apache-2.0, region:us]",,,2025-07-12T17:14:00.000Z,[us],[apache-2.0],[],[],[],[],[],[],[],[],---\r\nlicense: apache-2.0\r\n---\r\n,0.0,,31,4,False
1860409,ond-ai/ond-agent-1.3-8b-ckpt-1,0,0.0,False,0,[region:us],,,2025-07-12T17:14:07.000Z,[us],[],[],[],[],[],[],[],[],[],---\ntags:\n- text-generation\n---\n,0.0,,32,5,False


In [12]:
# Save the dataset again, now including the card-length and auto-generated-flag columns.
# NOTE(review): the row index is written too (no index=False) — confirm this is intended.
final_dataset_processed.to_csv('data/ai_ecosystem_withmodelcards_withcardinfo.csv')
