In [1]:
from pathlib import Path
base_path = Path('../../../')
data_path = base_path / 'data' / 'annotations' / 'group_mention_categorization'
models_path = base_path / 'models'

In [2]:
# list folder ins models_path
list(models_path.glob("*"))

[PosixPath('../../../models/all-mpnet-base-v2_economic-attributes-classifier'),
 PosixPath('../../../models/all-mpnet-base-v2_noneconomic-attributes-classifier'),
 PosixPath('../../../models/economic__occupation_professionmention_clustering_model'),
 PosixPath('../../../models/modelcard_metadata_template.yml'),
 PosixPath('../../../models/mention_stance_nli'),
 PosixPath('../../../models/social-group-mention-econ-attributes-classifier'),
 PosixPath('../../../models/social-group-mention-nonecon-attributes-classifier'),
 PosixPath('../../../models/modelcards_data.yml'),
 PosixPath('../../../models/social-group-mention-attribute-dimension-classifier-v2'),
 PosixPath('../../../models/temp_upload_all-mpnet-base-v2_noneconomic-attributes-classifier'),
 PosixPath('../../../models/social-group-mention-stance-classifier'),
 PosixPath('../../../models/social-group-mention-attribute-dimension-classifier-v3'),
 PosixPath('../../../models/modelcard_template.md'),
 PosixPath('../../../models/temp_up

In [3]:
import pandas as pd

In [4]:
!pip freeze | grep transformers
!pip freeze | grep setfit

sentence-transformers==5.1.0
transformers==4.57.1
setfit==1.1.2


In [5]:
usage_template = """\

## Usage

You can use the model with the [`setfit` python library](https://github.com/huggingface/setfit) (>=1.1.0):

*Note:* It is recommended to use transformers version >=4.5.5,<=5.0.0 and sentence-transformers version >=4.0.1,<=5.1.0 for compatibility.

### Classification

```python
import torch
from setfit import SetFitModel

model_name = "haukelicht/{model_name}"
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
classifier = SetFitModel.from_pretrained(model_name)
classifier.to(device);

# Example mentions
mentions = ["working class people", "highly-educated professionals", "people without a stable job"]

# Get predictions
with torch.no_grad():
    predictions = classifier.predict(mentions)
print(predictions)

# Map predictions to labels
[
    [
        classifier.id2label[l]
        for l, p in enumerate(pred) if p==1
    ]
    for pred in predictions
]
```

### Mention embedding

```python
import torch
from sentence_transformers import SentenceTransformer

model_name = "haukelicht/{model_name}"
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# Load the sentence transformer component of the pre-trained classifier
model = SentenceTransformer(model_name, device=device)

# Example mentions
mentions = ["working class people", "highly-educated professionals", "people without a stable job"]

# Compute mention embeddings
with torch.no_grad():
    embeddings = model.encode(mentions)
````

"""

In [6]:
import json
fp = data_path / 'attribute_definitions.json'
#data/annotations/group_mention_categorization/attribute_definitions.jsonl
with open(fp, 'r') as f:
    attributes_definitions = json.load(f)

In [7]:
econ_tab = pd.DataFrame([attributes_definitions["economic"]], index=['definition']).T.reset_index(names=['attribute'])
nonecon_tab = pd.DataFrame([attributes_definitions["non-economic"]], index=['definition']).T.reset_index(names=['attribute'])

In [8]:
econ_desc = f"""\
A multi-label classifier for detecting **economic attribute** categories referred to in a social group mention, trained with `setfit` based on the light-weight [`sentence-transformers/all-mpnet-base-v2`](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) sentence embedding model.

The economic attributes classified are:

{econ_tab.to_markdown(index=False)}
"""

In [9]:
nonecon_desc = f"""\
A multi-label classifier for detecting **non-economic attribute** categories referred to in a social group mention, trained with `setfit` based on the light-weight [`sentence-transformers/all-mpnet-base-v2`](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) sentence embedding model.

The non-economic attributes classified are:

{nonecon_tab.to_markdown(index=False)}
"""

In [10]:
default_tags = ["mention-classification", "mpnet-base-v2", "setfit", "multi-label-classification"]

model_dict = {
    "all-mpnet-base-v2_economic-attributes-classifier": {
        "title": "Group mention economic attributes classifier",
        "tags": ["economic-attributes", *default_tags],
        "description": econ_desc,
        "license": "apache-2.0",
    },
    "all-mpnet-base-v2_noneconomic-attributes-classifier":  {
        "title": "Group mention non-economic attributes classifier",
        "tags": ["noneconomic-attributes", *default_tags],
        "description": nonecon_desc,
        "license": "apache-2.0",
    }
}

### Save models as seet fit 

In [11]:
# create temp directories per model
temp_model_dirs = {}
for model_name, model_info in model_dict.items():
    temp_model_dir = models_path / f'temp_upload_{model_name}'
    temp_model_dir.mkdir(exist_ok=True)
    temp_model_dirs[model_name] = temp_model_dir

In [12]:
import torch
from setfit import SetFitModel, SetFitHead

for model_name, model_info in model_dict.items():

    # Use the correct model path for each model
    model_path = models_path / model_name
    model = SetFitModel.from_pretrained(model_path)
    model.to("cpu")

    # Create a new standard SetFitHead with the same configuration
    new_head = SetFitHead(
        model.model_head.in_features,
        model.model_head.out_features,
        device="cpu",
        multitarget=model.model_head.multitarget
    )
    new_head.to("cpu")

    # Get the state dict and ensure all tensors are in normal mode
    original_state_dict = model.model_head.state_dict()
    
    # Use torch.no_grad() to ensure we're not in inference mode when creating new tensors
    with torch.no_grad():
        new_state_dict = {}
        for key, tensor in original_state_dict.items():
            # Create a new tensor with the same data but ensure it's a normal tensor
            new_tensor = torch.tensor(tensor.cpu().numpy(), dtype=tensor.dtype, device="cpu")
            new_state_dict[key] = new_tensor
    
    # Load the cleaned state dict into the new head
    new_head.load_state_dict(new_state_dict)

    # Replace the model head with the new standard SetFitHead
    model.model_head = new_head

    # Ensure everything is properly set to training mode and then eval mode
    model.model_body.train()  # Set to training mode first to ensure normal tensor behavior
    model.model_body.eval()   # Then set to eval mode for inference
    model.model_head.train()   # Then set to eval mode for inference
    model.model_head.eval()   # Then set to eval mode for inference
    model.to("cpu")

    model.save_pretrained(temp_model_dirs[model_name])

In [13]:
SetFitModel.from_pretrained(temp_model_dirs[model_name])  # test load

<setfit.modeling.SetFitModel at 0x7ede29502410>

In [14]:
# import pandas as pd

# def parse_test_results(fp):
#     with open(fp, 'r') as f:
#         res = json.load(f)

#     res_df = pd.DataFrame.from_dict(res, orient='index', columns=['value']).reset_index(names='tmp')
#     res_df[['scheme', 'tmp']] = res_df.tmp.str.replace('test_', '').str.split('-', n=1, expand=True)
#     res_df = res_df[res_df.tmp.notnull()]
#     res_df[['type', 'metric']] = res_df.tmp.str.split('_', expand=True)

#     schemes = ['seqeval', 'softseqeval', 'doclevel']
#     res_df = res_df.query("scheme in @schemes and metric=='f1' and type in @GROUP_TYPES")

#     res_df = res_df.pivot_table(index='type', columns='scheme', values='value', aggfunc='first').reset_index()
#     res_df['type'] = pd.Categorical(res_df.type, categories=GROUP_TYPES, ordered=True)
#     res_df.columns.name = None
#     res_df = res_df.sort_values('type')


#     return res_df[['type']+schemes]

In [15]:
# schemes_dict = {
#     'seqeval': 'seq-eval F1',
#     'softseqeval': 'soft seq-eval  F1',
#     'doclevel': 'sentence level  F1'
# }

# def results_to_metrics_entries(x, schemes):
#     res_df = x.copy()
#     res_df.rename(columns={'type': 'name'}, inplace=True)
#     res_df = res_df.melt(id_vars='name', var_name='type', value_name='value')
#     if isinstance(schemes, dict):
#         res_df = res_df[res_df['type'].isin(schemes.keys())]
#         res_df['type'] = res_df['type'].map(schemes)
#     elif isinstance(schemes, list):
#         res_df = res_df[res_df['type'].isin(schemes)]
#     elif isinstance(schemes, str):
#         res_df = res_df[res_df['type'].isin([schemes])]
#     res_df['name'] = res_df.apply(lambda x: f"{x['name']} ({x['type']})", axis=1)
#     return res_df[['type', 'name', 'value']].to_dict(orient='records')

In [16]:
# results_to_markdown_table = lambda res_df, schemes=schemes_dict: res_df.rename(columns=schemes).to_markdown(index=False, tablefmt='github', floatfmt=".3f", colalign=("right", "center", "center", "center"))

### parse and fill

In [17]:
import yaml
from jinja2 import Template

# Load templates
with open(models_path / "modelcard_metadata_template.yml") as f:
    meta_template = Template(f.read())
with open(models_path / "modelcard_template.md") as f:
    body_template = Template(f.read())

In [18]:
with open(models_path / 'modelcards_data.yml', 'r') as f:
    modelcard_data = yaml.safe_load(f)

models = modelcard_data.pop("finetunes")

In [19]:
import regex
def create_readme(model_name: str, title: str, description: str, tags: list):
    # fp = models_path / model_name
    # res_df = parse_test_results(fp)
    # test_res = results_to_metrics_entries(res_df, schemes='seqeval')
    # results_table = results_to_markdown_table(res_df, schemes=schemes_dict)

    model_data = models[model_name]
    model_data["model_description"] = title
    model_data["model_summary"] = description
    model_data["get_started_code"] = usage_template.format(model_name=model_name)
    model_data["tags"] = tags
    metadata = {
        'model_id': model_name,
        # **{'test_results': test_res},
        **{k: v for k, v in modelcard_data.items() if v is not None},
        **model_data
    }
    metadata = meta_template.render(metadata)
    metadata = regex.sub(r'(\n\h*){3,}', '\n', metadata)
    body_data = {
        'model_id': title,
        **modelcard_data,
        **model_data,
        # **{'results': results_table}
    }
    body = body_template.render(body_data)
    # remove comment lines in body
    body = regex.sub(r'^\s*<!--.*?-->\s*$', '', body, flags=regex.MULTILINE)
    body = regex.sub(r'\n{3,}', '\n\n', body)
    body = regex.sub(r'(\n\h*){3,}', '\n\n', body)
    # remove (sub)sections where there is no content or only "[More Information Needed]"
    body = regex.sub(r'##+\s+.*?\n+(?:\s*\[More Information Needed\]\s*\n*)+', '', body)

    return metadata+'\n\n'+body+'\n'

In [20]:
readmes = {}
for model_name, model_info in model_dict.items():
    readme = create_readme(
        model_name=model_name,
        title=model_info['title'],
        description=model_info['description'],
        tags=model_info['tags']
    )
    with open(temp_model_dirs[model_name] / 'README.md', 'w') as f:
        f.write(readme)
    readmes[model_name] = readme

In [21]:
models_names = list(model_dict.keys())
models_names

['all-mpnet-base-v2_economic-attributes-classifier',
 'all-mpnet-base-v2_noneconomic-attributes-classifier']

In [22]:
from huggingface_hub import login, create_repo, upload_folder

In [23]:
model_name = models_names[0]
model_id = f"haukelicht/{model_name}"
model_path = temp_model_dirs[model_name]

create_repo(
    repo_id=model_id,  # Just the model name if you want it at root
    repo_type="model",
    private=False,  # or True if you want it private
    exist_ok=True   # avoids error if it already exists
)

upload_folder(
    repo_type="model",
    repo_id=model_id,
    folder_path=model_path,
    commit_message="used native setfit head",
    create_pr=False,
)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/haukelicht/all-mpnet-base-v2_economic-attributes-classifier/commit/99bd01e05034ff85fcd5c7f875ca39b93a1e40f5', commit_message='used native setfit head', commit_description='', oid='99bd01e05034ff85fcd5c7f875ca39b93a1e40f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/haukelicht/all-mpnet-base-v2_economic-attributes-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='haukelicht/all-mpnet-base-v2_economic-attributes-classifier'), pr_revision=None, pr_num=None)

In [24]:
model_name = models_names[1]
model_id = f"haukelicht/{model_name}"
model_path = temp_model_dirs[model_name]

create_repo(
    repo_id=model_id,  # Just the model name if you want it at root
    repo_type="model",
    private=False,  # or True if you want it private
    exist_ok=True   # avoids error if it already exists
)

upload_folder(
    repo_type="model",
    repo_id=model_id,
    folder_path=model_path,
    commit_message="used native setfit head",
    create_pr=False,
)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/haukelicht/all-mpnet-base-v2_noneconomic-attributes-classifier/commit/5955e841ee249909e1609bbf8465bd0cce33afff', commit_message='used native setfit head', commit_description='', oid='5955e841ee249909e1609bbf8465bd0cce33afff', pr_url=None, repo_url=RepoUrl('https://huggingface.co/haukelicht/all-mpnet-base-v2_noneconomic-attributes-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='haukelicht/all-mpnet-base-v2_noneconomic-attributes-classifier'), pr_revision=None, pr_num=None)