In [1]:
import json
import numpy as np
import pandas as pd
import torch
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from transformers import CLIPModel, CLIPTokenizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, hamming_loss
)
from imblearn.under_sampling import RandomUnderSampler
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.multiclass import type_of_target
from imblearn.pipeline import Pipeline as ImbPipeline

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Pretrained encoders shared (as module-level globals) by the embedding helpers:
# `model_bert` is read by compute_sentence_embeddings, while `model_clip` and
# `tokenizer` are read together by compute_clip_embeddings.
model_bert = SentenceTransformer('all-MiniLM-L6-v2')
# The CLIP model and its tokenizer are loaded from the same checkpoint name.
model_clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")



In [3]:
def compute_tfidf(text_list):
    """Turn a corpus of documents into a dense TF-IDF feature matrix.

    A fresh ``TfidfVectorizer`` (English stop words removed, vocabulary capped
    at 3000 terms, sublinear term-frequency scaling) is fitted on ``text_list``
    and the resulting matrix is densified with ``toarray()`` before returning.

    NOTE(review): the vectorizer is fitted on every document passed in, so when
    the returned matrix is later split for cross-validation the vocabulary and
    IDF weights have already seen the held-out folds.
    """
    tfidf = TfidfVectorizer(
        sublinear_tf=True,
        max_features=3000,
        stop_words='english',
    )
    return tfidf.fit_transform(text_list).toarray()

In [4]:
def compute_sentence_embeddings(text_list, batch_size=256):
    """Encode texts with the global ``model_bert`` and stack the results row-wise.

    Every text is whitespace-stripped, the cleaned list is cut into consecutive
    chunks of at most ``batch_size`` items, each chunk is handed to
    ``model_bert.encode`` and the per-chunk outputs are joined with ``np.vstack``.
    """
    cleaned = [entry.strip() for entry in text_list]
    starts = range(0, len(cleaned), batch_size)
    # One encode() call per chunk, in input order.
    chunk_vectors = [
        model_bert.encode(cleaned[start:start + batch_size])
        for start in starts
    ]
    return np.vstack(chunk_vectors)

In [5]:
def compute_clip_embeddings(text_list, batch_size=256):
    """Embed texts with the global CLIP ``tokenizer`` / ``model_clip`` pair.

    ``text_list`` is processed in consecutive slices of at most ``batch_size``
    items. Each slice is tokenized (padding and truncation enabled, PyTorch
    tensors requested), passed to ``model_clip.get_text_features`` with
    gradient tracking disabled, converted to a NumPy array, and finally all
    slices are stacked row-wise.
    """
    per_chunk = []
    for offset in range(0, len(text_list), batch_size):
        chunk = text_list[offset:offset + batch_size]
        encoded = tokenizer(chunk, padding=True, truncation=True, return_tensors="pt")
        # Inference only: no autograd graph is needed for feature extraction.
        with torch.no_grad():
            features = model_clip.get_text_features(**encoded)
        per_chunk.append(features.cpu().numpy())
    return np.vstack(per_chunk)

In [6]:
def train_random_forest_with_undersampling(X, y, embedding_name, source_text):
    """Cross-validate a One-vs-Rest Random Forest with random undersampling.

    Runs a 5-fold shuffled ``StratifiedKFold`` (seed 42). In every fold a
    ``OneVsRestClassifier`` is fitted whose per-class estimator is a pipeline of
    ``RandomUnderSampler(sampling_strategy='not minority')`` followed by a
    100-tree ``RandomForestClassifier``. Accuracy and weighted
    precision/recall/F1 are printed per fold together with a classification
    report, and the fold averages are printed at the end.

    Parameters
    ----------
    X : array supporting integer-array row indexing (``X[indices]``)
        Feature matrix, one row per sample.
    y : array-like
        One class label per sample; it is converted with ``np.asarray``.
    embedding_name : str
        Name of the feature representation, used only in printed headers.
    source_text : str
        Name of the text source, used only in printed headers.

    Returns
    -------
    None
        All results are printed.
    """
    # Fold indices are positional; a plain array avoids label-based lookups
    # (pandas Series) and unsupported fancy indexing (Python lists).
    y = np.asarray(y)

    # Stratified K-Fold for cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Create the RandomForest model
    base_rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # One-vs-Rest classifier whose per-class estimator undersamples before fitting.
    # The imblearn Pipeline is required here: the name `make_pipeline` is bound to
    # scikit-learn's version by the later import, and scikit-learn pipelines reject
    # samplers as intermediate steps because they do not implement `transform`.
    clf = OneVsRestClassifier(
        ImbPipeline(steps=[
            ('undersample', RandomUnderSampler(sampling_strategy='not minority', random_state=42)),
            ('rf', base_rf),
        ]),
        verbose=2
    )

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    # Cross-validation loop
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Evaluation metrics; zero_division=0 keeps the default value for
        # undefined scores but silences the per-fold warning.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

        print(f'Fold Results for {embedding_name} (OvR Random Forest) - {source_text}:')
        print(classification_report(y_test, y_pred, zero_division=0))
        print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')
        print('-----------------------------------')

    # Average metrics over all folds
    print(f'Final Cross-Validated Results for {embedding_name} (OvR Random Forest) - {source_text}:')
    print(f'Average Accuracy: {sum(accuracy_scores)/len(accuracy_scores):.4f}')
    print(f'Average Precision: {sum(precision_scores)/len(precision_scores):.4f}')
    print(f'Average Recall: {sum(recall_scores)/len(recall_scores):.4f}')
    print(f'Average F1-Score: {sum(f1_scores)/len(f1_scores):.4f}')

In [7]:
CLASS_ORDER = ["Computer Vision", "Graphs", "Natural Language Processing",
               "Reinforcement Learning", "Sequential", "Audio"]

def _ensure_multilabel_indicator(y, classes=None):
    """Return labels as a multilabel indicator matrix.

    Accepted inputs:
      - a 2D numpy array containing only 0/1 values (returned unchanged),
      - a scipy sparse matrix (returned unchanged, contents are not checked),
      - a list/tuple of label iterables (list, tuple, set, frozenset),
      - any object exposing ``tolist()`` that yields such a list, e.g. a
        pandas Series or a 1-D object array of label lists.

    Parameters
    ----------
    y : see above
    classes : sequence of labels, optional
        Column order handed to ``MultiLabelBinarizer``. ``None`` or an empty
        sequence falls back to ``CLASS_ORDER``.

    Returns
    -------
    The input itself (already-binarized cases) or the array produced by
    ``MultiLabelBinarizer(classes=classes).fit_transform``.

    Raises
    ------
    ValueError
        If a 2D array holds values other than 0/1, or the input matches none
        of the accepted forms (including an empty list).
    """
    # `classes or CLASS_ORDER` would raise on numpy arrays (ambiguous truth
    # value), so test for "missing or empty" explicitly.
    if classes is None or len(classes) == 0:
        classes = CLASS_ORDER

    # Already a 2D numpy array of 0/1?
    if isinstance(y, np.ndarray) and y.ndim == 2:
        if set(np.unique(y)).issubset({0, 1}):
            return y
        raise ValueError("2D y provided but contains values other than {0,1}.")

    # scipy sparse matrices pass through; only a missing scipy is tolerated,
    # any other error is allowed to surface.
    try:
        from scipy import sparse
    except ImportError:
        sparse = None
    if sparse is not None and sparse.issparse(y):
        return y

    # Series and 1-D arrays are unpacked into a plain Python list.
    y_list = y.tolist() if hasattr(y, "tolist") else y

    # List-like multilabel sequences? Only the first entry is inspected.
    if (
        isinstance(y_list, (list, tuple))
        and len(y_list) > 0
        and isinstance(y_list[0], (list, tuple, set, frozenset))
    ):
        mlb = MultiLabelBinarizer(classes=classes)
        return mlb.fit_transform(y_list)

    raise ValueError(
        "y must be a (n_samples, n_classes) 0/1 array/sparse matrix or a Series/list of label-iterables."
    )

In [8]:
def train_random_forest_multilabel(X, y, embedding_name, source_text, classes=CLASS_ORDER, n_splits=5):
    """
    Cross-validate a multi-label One-vs-Rest Random Forest with undersampling.

    ``y`` is used as-is when it is a 2D numpy array holding only 0/1 values;
    anything else is binarized with ``MultiLabelBinarizer(classes=classes)``
    and a conversion message is printed. Folds come from a shuffled
    ``MultilabelStratifiedKFold`` (seed 42, ``n_splits`` folds). Each per-label
    estimator is an imblearn pipeline: ``RandomUnderSampler`` with strategy
    "not minority", then a 100-tree ``RandomForestClassifier``.

    For every fold a classification report plus subset accuracy, Hamming loss,
    micro precision/recall/F1 and macro F1 are printed; the per-metric means
    over all folds are printed at the end. Nothing is returned.
    """
    # --- binarize labels unless they already form a 0/1 indicator matrix ---
    is_indicator = (
        isinstance(y, np.ndarray)
        and y.ndim == 2
        and set(np.unique(y)).issubset({0, 1})
    )
    if not is_indicator:
        y = MultiLabelBinarizer(classes=classes).fit_transform(y)
        print("Converted labels with MultiLabelBinarizer. Shape:", y.shape)

    splitter = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    sampler = RandomUnderSampler(sampling_strategy="not minority", random_state=42)
    # The sampler lives inside the pipeline, so OvR resamples per binary problem.
    clf = OneVsRestClassifier(
        ImbPipeline(steps=[('undersample', sampler), ('rf', forest)]),
        verbose=2,
    )

    # Metric name -> list of per-fold values; insertion order drives the printing.
    history = {
        'Subset Accuracy': [],
        'Hamming Loss': [],
        'Precision (micro)': [],
        'Recall (micro)': [],
        'F1 (micro)': [],
        'F1 (macro)': [],
    }

    for train_index, test_index in splitter.split(X, y):
        X_train = X[train_index]
        X_test = X[test_index]
        y_train = y[train_index]
        y_test = y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        fold_scores = {
            'Subset Accuracy': accuracy_score(y_test, y_pred),  # exact match ratio
            'Hamming Loss': hamming_loss(y_test, y_pred),
            'Precision (micro)': precision_score(y_test, y_pred, average="micro", zero_division=0),
            'Recall (micro)': recall_score(y_test, y_pred, average="micro", zero_division=0),
            'F1 (micro)': f1_score(y_test, y_pred, average="micro", zero_division=0),
            'F1 (macro)': f1_score(y_test, y_pred, average="macro", zero_division=0),
        }
        for metric_name, value in fold_scores.items():
            history[metric_name].append(value)

        print(f'Fold Results for {embedding_name} (OvR RF, Multi-label) - {source_text}:')
        print(classification_report(y_test, y_pred, zero_division=0, target_names=classes))
        print(', '.join(f'{metric_name}: {value:.4f}' for metric_name, value in fold_scores.items()))
        print('-----------------------------------')

    # Means over all folds, in the same metric order as above.
    print(f'Final Cross-Validated Results for {embedding_name} (OvR RF, Multi-label) - {source_text}:')
    for metric_name, values in history.items():
        print(f'Average {metric_name}: {np.mean(values):.4f}')

In [9]:
def evaluate_clustering_metrics(X, y, embedding_name, source_text):
    """Score how well the labels ``y`` separate the samples in ``X``.

    Computes the silhouette score, the Calinski-Harabasz index and the
    Davies-Bouldin index, prints them (4 decimals) under a single header and
    returns them.

    Parameters
    ----------
    X : feature matrix handed to the three scikit-learn scoring functions.
    y : labels handed to the three scoring functions.
        NOTE(review): these scores expect one label per sample; confirm the
        label format before passing a multi-label column of lists.
    embedding_name : str
        Name of the feature representation, used only in the printed header.
    source_text : str
        Name of the text source, used only in the printed header.

    Returns
    -------
    dict
        Maps 'Silhouette Score', 'Calinski-Harabasz Index' and
        'Davies-Bouldin Index' to their values.
    """
    print(f'Clustering metrics for {embedding_name} - {source_text}:')

    metrics = {
        'Silhouette Score': silhouette_score(X, y),
        'Calinski-Harabasz Index': calinski_harabasz_score(X, y),
        'Davies-Bouldin Index': davies_bouldin_score(X, y),
    }

    # One line per metric; the dict keys double as the printed labels.
    for metric, score in metrics.items():
        print(f'{metric}: {score:.4f}')

    return metrics

In [10]:
# Load the main multi-label dataset and pull out the three text columns that
# the experiments below embed (paper title, abstract, raw README content).
print('Load data')
# JSON text is UTF-8; pin the encoding so the read does not depend on the
# platform's default locale encoding.
with open('../data/to_be_published/paper_title_abstract_dataset_multi_label.json', 'r', encoding='utf-8') as f:
    papers_data = json.load(f)

df = pd.DataFrame(papers_data)
print(f'Number of samples: {df.shape[0]}')

titles = df['paper_title'].tolist()
abstracts = df['abstract'].tolist()
readmes = df['github_readme_content'].tolist()

Load data
Number of samples: 9954


In [22]:
# Rebinds `readmes` (previously the raw `github_readme_content` column) to the
# preprocessed README column of the same DataFrame.
# NOTE(review): this cell's execution count (22) is out of sequence with the
# cells around it — confirm the README experiments still see this value after
# a fresh Restart & Run All.
readmes = df['preprocessed_readme_content'].tolist()

In [11]:
# Load the subset of the dataset that carries SOMEF-extracted descriptions.
print('Load data for somef decriptions')
# JSON text is UTF-8; pin the encoding so the read does not depend on the
# platform's default locale encoding.
with open('../data/to_be_published/paper_title_abstract_software_readme_description_dataset_multi_label.json', 'r', encoding='utf-8') as f:
    papers_data_somef = json.load(f)

df_somef = pd.DataFrame(papers_data_somef)
print(f'Number of samples: {df_somef.shape[0]}')

somef = df_somef['somef_descriptions'].tolist()

Load data for somef decriptions
Number of samples: 4050


In [12]:
# Load the subset of the dataset that also carries GitHub repository titles
# and keywords.
print('Load data for github titles and keywords')
# JSON text is UTF-8; pin the encoding so the read does not depend on the
# platform's default locale encoding.
with open('../data/to_be_published/paper_title_abstract_software_readme_description_title_keyword_dataset_multi_label.json', 'r', encoding='utf-8') as f:
    papers_data_complete = json.load(f)

df_complete = pd.DataFrame(papers_data_complete)
print(f'Number of samples: {df_complete.shape[0]}')
github_title = df_complete['github_repo_title'].tolist()
github_keywords = df_complete['github_keywords'].tolist()

Load data for github titles and keywords
Number of samples: 1289


In [13]:
# Label columns, one per dataset variant. Each must be paired only with
# features computed from the same DataFrame so that rows stay aligned:
#   y          <-> titles / abstracts / readmes   (df)
#   y_somef    <-> somef                          (df_somef)
#   y_complete <-> github_title / github_keywords (df_complete)
y = df['main_collection_area']
y_somef = df_somef['main_collection_area']
y_complete = df_complete['main_collection_area']
# num_clusters = len(y.unique())

In [14]:
# Experiment: paper titles -> TF-IDF features -> cross-validated multi-label OvR Random Forest.
tfidf_embeddings = compute_tfidf(titles)
train_random_forest_multilabel(tfidf_embeddings, y, 'TF-IDF', 'Title')
# evaluate_clustering_metrics(tfidf_embeddings, y, 'TF-IDF', 'Title')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for TF-IDF (OvR RF, Multi-label) - Title:
                             precision    recall  f1-score   support

            Computer Vision       0.91      0.76      0.83      1657
                     Graphs       0.60      0.66      0.63       501
Natural Language Processing       0.85      0.76      0.80      1292
     Reinforcement Learning       0.48      0.83      0.61       268
                 Sequential       0.48      0.74      0.59       412
                      Audio       0.34      0.85      0.48        66

                  micro avg       0.72      0.75      0.74      4196
                  macro avg       0.61      0.77      0.66      4196
               weighted avg       0.78      0.75      0.75      4196
                samples avg       0.75      0.75      0.74      4196

Subset Accuracy: 0.4047, Hamming Loss: 0.1829, Precision (micro): 0.7246, Recall (micro): 0.7519, F1 (micro): 0.7380, F1 (m

In [15]:
# Experiment: paper titles -> sentence-transformer embeddings -> cross-validated multi-label OvR Random Forest.
sentence_embeddings = compute_sentence_embeddings(titles)
train_random_forest_multilabel(sentence_embeddings, y, 'Sentence Transformer', 'Title')
# evaluate_clustering_metrics(sentence_embeddings, y, 'Sentence Transformer', 'Title')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for Sentence Transformer (OvR RF, Multi-label) - Title:
                             precision    recall  f1-score   support

            Computer Vision       0.92      0.75      0.83      1657
                     Graphs       0.58      0.59      0.58       501
Natural Language Processing       0.82      0.77      0.79      1292
     Reinforcement Learning       0.64      0.81      0.71       268
                 Sequential       0.45      0.74      0.56       412
                      Audio       0.36      0.83      0.50        66

                  micro avg       0.73      0.74      0.74      4196
                  macro avg       0.63      0.75      0.66      4196
               weighted avg       0.78      0.74      0.75      4196
                samples avg       0.76      0.74      0.73      4196

Subset Accuracy: 0.3998, Hamming Loss: 0.1811, Precision (micro): 0.7339, Recall (micro): 0.7395, F1 (micro):

In [17]:
# Experiment: paper titles -> CLIP text features -> cross-validated multi-label OvR Random Forest.
clip_embeddings = compute_clip_embeddings(titles)
train_random_forest_multilabel(clip_embeddings, y, 'CLIP', 'Title')
# evaluate_clustering_metrics(clip_embeddings, y, 'CLIP', 'Title')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for CLIP (OvR RF, Multi-label) - Title:
                             precision    recall  f1-score   support

            Computer Vision       0.92      0.71      0.80      1657
                     Graphs       0.51      0.59      0.55       501
Natural Language Processing       0.80      0.74      0.77      1292
     Reinforcement Learning       0.48      0.76      0.59       268
                 Sequential       0.40      0.69      0.51       412
                      Audio       0.26      0.80      0.40        66

                  micro avg       0.68      0.71      0.69      4196
                  macro avg       0.56      0.72      0.60      4196
               weighted avg       0.74      0.71      0.71      4196
                samples avg       0.71      0.71      0.69      4196

Subset Accuracy: 0.3332, Hamming Loss: 0.2156, Precision (micro): 0.6769, Recall (micro): 0.7095, F1 (micro): 0.6928, F1 (mac

In [18]:
# Experiment: abstracts -> TF-IDF features -> cross-validated multi-label OvR Random Forest.
# Note: `tfidf_embeddings` is rebound here, replacing the value from the previous TF-IDF cell.
tfidf_embeddings = compute_tfidf(abstracts)
train_random_forest_multilabel(tfidf_embeddings, y, 'TF-IDF', 'Abstract')
# evaluate_clustering_metrics(tfidf_embeddings, y, 'TF-IDF', 'Abstract')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for TF-IDF (OvR RF, Multi-label) - Abstract:
                             precision    recall  f1-score   support

            Computer Vision       0.96      0.85      0.90      1657
                     Graphs       0.78      0.86      0.82       501
Natural Language Processing       0.92      0.93      0.92      1292
     Reinforcement Learning       0.78      0.88      0.83       268
                 Sequential       0.72      0.90      0.80       412
                      Audio       0.46      0.86      0.60        66

                  micro avg       0.87      0.88      0.87      4196
                  macro avg       0.77      0.88      0.81      4196
               weighted avg       0.88      0.88      0.88      4196
                samples avg       0.88      0.88      0.87      4196

Subset Accuracy: 0.6301, Hamming Loss: 0.0879, Precision (micro): 0.8655, Recall (micro): 0.8804, F1 (micro): 0.8729, F1

In [19]:
# Experiment: abstracts -> sentence-transformer embeddings -> cross-validated multi-label OvR Random Forest.
sentence_embeddings = compute_sentence_embeddings(abstracts)
train_random_forest_multilabel(sentence_embeddings, y, 'Sentence Transformer', 'Abstract')
# evaluate_clustering_metrics(sentence_embeddings, y, 'Sentence Transformer', 'Abstract')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for Sentence Transformer (OvR RF, Multi-label) - Abstract:
                             precision    recall  f1-score   support

            Computer Vision       0.93      0.77      0.84      1657
                     Graphs       0.71      0.70      0.70       501
Natural Language Processing       0.85      0.80      0.83      1292
     Reinforcement Learning       0.69      0.85      0.76       268
                 Sequential       0.55      0.81      0.65       412
                      Audio       0.44      0.83      0.58        66

                  micro avg       0.79      0.78      0.79      4196
                  macro avg       0.69      0.80      0.73      4196
               weighted avg       0.82      0.78      0.79      4196
                samples avg       0.81      0.79      0.78      4196

Subset Accuracy: 0.4855, Hamming Loss: 0.1457, Precision (micro): 0.7896, Recall (micro): 0.7836, F1 (micr

In [20]:
# Experiment: abstracts -> CLIP text features -> cross-validated multi-label OvR Random Forest.
# The tokenizer call inside compute_clip_embeddings uses truncation=True, so long inputs are cut.
clip_embeddings = compute_clip_embeddings(abstracts)
train_random_forest_multilabel(clip_embeddings, y, 'CLIP', 'Abstract')
# evaluate_clustering_metrics(clip_embeddings, y, 'CLIP', 'Abstract')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for CLIP (OvR RF, Multi-label) - Abstract:
                             precision    recall  f1-score   support

            Computer Vision       0.91      0.70      0.80      1657
                     Graphs       0.44      0.59      0.50       501
Natural Language Processing       0.80      0.75      0.77      1292
     Reinforcement Learning       0.49      0.77      0.60       268
                 Sequential       0.41      0.70      0.52       412
                      Audio       0.26      0.80      0.40        66

                  micro avg       0.67      0.71      0.69      4196
                  macro avg       0.55      0.72      0.60      4196
               weighted avg       0.74      0.71      0.71      4196
                samples avg       0.69      0.71      0.68      4196

Subset Accuracy: 0.2989, Hamming Loss: 0.2214, Precision (micro): 0.6664, Recall (micro): 0.7085, F1 (micro): 0.6868, F1 (

In [23]:
# Experiment: README text -> TF-IDF features -> cross-validated multi-label OvR Random Forest.
# NOTE(review): `readmes` is assigned twice earlier (raw and preprocessed column);
# going by the execution counts the preprocessed version is in effect here — confirm.
tfidf_embeddings = compute_tfidf(readmes)
train_random_forest_multilabel(tfidf_embeddings, y, 'TF-IDF', 'GitHub README Content')
# evaluate_clustering_metrics(tfidf_embeddings, y, 'TF-IDF', 'GitHub README Content')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for TF-IDF (OvR RF, Multi-label) - GitHub README Content:
                             precision    recall  f1-score   support

            Computer Vision       0.91      0.37      0.52      1657
                     Graphs       0.32      0.86      0.46       501
Natural Language Processing       0.85      0.42      0.56      1292
     Reinforcement Learning       0.20      0.93      0.32       268
                 Sequential       0.26      0.86      0.40       412
                      Audio       0.36      0.36      0.36        66

                  micro avg       0.41      0.53      0.46      4196
                  macro avg       0.48      0.63      0.44      4196
               weighted avg       0.70      0.53      0.50      4196
                samples avg       0.47      0.53      0.48      4196

Subset Accuracy: 0.1886, Hamming Loss: 0.4198, Precision (micro): 0.4117, Recall (micro): 0.5253, F1 (micro

In [24]:
# Experiment: README text -> sentence-transformer embeddings -> cross-validated multi-label OvR Random Forest.
sentence_embeddings = compute_sentence_embeddings(readmes)
train_random_forest_multilabel(sentence_embeddings, y, 'Sentence Transformer', 'GitHub README Content')
# evaluate_clustering_metrics(sentence_embeddings, y, 'Sentence Transformer', 'GitHub README Content')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for Sentence Transformer (OvR RF, Multi-label) - GitHub README Content:
                             precision    recall  f1-score   support

            Computer Vision       0.90      0.36      0.52      1657
                     Graphs       0.30      0.81      0.43       501
Natural Language Processing       0.80      0.39      0.53      1292
     Reinforcement Learning       0.20      0.93      0.33       268
                 Sequential       0.24      0.81      0.37       412
                      Audio       0.25      0.35      0.29        66

                  micro avg       0.39      0.51      0.44      4196
                  macro avg       0.45      0.61      0.41      4196
               weighted avg       0.68      0.51      0.48      4196
                samples avg       0.44      0.51      0.46      4196

Subset Accuracy: 0.1475, Hamming Loss: 0.4395, Precision (micro): 0.3908, Recall (micro): 0.5

In [25]:
# Experiment: README text -> CLIP text features -> cross-validated multi-label OvR Random Forest.
# The tokenizer call inside compute_clip_embeddings uses truncation=True, so long inputs are cut.
clip_embeddings = compute_clip_embeddings(readmes)
train_random_forest_multilabel(clip_embeddings, y, 'CLIP', 'GitHub README Content')
# evaluate_clustering_metrics(clip_embeddings, y, 'CLIP', 'GitHub README Content')

Converted labels with MultiLabelBinarizer. Shape: (9954, 6)
Fold Results for CLIP (OvR RF, Multi-label) - GitHub README Content:
                             precision    recall  f1-score   support

            Computer Vision       0.88      0.34      0.49      1657
                     Graphs       0.29      0.80      0.43       501
Natural Language Processing       0.77      0.36      0.49      1292
     Reinforcement Learning       0.18      0.90      0.31       268
                 Sequential       0.23      0.80      0.36       412
                      Audio       0.10      0.35      0.16        66

                  micro avg       0.36      0.48      0.42      4196
                  macro avg       0.41      0.59      0.37      4196
               weighted avg       0.66      0.48      0.46      4196
                samples avg       0.40      0.48      0.43      4196

Subset Accuracy: 0.0921, Hamming Loss: 0.4667, Precision (micro): 0.3640, Recall (micro): 0.4845, F1 (micro):

In [26]:
# Experiment: SOMEF descriptions -> TF-IDF features -> cross-validated multi-label OvR Random Forest.
# Uses the labels of the SOMEF subset (`y_somef`), which come from the same DataFrame as `somef`.
tfidf_embeddings = compute_tfidf(somef)
train_random_forest_multilabel(tfidf_embeddings, y_somef, 'TF-IDF', 'SOMEF descriptions')
# evaluate_clustering_metrics(tfidf_embeddings, y_somef, 'TF-IDF', 'SOMEF descriptions')

Converted labels with MultiLabelBinarizer. Shape: (4050, 6)
Fold Results for TF-IDF (OvR RF, Multi-label) - SOMEF descriptions:
                             precision    recall  f1-score   support

            Computer Vision       0.90      0.71      0.80       661
                     Graphs       0.59      0.69      0.63       203
Natural Language Processing       0.89      0.71      0.79       541
     Reinforcement Learning       0.52      0.61      0.56        92
                 Sequential       0.38      0.79      0.51       158
                      Audio       0.20      0.75      0.32        28

                  micro avg       0.69      0.71      0.70      1683
                  macro avg       0.58      0.71      0.60      1683
               weighted avg       0.78      0.71      0.73      1683
                samples avg       0.72      0.71      0.70      1683

Subset Accuracy: 0.3407, Hamming Loss: 0.2101, Precision (micro): 0.6911, Recall (micro): 0.7112, F1 (micro): 

In [27]:
# Experiment: SOMEF descriptions -> sentence-transformer embeddings -> cross-validated multi-label OvR Random Forest.
sentence_embeddings = compute_sentence_embeddings(somef)
train_random_forest_multilabel(sentence_embeddings, y_somef, 'Sentence Transformer', 'SOMEF descriptions')
# evaluate_clustering_metrics(sentence_embeddings, y_somef, 'Sentence Transformer', 'SOMEF descriptions')

Converted labels with MultiLabelBinarizer. Shape: (4050, 6)
Fold Results for Sentence Transformer (OvR RF, Multi-label) - SOMEF descriptions:
                             precision    recall  f1-score   support

            Computer Vision       0.92      0.69      0.79       661
                     Graphs       0.56      0.66      0.60       203
Natural Language Processing       0.83      0.72      0.77       541
     Reinforcement Learning       0.48      0.71      0.57        92
                 Sequential       0.38      0.77      0.51       158
                      Audio       0.23      0.89      0.37        28

                  micro avg       0.68      0.71      0.69      1683
                  macro avg       0.57      0.74      0.60      1683
               weighted avg       0.76      0.71      0.72      1683
                samples avg       0.70      0.72      0.69      1683

Subset Accuracy: 0.3494, Hamming Loss: 0.2185, Precision (micro): 0.6751, Recall (micro): 0.7112

In [28]:
# Experiment: SOMEF descriptions -> CLIP text features -> cross-validated multi-label OvR Random Forest.
clip_embeddings = compute_clip_embeddings(somef)
train_random_forest_multilabel(clip_embeddings, y_somef, 'CLIP', 'SOMEF descriptions')
# evaluate_clustering_metrics(clip_embeddings, y_somef, 'CLIP', 'SOMEF descriptions')

Converted labels with MultiLabelBinarizer. Shape: (4050, 6)
Fold Results for CLIP (OvR RF, Multi-label) - SOMEF descriptions:
                             precision    recall  f1-score   support

            Computer Vision       0.91      0.67      0.77       661
                     Graphs       0.45      0.62      0.52       203
Natural Language Processing       0.80      0.70      0.75       541
     Reinforcement Learning       0.28      0.67      0.40        92
                 Sequential       0.34      0.76      0.47       158
                      Audio       0.17      0.89      0.29        28

                  micro avg       0.59      0.68      0.63      1683
                  macro avg       0.49      0.72      0.53      1683
               weighted avg       0.72      0.68      0.68      1683
                samples avg       0.63      0.69      0.64      1683

Subset Accuracy: 0.2222, Hamming Loss: 0.2747, Precision (micro): 0.5890, Recall (micro): 0.6839, F1 (micro): 0.

In [29]:
# Experiment: GitHub repository titles -> TF-IDF features -> cross-validated multi-label OvR Random Forest.
# Uses the labels of the complete subset (`y_complete`), which come from the same DataFrame as `github_title`.
tfidf_embeddings = compute_tfidf(github_title)
train_random_forest_multilabel(tfidf_embeddings, y_complete, 'TF-IDF', 'GitHub Title')
# evaluate_clustering_metrics(tfidf_embeddings, y_complete, 'TF-IDF', 'GitHub Title')

Converted labels with MultiLabelBinarizer. Shape: (1289, 6)
Fold Results for TF-IDF (OvR RF, Multi-label) - GitHub Title:
                             precision    recall  f1-score   support

            Computer Vision       0.82      0.94      0.88       209
                     Graphs       0.23      0.90      0.37        58
Natural Language Processing       0.86      0.22      0.35       173
     Reinforcement Learning       0.33      0.30      0.32        30
                 Sequential       0.40      0.11      0.17        55
                      Audio       0.05      1.00      0.09        11

                  micro avg       0.40      0.58      0.47       536
                  macro avg       0.45      0.58      0.36       536
               weighted avg       0.68      0.58      0.53       536
                samples avg       0.41      0.59      0.48       536

Subset Accuracy: 0.0309, Hamming Loss: 0.4459, Precision (micro): 0.3997, Recall (micro): 0.5840, F1 (micro): 0.4746

In [30]:
# Experiment: GitHub repository titles -> sentence-transformer embeddings -> cross-validated multi-label OvR Random Forest.
sentence_embeddings = compute_sentence_embeddings(github_title)
train_random_forest_multilabel(sentence_embeddings, y_complete, 'Sentence Transformer', 'GitHub Title')
# evaluate_clustering_metrics(sentence_embeddings, y_complete, 'Sentence Transformer', 'GitHub Title')

Converted labels with MultiLabelBinarizer. Shape: (1289, 6)
Fold Results for Sentence Transformer (OvR RF, Multi-label) - GitHub Title:
                             precision    recall  f1-score   support

            Computer Vision       0.86      0.51      0.64       209
                     Graphs       0.30      0.62      0.40        58
Natural Language Processing       0.78      0.65      0.71       173
     Reinforcement Learning       0.23      0.57      0.33        30
                 Sequential       0.28      0.55      0.37        55
                      Audio       0.10      0.82      0.17        11

                  micro avg       0.47      0.58      0.52       536
                  macro avg       0.42      0.62      0.44       536
               weighted avg       0.66      0.58      0.58       536
                samples avg       0.50      0.58      0.51       536

Subset Accuracy: 0.0772, Hamming Loss: 0.3707, Precision (micro): 0.4699, Recall (micro): 0.5821, F1 (

In [31]:
# Experiment: GitHub repository titles -> CLIP text features -> cross-validated multi-label OvR Random Forest.
clip_embeddings = compute_clip_embeddings(github_title)
train_random_forest_multilabel(clip_embeddings, y_complete, 'CLIP', 'GitHub Title')
# evaluate_clustering_metrics(clip_embeddings, y_complete, 'CLIP', 'GitHub Title')

Converted labels with MultiLabelBinarizer. Shape: (1289, 6)
Fold Results for CLIP (OvR RF, Multi-label) - GitHub Title:
                             precision    recall  f1-score   support

            Computer Vision       0.83      0.57      0.67       209
                     Graphs       0.31      0.62      0.41        58
Natural Language Processing       0.79      0.61      0.69       173
     Reinforcement Learning       0.19      0.60      0.29        30
                 Sequential       0.27      0.56      0.36        55
                      Audio       0.09      0.91      0.16        11

                  micro avg       0.44      0.60      0.51       536
                  macro avg       0.41      0.65      0.43       536
               weighted avg       0.65      0.60      0.59       536
                samples avg       0.46      0.59      0.50       536

Subset Accuracy: 0.0618, Hamming Loss: 0.3964, Precision (micro): 0.4444, Recall (micro): 0.5970, F1 (micro): 0.5096, 

In [32]:
# Experiment: GitHub keywords -> TF-IDF features -> cross-validated multi-label OvR Random Forest.
tfidf_embeddings = compute_tfidf(github_keywords)
train_random_forest_multilabel(tfidf_embeddings, y_complete, 'TF-IDF', 'GitHub Keywords')
# evaluate_clustering_metrics(tfidf_embeddings, y_complete, 'TF-IDF', 'GitHub Keywords')

Converted labels with MultiLabelBinarizer. Shape: (1289, 6)
Fold Results for TF-IDF (OvR RF, Multi-label) - GitHub Keywords:
                             precision    recall  f1-score   support

            Computer Vision       0.94      0.56      0.70       209
                     Graphs       0.47      0.76      0.58        58
Natural Language Processing       0.88      0.75      0.81       173
     Reinforcement Learning       0.62      0.70      0.66        30
                 Sequential       0.41      0.75      0.53        55
                      Audio       0.18      0.82      0.30        11

                  micro avg       0.66      0.68      0.67       536
                  macro avg       0.59      0.72      0.60       536
               weighted avg       0.78      0.68      0.70       536
                samples avg       0.68      0.68      0.66       536

Subset Accuracy: 0.2857, Hamming Loss: 0.2310, Precision (micro): 0.6618, Recall (micro): 0.6754, F1 (micro): 0.6

In [33]:
# Experiment: GitHub keywords -> sentence-transformer embeddings -> cross-validated multi-label OvR Random Forest.
sentence_embeddings = compute_sentence_embeddings(github_keywords)
train_random_forest_multilabel(sentence_embeddings, y_complete, 'Sentence Transformer', 'GitHub Keywords')
# evaluate_clustering_metrics(sentence_embeddings, y_complete, 'Sentence Transformer', 'GitHub Keywords')

Converted labels with MultiLabelBinarizer. Shape: (1289, 6)
Fold Results for Sentence Transformer (OvR RF, Multi-label) - GitHub Keywords:
                             precision    recall  f1-score   support

            Computer Vision       0.93      0.68      0.78       209
                     Graphs       0.51      0.76      0.61        58
Natural Language Processing       0.82      0.68      0.74       173
     Reinforcement Learning       0.64      0.70      0.67        30
                 Sequential       0.40      0.76      0.53        55
                      Audio       0.21      0.82      0.33        11

                  micro avg       0.67      0.70      0.68       536
                  macro avg       0.58      0.73      0.61       536
               weighted avg       0.76      0.70      0.71       536
                samples avg       0.70      0.70      0.69       536

Subset Accuracy: 0.3243, Hamming Loss: 0.2239, Precision (micro): 0.6673, Recall (micro): 0.6996, F

In [34]:
# Experiment: GitHub keywords -> CLIP text features -> cross-validated multi-label OvR Random Forest.
clip_embeddings = compute_clip_embeddings(github_keywords)
train_random_forest_multilabel(clip_embeddings, y_complete, 'CLIP', 'GitHub Keywords')
# evaluate_clustering_metrics(clip_embeddings, y_complete, 'CLIP', 'GitHub Keywords')

Converted labels with MultiLabelBinarizer. Shape: (1289, 6)
Fold Results for CLIP (OvR RF, Multi-label) - GitHub Keywords:
                             precision    recall  f1-score   support

            Computer Vision       0.93      0.67      0.78       209
                     Graphs       0.45      0.72      0.55        58
Natural Language Processing       0.83      0.68      0.75       173
     Reinforcement Learning       0.62      0.70      0.66        30
                 Sequential       0.35      0.69      0.46        55
                      Audio       0.21      0.73      0.32        11

                  micro avg       0.64      0.68      0.66       536
                  macro avg       0.56      0.70      0.59       536
               weighted avg       0.75      0.68      0.69       536
                samples avg       0.68      0.69      0.66       536

Subset Accuracy: 0.2471, Hamming Loss: 0.2400, Precision (micro): 0.6432, Recall (micro): 0.6828, F1 (micro): 0.662