In [8]:
import json
import numpy as np
import pandas as pd
import torch
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from transformers import CLIPModel, CLIPTokenizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

In [9]:
# Pretrained encoders used by the embedding helpers in this notebook.
# The CLIP model and its tokenizer have to be loaded from the same checkpoint,
# so the identifier is defined once instead of being repeated in both calls.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

model_bert = SentenceTransformer('all-MiniLM-L6-v2')
model_clip = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
tokenizer = CLIPTokenizer.from_pretrained(CLIP_CHECKPOINT)

In [10]:
def compute_tfidf(text_list, max_features=3000):
    """Turn a collection of documents into a dense TF-IDF feature matrix.

    Args:
        text_list: Iterable of raw document strings.
        max_features: Upper bound on the vocabulary size handed to
            ``TfidfVectorizer``. Defaults to 3000, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        Dense ``numpy.ndarray`` with one row per document and one column per
        retained vocabulary term.

    Note:
        A fresh vectorizer is fitted on every call using all of ``text_list``.
        If the result is split for cross-validation afterwards, the vocabulary
        and IDF weights have already seen the held-out documents.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=max_features,
        sublinear_tf=True,
    )
    # fit_transform yields a SciPy sparse matrix; densify it because callers
    # in this notebook work with plain NumPy arrays.
    return vectorizer.fit_transform(text_list).toarray()

In [11]:
def compute_sentence_embeddings(text_list, batch_size=256):
    """Encode texts with the module-level ``model_bert`` in fixed-size chunks.

    Args:
        text_list: Iterable of strings; each entry is whitespace-stripped
            before encoding (so every entry must provide ``.strip()``).
        batch_size: Number of texts passed to ``model_bert.encode`` per call.

    Returns:
        The per-chunk encoder outputs stacked row-wise with ``np.vstack``.

    Raises:
        ValueError: From ``np.vstack`` when ``text_list`` is empty, because
            there is nothing to stack.
    """
    cleaned = [entry.strip() for entry in text_list]

    # One encoder call per consecutive slice of `batch_size` texts.
    encoded_chunks = [
        model_bert.encode(cleaned[start:start + batch_size])
        for start in range(0, len(cleaned), batch_size)
    ]

    return np.vstack(encoded_chunks)

In [12]:
def compute_clip_embeddings(text_list, batch_size=256):
    """Embed texts with the text tower of the module-level CLIP model.

    Args:
        text_list: List of strings. It is sliced into consecutive batches and
            each slice is handed to the module-level ``tokenizer``.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        ``numpy.ndarray`` holding the ``get_text_features`` output of every
        batch, stacked row-wise.

    Raises:
        ValueError: From ``np.vstack`` when ``text_list`` is empty.

    Note:
        ``truncation=True`` means texts longer than the tokenizer's maximum
        length are cut off rather than rejected.
    """
    # The tokenizer builds its tensors on the CPU, while the model may have
    # been moved to an accelerator. The result is already brought back with
    # `.cpu()`, so the inputs need the matching move in the other direction.
    device = model_clip.device

    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            batch_embeddings = model_clip.get_text_features(**inputs).cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

In [13]:
def train_random_forest_with_undersampling(X, y, embedding_name, source_text):
    """Cross-validate a One-vs-Rest random forest with random undersampling.

    Runs a 5-fold stratified cross-validation. In every fold a
    ``OneVsRestClassifier`` is fitted whose per-class estimator is a pipeline
    of ``RandomUnderSampler(sampling_strategy='not minority')`` followed by a
    100-tree ``RandomForestClassifier``. A classification report and the
    accuracy / weighted precision / weighted recall / weighted F1 are printed
    for each fold, followed by their averages over all folds.

    Args:
        X: Feature matrix that supports row selection with an integer index
            array (e.g. ``numpy.ndarray``).
        y: 1-D array-like of class labels, one per row of ``X``. A pandas
            Series is accepted regardless of its index.
        embedding_name: Name of the feature representation; used only in the
            printed headers.
        source_text: Name of the text field the features were built from; used
            only in the printed headers.

    Returns:
        None. All results are printed.
    """
    # StratifiedKFold yields *positional* indices. `series[int_array]` is a
    # label-based lookup in pandas, which only coincides with positions for a
    # default RangeIndex. Converting to an array makes the lookup positional.
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    base_rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # The undersampler sits inside the pipeline so that it is applied when the
    # pipeline is fitted on the training fold; test folds are only predicted.
    clf = OneVsRestClassifier(
        make_pipeline(RandomUnderSampler(sampling_strategy='not minority', random_state=42), base_rf),
        verbose=2
    )

    # Keys double as the labels of the final summary lines.
    fold_scores = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': []}

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # zero_division=0 keeps the value scikit-learn already substitutes for
        # classes without predicted samples, but without emitting an
        # UndefinedMetricWarning for every affected call.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        for metric_name, value in zip(fold_scores, (accuracy, precision, recall, f1)):
            fold_scores[metric_name].append(value)

        print(f'Fold Results for {embedding_name} (OvR Random Forest) - {source_text}:')
        print(classification_report(y_test, y_pred, zero_division=0))
        print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')
        print('-----------------------------------')

    # Unweighted mean of each metric over the folds.
    print(f'Final Cross-Validated Results for {embedding_name} (OvR Random Forest) - {source_text}:')
    for metric_name, values in fold_scores.items():
        print(f'Average {metric_name}: {sum(values)/len(values):.4f}')

In [14]:
def evaluate_clustering_metrics(X, y, embedding_name, source_text):
    """Report how well the label groups in ``y`` separate in feature space.

    The labels are passed to scikit-learn's clustering indices as if they were
    a cluster assignment of the rows of ``X``. Three scores are computed:
    silhouette, Calinski-Harabasz and Davies-Bouldin.

    Args:
        X: Feature matrix, one row per sample.
        y: Label per row of ``X``, used as the cluster assignment.
        embedding_name: Name of the feature representation; used only in the
            printed header.
        source_text: Name of the text field the features were built from; used
            only in the printed header.

    Returns:
        dict mapping each metric's display name to its value. The dictionary
        used to be built but was never returned; existing callers that ignore
        the return value are unaffected.
    """
    # The header is printed once. The second copy that used to follow carried
    # an "(OvR Random Forest)" tag, although no classifier is involved here.
    print(f'Clustering metrics for {embedding_name} - {source_text}:')

    metrics = {
        'Silhouette Score': silhouette_score(X, y),
        'Calinski-Harabasz Index': calinski_harabasz_score(X, y),
        'Davies-Bouldin Index': davies_bouldin_score(X, y),
    }

    for metric_name, value in metrics.items():
        print(f'{metric_name}: {value:.4f}')

    return metrics

In [15]:
print('Load data')
# Name the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, which is not necessarily the UTF-8 that JSON files use.
with open('../data/published_data/paper_title_abstract_dataset.json', 'r', encoding='utf-8') as f:
    papers_data = json.load(f)

df = pd.DataFrame(papers_data)
print(f'Number of samples: {df.shape[0]}')

# Plain Python lists of the text fields that are embedded further down.
titles = df['paper_title'].tolist()
abstracts = df['abstract'].tolist()
readmes = df['github_readme_content'].tolist()

Load data
Number of samples: 16093


In [16]:
print('Load data for somef decriptions')
# Name the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, which is not necessarily the UTF-8 that JSON files use.
with open('../data/published_data/paper_title_abstract_software_readme_description_dataset.json', 'r', encoding='utf-8') as f:
    papers_data_somef = json.load(f)

df_somef = pd.DataFrame(papers_data_somef)
print(f'Number of samples: {df_somef.shape[0]}')

# Text field embedded further down; its labels are taken from df_somef as well.
somef = df_somef['somef_descriptions'].tolist()

Load data for somef decriptions
Number of samples: 13474


In [17]:
print('Load data for github titles and keywords')
# Name the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, which is not necessarily the UTF-8 that JSON files use.
with open('../data/published_data/paper_title_abstract_software_readme_description_title_keyword_dataset.json', 'r', encoding='utf-8') as f:
    papers_data_complete = json.load(f)

df_complete = pd.DataFrame(papers_data_complete)
print(f'Number of samples: {df_complete.shape[0]}')
# Text fields embedded further down; their labels are taken from df_complete.
github_title = df_complete['github_repo_title'].tolist()
github_keywords = df_complete['github_keywords'].tolist()

Load data for github titles and keywords
Number of samples: 2904


In [18]:
# Class labels for the three dataset variants. Each label series is taken from
# the same frame as the texts it is paired with later on.
y, y_somef, y_complete = (
    frame['main_collection_area'] for frame in (df, df_somef, df_complete)
)
# Number of distinct labels in the full dataset. dropna=False counts a missing
# value as its own entry, exactly like len(y.unique()) does.
num_clusters = y.nunique(dropna=False)

In [19]:
# Paper titles -> TF-IDF features, scored with the 5-fold OvR random forest
# against the labels in `y` (both come from `df`).
tfidf_embeddings = compute_tfidf(titles)
train_random_forest_with_undersampling(tfidf_embeddings, y, 'TF-IDF', 'Title')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(tfidf_embeddings, y, 'TF-IDF', 'Title')

Fold Results for TF-IDF (OvR Random Forest) - Title:
                             precision    recall  f1-score   support

                      Audio       0.07      0.25      0.11         8
            Computer Vision       0.88      0.65      0.75      1929
                     Graphs       0.67      0.65      0.66       575
Natural Language Processing       0.40      0.58      0.48       530
     Reinforcement Learning       0.34      0.55      0.42       148
                 Sequential       0.07      0.48      0.12        29

                   accuracy                           0.64      3219
                  macro avg       0.41      0.53      0.42      3219
               weighted avg       0.73      0.64      0.67      3219

Accuracy: 0.6353, Precision: 0.7329, Recall: 0.6353, F1-Score: 0.6684
-----------------------------------
Fold Results for TF-IDF (OvR Random Forest) - Title:
                             precision    recall  f1-score   support

                      Aud

In [20]:
# Paper titles -> sentence-transformer embeddings, scored with the 5-fold OvR
# random forest against the labels in `y` (both come from `df`).
sentence_embeddings = compute_sentence_embeddings(titles)
train_random_forest_with_undersampling(sentence_embeddings, y, 'Sentence Transformer', 'Title')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(sentence_embeddings, y, 'Sentence Transformer', 'Title')

Fold Results for Sentence Transformer (OvR Random Forest) - Title:
                             precision    recall  f1-score   support

                      Audio       0.07      0.75      0.12         8
            Computer Vision       0.85      0.67      0.75      1929
                     Graphs       0.62      0.58      0.60       575
Natural Language Processing       0.41      0.50      0.45       530
     Reinforcement Learning       0.37      0.64      0.47       148
                 Sequential       0.07      0.45      0.13        29

                   accuracy                           0.62      3219
                  macro avg       0.40      0.60      0.42      3219
               weighted avg       0.71      0.62      0.65      3219

Accuracy: 0.6222, Precision: 0.7072, Recall: 0.6222, F1-Score: 0.6536
-----------------------------------
Fold Results for Sentence Transformer (OvR Random Forest) - Title:
                             precision    recall  f1-score   suppor

In [21]:
# Paper titles -> CLIP text embeddings, scored with the 5-fold OvR random
# forest against the labels in `y` (both come from `df`).
clip_embeddings = compute_clip_embeddings(titles)
train_random_forest_with_undersampling(clip_embeddings, y, 'CLIP', 'Title')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(clip_embeddings, y, 'CLIP', 'Title')

Fold Results for CLIP (OvR Random Forest) - Title:
                             precision    recall  f1-score   support

                      Audio       0.02      0.25      0.04         8
            Computer Vision       0.85      0.62      0.72      1929
                     Graphs       0.60      0.53      0.56       575
Natural Language Processing       0.40      0.45      0.42       530
     Reinforcement Learning       0.27      0.56      0.37       148
                 Sequential       0.04      0.45      0.08        29

                   accuracy                           0.57      3219
                  macro avg       0.36      0.48      0.36      3219
               weighted avg       0.69      0.57      0.62      3219

Accuracy: 0.5732, Precision: 0.6921, Recall: 0.5732, F1-Score: 0.6180
-----------------------------------
Fold Results for CLIP (OvR Random Forest) - Title:
                             precision    recall  f1-score   support

                      Audio  

In [22]:
# Abstracts -> TF-IDF features, scored with the 5-fold OvR random forest
# against `y`. Rebinds `tfidf_embeddings`, which held the title features.
tfidf_embeddings = compute_tfidf(abstracts)
train_random_forest_with_undersampling(tfidf_embeddings, y, 'TF-IDF', 'Abstract')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(tfidf_embeddings, y, 'TF-IDF', 'Abstract')

Fold Results for TF-IDF (OvR Random Forest) - Abstract:
                             precision    recall  f1-score   support

                      Audio       0.15      0.50      0.23         8
            Computer Vision       0.96      0.88      0.92      1929
                     Graphs       0.81      0.84      0.83       575
Natural Language Processing       0.84      0.86      0.85       530
     Reinforcement Learning       0.62      0.81      0.70       148
                 Sequential       0.24      0.66      0.36        29

                   accuracy                           0.87      3219
                  macro avg       0.60      0.76      0.65      3219
               weighted avg       0.89      0.87      0.88      3219

Accuracy: 0.8670, Precision: 0.8887, Recall: 0.8670, F1-Score: 0.8753
-----------------------------------
Fold Results for TF-IDF (OvR Random Forest) - Abstract:
                             precision    recall  f1-score   support

                   

In [23]:
# Abstracts -> sentence-transformer embeddings, scored with the 5-fold OvR
# random forest against `y`. Rebinds `sentence_embeddings` (title features).
sentence_embeddings = compute_sentence_embeddings(abstracts)
train_random_forest_with_undersampling(sentence_embeddings, y, 'Sentence Transformer', 'Abstract')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(sentence_embeddings, y, 'Sentence Transformer', 'Abstract')

Fold Results for Sentence Transformer (OvR Random Forest) - Abstract:
                             precision    recall  f1-score   support

                      Audio       0.09      0.75      0.16         8
            Computer Vision       0.87      0.72      0.78      1929
                     Graphs       0.67      0.68      0.68       575
Natural Language Processing       0.51      0.59      0.55       530
     Reinforcement Learning       0.45      0.71      0.55       148
                 Sequential       0.12      0.55      0.19        29

                   accuracy                           0.69      3219
                  macro avg       0.45      0.67      0.48      3219
               weighted avg       0.75      0.69      0.71      3219

Accuracy: 0.6869, Precision: 0.7475, Recall: 0.6869, F1-Score: 0.7090
-----------------------------------
Fold Results for Sentence Transformer (OvR Random Forest) - Abstract:
                             precision    recall  f1-score   

In [24]:
# Abstracts -> CLIP text embeddings, scored with the 5-fold OvR random forest
# against `y`. compute_clip_embeddings tokenizes with truncation=True, so
# over-long abstracts are cut off rather than rejected.
clip_embeddings = compute_clip_embeddings(abstracts)
train_random_forest_with_undersampling(clip_embeddings, y, 'CLIP', 'Abstract')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(clip_embeddings, y, 'CLIP', 'Abstract')

Fold Results for CLIP (OvR Random Forest) - Abstract:
                             precision    recall  f1-score   support

                      Audio       0.04      0.62      0.08         8
            Computer Vision       0.83      0.61      0.70      1929
                     Graphs       0.53      0.44      0.48       575
Natural Language Processing       0.40      0.55      0.47       530
     Reinforcement Learning       0.34      0.53      0.41       148
                 Sequential       0.06      0.55      0.11        29

                   accuracy                           0.56      3219
                  macro avg       0.37      0.55      0.38      3219
               weighted avg       0.67      0.56      0.60      3219

Accuracy: 0.5648, Precision: 0.6736, Recall: 0.5648, F1-Score: 0.6033
-----------------------------------
Fold Results for CLIP (OvR Random Forest) - Abstract:
                             precision    recall  f1-score   support

                      A

In [25]:
# GitHub README contents -> TF-IDF features, scored with the 5-fold OvR random
# forest against `y`. Rebinds `tfidf_embeddings` (abstract features).
tfidf_embeddings = compute_tfidf(readmes)
train_random_forest_with_undersampling(tfidf_embeddings, y, 'TF-IDF', 'GitHub README Content')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(tfidf_embeddings, y, 'TF-IDF', 'GitHub README Content')

Fold Results for TF-IDF (OvR Random Forest) - GitHub README Content:
                             precision    recall  f1-score   support

                      Audio       0.06      0.38      0.11         8
            Computer Vision       0.89      0.67      0.77      1929
                     Graphs       0.63      0.67      0.65       575
Natural Language Processing       0.52      0.61      0.56       530
     Reinforcement Learning       0.38      0.51      0.44       148
                 Sequential       0.03      0.31      0.06        29

                   accuracy                           0.65      3219
                  macro avg       0.42      0.53      0.43      3219
               weighted avg       0.75      0.65      0.69      3219

Accuracy: 0.6505, Precision: 0.7471, Recall: 0.6505, F1-Score: 0.6873
-----------------------------------
Fold Results for TF-IDF (OvR Random Forest) - GitHub README Content:
                             precision    recall  f1-score   su

In [26]:
# GitHub README contents -> sentence-transformer embeddings, scored with the
# 5-fold OvR random forest against `y`. Rebinds `sentence_embeddings`.
sentence_embeddings = compute_sentence_embeddings(readmes)
train_random_forest_with_undersampling(sentence_embeddings, y, 'Sentence Transformer', 'GitHub README Content')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(sentence_embeddings, y, 'Sentence Transformer', 'GitHub README Content')

Fold Results for Sentence Transformer (OvR Random Forest) - GitHub README Content:
                             precision    recall  f1-score   support

                      Audio       0.03      0.62      0.05         8
            Computer Vision       0.85      0.60      0.71      1929
                     Graphs       0.55      0.53      0.54       575
Natural Language Processing       0.39      0.49      0.43       530
     Reinforcement Learning       0.29      0.52      0.37       148
                 Sequential       0.07      0.38      0.12        29

                   accuracy                           0.57      3219
                  macro avg       0.36      0.52      0.37      3219
               weighted avg       0.68      0.57      0.61      3219

Accuracy: 0.5663, Precision: 0.6829, Recall: 0.5663, F1-Score: 0.6084
-----------------------------------
Fold Results for Sentence Transformer (OvR Random Forest) - GitHub README Content:
                             precis

In [27]:
# GitHub README contents -> CLIP text embeddings, scored with the 5-fold OvR
# random forest against `y`. compute_clip_embeddings tokenizes with
# truncation=True, so over-long READMEs are cut off rather than rejected.
clip_embeddings = compute_clip_embeddings(readmes)
train_random_forest_with_undersampling(clip_embeddings, y, 'CLIP', 'GitHub README Content')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(clip_embeddings, y, 'CLIP', 'GitHub README Content')

Fold Results for CLIP (OvR Random Forest) - GitHub README Content:
                             precision    recall  f1-score   support

                      Audio       0.02      0.38      0.03         8
            Computer Vision       0.83      0.55      0.66      1929
                     Graphs       0.50      0.44      0.47       575
Natural Language Processing       0.35      0.45      0.40       530
     Reinforcement Learning       0.19      0.38      0.26       148
                 Sequential       0.03      0.28      0.05        29

                   accuracy                           0.50      3219
                  macro avg       0.32      0.41      0.31      3219
               weighted avg       0.65      0.50      0.56      3219

Accuracy: 0.4998, Precision: 0.6532, Recall: 0.4998, F1-Score: 0.5551
-----------------------------------
Fold Results for CLIP (OvR Random Forest) - GitHub README Content:
                             precision    recall  f1-score   suppor

In [28]:
# SOMEF descriptions -> TF-IDF features. These texts come from `df_somef`, so
# the matching labels `y_somef` are used instead of `y`.
tfidf_embeddings = compute_tfidf(somef)
train_random_forest_with_undersampling(tfidf_embeddings, y_somef, 'TF-IDF', 'SOMEF descriptions')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(tfidf_embeddings, y_somef, 'TF-IDF', 'SOMEF descriptions')

Fold Results for TF-IDF (OvR Random Forest) - SOMEF descriptions:
                             precision    recall  f1-score   support

                      Audio       0.01      0.29      0.01         7
            Computer Vision       0.90      0.60      0.72      1610
                     Graphs       0.70      0.67      0.69       489
Natural Language Processing       0.51      0.51      0.51       445
     Reinforcement Learning       0.39      0.56      0.46       120
                 Sequential       0.04      0.46      0.08        24

                   accuracy                           0.60      2695
                  macro avg       0.42      0.51      0.41      2695
               weighted avg       0.77      0.60      0.66      2695

Accuracy: 0.5963, Precision: 0.7651, Recall: 0.5963, F1-Score: 0.6609
-----------------------------------
Fold Results for TF-IDF (OvR Random Forest) - SOMEF descriptions:
                             precision    recall  f1-score   support


In [29]:
# SOMEF descriptions -> sentence-transformer embeddings. These texts come from
# `df_somef`, so the matching labels `y_somef` are used instead of `y`.
sentence_embeddings = compute_sentence_embeddings(somef)
train_random_forest_with_undersampling(sentence_embeddings, y_somef, 'Sentence Transformer', 'SOMEF descriptions')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(sentence_embeddings, y_somef, 'Sentence Transformer', 'SOMEF descriptions')

Fold Results for Sentence Transformer (OvR Random Forest) - SOMEF descriptions:
                             precision    recall  f1-score   support

                      Audio       0.08      0.86      0.14         7
            Computer Vision       0.87      0.63      0.73      1610
                     Graphs       0.60      0.58      0.59       489
Natural Language Processing       0.40      0.51      0.45       445
     Reinforcement Learning       0.35      0.69      0.46       120
                 Sequential       0.06      0.46      0.11        24

                   accuracy                           0.60      2695
                  macro avg       0.39      0.62      0.42      2695
               weighted avg       0.71      0.60      0.64      2695

Accuracy: 0.6048, Precision: 0.7123, Recall: 0.6048, F1-Score: 0.6418
-----------------------------------
Fold Results for Sentence Transformer (OvR Random Forest) - SOMEF descriptions:
                             precision   

In [30]:
# SOMEF descriptions -> CLIP text embeddings. These texts come from
# `df_somef`, so the matching labels `y_somef` are used instead of `y`.
clip_embeddings = compute_clip_embeddings(somef)
train_random_forest_with_undersampling(clip_embeddings, y_somef, 'CLIP', 'SOMEF descriptions')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(clip_embeddings, y_somef, 'CLIP', 'SOMEF descriptions')

Fold Results for CLIP (OvR Random Forest) - SOMEF descriptions:
                             precision    recall  f1-score   support

                      Audio       0.01      0.29      0.02         7
            Computer Vision       0.84      0.54      0.66      1610
                     Graphs       0.53      0.47      0.50       489
Natural Language Processing       0.35      0.46      0.40       445
     Reinforcement Learning       0.27      0.52      0.35       120
                 Sequential       0.04      0.38      0.07        24

                   accuracy                           0.51      2695
                  macro avg       0.34      0.44      0.33      2695
               weighted avg       0.67      0.51      0.57      2695

Accuracy: 0.5117, Precision: 0.6670, Recall: 0.5117, F1-Score: 0.5653
-----------------------------------
Fold Results for CLIP (OvR Random Forest) - SOMEF descriptions:
                             precision    recall  f1-score   support

   

In [31]:
# GitHub repository titles -> TF-IDF features. These texts come from
# `df_complete`, so the matching labels `y_complete` are used.
tfidf_embeddings = compute_tfidf(github_title)
train_random_forest_with_undersampling(tfidf_embeddings, y_complete, 'TF-IDF', 'GitHub Title')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(tfidf_embeddings, y_complete, 'TF-IDF', 'GitHub Title')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold Results for TF-IDF (OvR Random Forest) - GitHub Title:
                             precision    recall  f1-score   support

                      Audio       0.00      0.00      0.00         2
            Computer Vision       0.87      0.47      0.61       361
                     Graphs       0.49      0.45      0.47       112
Natural Language Processing       0.29      0.27      0.28        73
     Reinforcement Learning       0.09      0.68      0.15        25
                 Sequential       0.14      0.25      0.18         8

                   accuracy                           0.45       581
                  macro avg       0.31      0.35      0.28       581
               weighted avg       0.68      0.45      0.52       581

Accuracy: 0.4475, Precision: 0.6782, Recall: 0.4475, F1-Score: 0.5158
-----------------------------------
Fold Results for TF-IDF (OvR Random Forest) - GitHub Title:
                             precision    recall  f1-score   support

           

In [32]:
# GitHub repository titles -> sentence-transformer embeddings. These texts
# come from `df_complete`, so the matching labels `y_complete` are used.
sentence_embeddings = compute_sentence_embeddings(github_title)
train_random_forest_with_undersampling(sentence_embeddings, y_complete, 'Sentence Transformer', 'GitHub Title')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(sentence_embeddings, y_complete, 'Sentence Transformer', 'GitHub Title')

Fold Results for Sentence Transformer (OvR Random Forest) - GitHub Title:
                             precision    recall  f1-score   support

                      Audio       0.00      0.00      0.00         2
            Computer Vision       0.83      0.55      0.66       361
                     Graphs       0.50      0.40      0.45       112
Natural Language Processing       0.37      0.41      0.39        73
     Reinforcement Learning       0.21      0.64      0.31        25
                 Sequential       0.06      0.62      0.11         8

                   accuracy                           0.51       581
                  macro avg       0.33      0.44      0.32       581
               weighted avg       0.67      0.51      0.56       581

Accuracy: 0.5077, Precision: 0.6679, Recall: 0.5077, F1-Score: 0.5613
-----------------------------------
Fold Results for Sentence Transformer (OvR Random Forest) - GitHub Title:
                             precision    recall  f1-

In [33]:
# GitHub repository titles -> CLIP text embeddings. These texts come from
# `df_complete`, so the matching labels `y_complete` are used.
clip_embeddings = compute_clip_embeddings(github_title)
train_random_forest_with_undersampling(clip_embeddings, y_complete, 'CLIP', 'GitHub Title')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(clip_embeddings, y_complete, 'CLIP', 'GitHub Title')

Fold Results for CLIP (OvR Random Forest) - GitHub Title:
                             precision    recall  f1-score   support

                      Audio       0.00      0.00      0.00         2
            Computer Vision       0.83      0.47      0.60       361
                     Graphs       0.44      0.43      0.43       112
Natural Language Processing       0.27      0.32      0.29        73
     Reinforcement Learning       0.11      0.48      0.18        25
                 Sequential       0.02      0.12      0.03         8

                   accuracy                           0.43       581
                  macro avg       0.28      0.30      0.26       581
               weighted avg       0.64      0.43      0.50       581

Accuracy: 0.4337, Precision: 0.6406, Recall: 0.4337, F1-Score: 0.4993
-----------------------------------
Fold Results for CLIP (OvR Random Forest) - GitHub Title:
                             precision    recall  f1-score   support

               

In [34]:
# GitHub keywords -> TF-IDF features. These texts come from `df_complete`, so
# the matching labels `y_complete` are used.
tfidf_embeddings = compute_tfidf(github_keywords)
train_random_forest_with_undersampling(tfidf_embeddings, y_complete, 'TF-IDF', 'GitHub Keywords')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(tfidf_embeddings, y_complete, 'TF-IDF', 'GitHub Keywords')

Fold Results for TF-IDF (OvR Random Forest) - GitHub Keywords:
                             precision    recall  f1-score   support

                      Audio       0.04      1.00      0.08         2
            Computer Vision       0.91      0.65      0.76       361
                     Graphs       0.60      0.72      0.66       112
Natural Language Processing       0.45      0.38      0.41        73
     Reinforcement Learning       0.43      0.84      0.57        25
                 Sequential       0.14      0.50      0.22         8

                   accuracy                           0.64       581
                  macro avg       0.43      0.68      0.45       581
               weighted avg       0.76      0.64      0.68       581

Accuracy: 0.6386, Precision: 0.7589, Recall: 0.6386, F1-Score: 0.6781
-----------------------------------
Fold Results for TF-IDF (OvR Random Forest) - GitHub Keywords:
                             precision    recall  f1-score   support

     

In [35]:
# GitHub keywords -> sentence-transformer embeddings. These texts come from
# `df_complete`, so the matching labels `y_complete` are used.
sentence_embeddings = compute_sentence_embeddings(github_keywords)
train_random_forest_with_undersampling(sentence_embeddings, y_complete, 'Sentence Transformer', 'GitHub Keywords')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(sentence_embeddings, y_complete, 'Sentence Transformer', 'GitHub Keywords')

Fold Results for Sentence Transformer (OvR Random Forest) - GitHub Keywords:
                             precision    recall  f1-score   support

                      Audio       0.12      0.50      0.20         2
            Computer Vision       0.83      0.64      0.72       361
                     Graphs       0.60      0.59      0.59       112
Natural Language Processing       0.31      0.40      0.35        73
     Reinforcement Learning       0.39      0.80      0.53        25
                 Sequential       0.10      0.50      0.17         8

                   accuracy                           0.60       581
                  macro avg       0.39      0.57      0.43       581
               weighted avg       0.69      0.60      0.63       581

Accuracy: 0.6041, Precision: 0.6876, Recall: 0.6041, F1-Score: 0.6325
-----------------------------------
Fold Results for Sentence Transformer (OvR Random Forest) - GitHub Keywords:
                             precision    recal

In [36]:
# GitHub keywords -> CLIP text embeddings. These texts come from
# `df_complete`, so the matching labels `y_complete` are used.
clip_embeddings = compute_clip_embeddings(github_keywords)
train_random_forest_with_undersampling(clip_embeddings, y_complete, 'CLIP', 'GitHub Keywords')
# Clustering-metric evaluation is disabled for this run:
# evaluate_clustering_metrics(clip_embeddings, y_complete, 'CLIP', 'GitHub Keywords')

Fold Results for CLIP (OvR Random Forest) - GitHub Keywords:
                             precision    recall  f1-score   support

                      Audio       0.10      0.50      0.17         2
            Computer Vision       0.85      0.59      0.70       361
                     Graphs       0.57      0.57      0.57       112
Natural Language Processing       0.24      0.38      0.29        73
     Reinforcement Learning       0.39      0.80      0.53        25
                 Sequential       0.07      0.38      0.12         8

                   accuracy                           0.56       581
                  macro avg       0.37      0.54      0.40       581
               weighted avg       0.69      0.56      0.60       581

Accuracy: 0.5645, Precision: 0.6891, Recall: 0.5645, F1-Score: 0.6043
-----------------------------------
Fold Results for CLIP (OvR Random Forest) - GitHub Keywords:
                             precision    recall  f1-score   support

         