# Imports

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import CLIPTokenizer, CLIPModel
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

In [None]:
# Sentence-BERT encoder; compute_sentence_embeddings() calls its .encode().
model_bert = SentenceTransformer('all-MiniLM-L6-v2')

# CLIP model and tokenizer, both loaded from the same pretrained checkpoint;
# compute_clip_embeddings() uses them together.
model_clip, tokenizer = (
    CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
    CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32"),
)

In [None]:
def compute_tfidf(text_list):
    """Fit a TF-IDF vectorizer on ``text_list`` and return dense vectors.

    The vectorizer drops English stop words, keeps at most 3000 features
    and applies sublinear term-frequency scaling.

    Args:
        text_list: Iterable of raw text documents.

    Returns:
        The fitted TF-IDF matrix converted to a dense array (one row per
        document).
    """
    tfidf = TfidfVectorizer(
        stop_words='english',
        max_features=3000,
        sublinear_tf=True,
    )
    # fit_transform yields a sparse matrix; downstream code expects dense.
    return tfidf.fit_transform(text_list).toarray()

In [None]:
def compute_sentence_embeddings(text_list, batch_size=256):
    """Encode texts with the module-level ``model_bert`` in batches.

    Args:
        text_list: Sequence of strings; each is stripped of surrounding
            whitespace before encoding.
        batch_size: Number of texts handed to ``model_bert.encode`` per call.

    Returns:
        All batch outputs stacked row-wise with ``np.vstack``.

    Raises:
        ValueError: If ``text_list`` is empty (``np.vstack`` receives no
            arrays).
    """
    cleaned = [text.strip() for text in text_list]
    batch_starts = range(0, len(cleaned), batch_size)
    encoded_batches = [
        model_bert.encode(cleaned[start:start + batch_size])
        for start in batch_starts
    ]
    # Stack the per-batch results into a single matrix.
    return np.vstack(encoded_batches)


In [None]:
def compute_clip_embeddings(text_list, batch_size=256):
    """Compute CLIP text features for ``text_list`` in batches.

    Each batch is tokenized with the module-level ``tokenizer`` (padding and
    truncation enabled, PyTorch tensors) and passed to
    ``model_clip.get_text_features`` with gradient tracking disabled.

    Args:
        text_list: Sequence of strings (sliced per batch).
        batch_size: Number of texts processed per forward pass.

    Returns:
        All batch feature arrays stacked row-wise with ``np.vstack``.

    Raises:
        ValueError: If ``text_list`` is empty (``np.vstack`` receives no
            arrays).
    """
    def _encode_batch(chunk):
        # Tokenize one slice and run it through the CLIP text encoder.
        tokens = tokenizer(chunk, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            return model_clip.get_text_features(**tokens).cpu().numpy()

    per_batch = [
        _encode_batch(text_list[start:start + batch_size])
        for start in range(0, len(text_list), batch_size)
    ]
    return np.vstack(per_batch)

In [None]:
def reduce_dimensionality(embeddings, method):
    """Project ``embeddings`` to two dimensions with PCA or t-SNE.

    Args:
        embeddings: Feature matrix accepted by the chosen reducer's
            ``fit_transform``.
        method: ``'pca'`` for ``PCA(n_components=2)`` or ``'tsne'`` for
            ``TSNE(n_components=2, random_state=42)``.

    Returns:
        The two-component result of ``fit_transform``.

    Raises:
        ValueError: If ``method`` is neither ``'pca'`` nor ``'tsne'``.
    """
    if method == 'pca':
        reducer = PCA(n_components=2)
    elif method == 'tsne':
        # Fixed seed so repeated runs produce the same layout.
        reducer = TSNE(n_components=2, random_state=42)
    else:
        # Previously this fell through to an unbound local at the return.
        raise ValueError(
            f"Unknown dimensionality reduction method {method!r}; "
            "expected 'pca' or 'tsne'."
        )
    return reducer.fit_transform(embeddings)

In [None]:
def plot_embeddings(embeddings, color_by, title, color_map='Viridis'):
    """Scatter-plot 2-D embeddings colored by category, save a PNG and show it.

    Args:
        embeddings: Two-column array-like; the columns become the ``x`` and
            ``y`` coordinates of the scatter plot.
        color_by: One label per row of ``embeddings``; converted to a pandas
            ``Categorical`` and used for discrete point colors.
        title: Figure title. Also determines the output filename
            (lower-cased, spaces replaced by underscores).
        color_map: Currently unused; kept so existing callers keep working.
            Points are colored with Plotly's qualitative palette.

    Side effects:
        Writes ``../plots/rq2/<filename>.png`` (creating the directory if it
        does not exist) and displays the figure.
    """
    # Local import keeps this fix self-contained within the function.
    from pathlib import Path

    # Frame holding the coordinates and the categorical color column.
    plot_df = pd.DataFrame(embeddings, columns=['x', 'y'])
    plot_df['category'] = pd.Categorical(color_by)

    # Map the coordinate columns explicitly; without x/y Plotly Express does
    # not plot the embedding as a 2-D point cloud.
    fig = px.scatter(
        plot_df,
        x='x',
        y='y',
        color='category',
        title=title,
        color_discrete_sequence=px.colors.qualitative.Plotly,
        labels={'category': 'Category'}
    )

    # Customize the layout for a publication-style figure
    fig.update_layout(
        title=title,
        title_x=0.5,
        xaxis_title='',
        yaxis_title='',
        legend_title_text='Research area',
        font=dict(family="Arial", size=14),
        width=800,
        height=600,
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False),
        showlegend=True,
        margin=dict(t=50),
    )

    filename = title.lower().replace(' ', '_')
    filepath = f'../plots/rq2/{filename}.png'
    # Make sure the target directory exists before exporting the image.
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    fig.write_image(filepath, width=1600, height=1200, scale=1)
    # Display plot
    fig.show()

# Load Data

In [None]:
print('Load data')
# Read the main dataset; the encoding is explicit so decoding does not depend
# on the platform's default locale.
with open('../data/final_dataset.json', 'r', encoding='utf-8') as f:
    papers_data = json.load(f)

df = pd.DataFrame(papers_data)
print(f'Number of samples: {df.shape[0]}')

# Text fields embedded and plotted by the cells below.
titles = df['paper_title'].tolist()
abstracts = df['abstract'].tolist()
readmes = df['github_readme_content'].tolist()

In [None]:
print('Load data for somef decriptions')
# Read the filtered dataset that carries the SOMEF descriptions; the encoding
# is explicit so decoding does not depend on the platform's default locale.
with open('../data/filtered_data.json', 'r', encoding='utf-8') as f:
    papers_data_somef = json.load(f)

df_somef = pd.DataFrame(papers_data_somef)
print(f'Number of samples: {df_somef.shape[0]}')

somef = df_somef['somef_descriptions'].tolist()

In [None]:
print('Load data for github titles and keywords')
# Read the dataset that carries GitHub repo titles and keywords; the encoding
# is explicit so decoding does not depend on the platform's default locale.
with open('../data/filtered_data_complete.json', 'r', encoding='utf-8') as f:
    papers_data_complete = json.load(f)

df_complete = pd.DataFrame(papers_data_complete)
print(f'Number of samples: {df_complete.shape[0]}')
github_title = df_complete['github_repo_title'].tolist()
github_keywords = df_complete['github_keywords'].tolist()

In [None]:
# Paper titles: Sentence-BERT embeddings, plotted once with t-SNE and once
# with PCA, colored by `main_collection_area`.
sentence_embeddings_titles = compute_sentence_embeddings(text_list=titles)

reduced_embeddings_titles = reduce_dimensionality(
    embeddings=sentence_embeddings_titles,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_embeddings_titles,
    color_by=df['main_collection_area'],
    title=f'Sentence-BERT Embeddings (Titles #{len(titles)} ) - TSNE - Colored by Area',
    color_map='plasma',
)

reduced_embeddings_titles = reduce_dimensionality(
    embeddings=sentence_embeddings_titles,
    method='pca',
)
plot_embeddings(
    embeddings=reduced_embeddings_titles,
    color_by=df['main_collection_area'],
    title=f'Sentence-BERT Embeddings (Titles #{len(titles)} ) - PCA - Colored by Area',
    color_map='plasma',
)


In [None]:
# Paper titles: Sentence-BERT embeddings reduced to 2-D with t-SNE and
# colored by `main_collection_area`.
sentence_embeddings_titles = compute_sentence_embeddings(text_list=titles)
reduced_embeddings_titles = reduce_dimensionality(
    embeddings=sentence_embeddings_titles,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_embeddings_titles,
    color_by=df['main_collection_area'],
    title='Sentence-BERT Embeddings (T-SNE) - Paper Titles',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_embeddings_titles = reduce_dimensionality(sentence_embeddings_titles, method='pca')
#plot_embeddings(reduced_embeddings_titles, df['main_collection_area'], 'Sentence-BERT Embeddings (PCA) - Paper Titles', 'plasma')

In [None]:
# Paper titles: CLIP text embeddings reduced to 2-D with t-SNE and colored
# by `main_collection_area`.
clip_embeddings_titles = compute_clip_embeddings(text_list=titles)
reduced_clip_titles = reduce_dimensionality(
    embeddings=clip_embeddings_titles,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_clip_titles,
    color_by=df['main_collection_area'],
    title='CLIP Embeddings (T-SNE) - Paper Titles',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_clip_titles = reduce_dimensionality(clip_embeddings_titles, method='pca')
#plot_embeddings(reduced_clip_titles, df['main_collection_area'], 'CLIP Embeddings (PCA) - Paper Titles', 'plasma')

In [None]:
# Abstracts: TF-IDF vectors reduced to 2-D with t-SNE and colored by
# `main_collection_area`.
abstracts_tfidf = compute_tfidf(text_list=abstracts)
reduced_abstracts_tfidf = reduce_dimensionality(
    embeddings=abstracts_tfidf,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_abstracts_tfidf,
    color_by=df['main_collection_area'],
    title='TF-IDF Embeddings (T-SNE) - Abstracts',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_abstracts_tfidf = reduce_dimensionality(abstracts_tfidf, method='pca')
#plot_embeddings(reduced_abstracts_tfidf, df['main_collection_area'], 'TF-IDF Embeddings (PCA) - Abstracts', 'plasma')

In [None]:
# Abstracts: Sentence-BERT embeddings reduced to 2-D with t-SNE and colored
# by `main_collection_area`.
sentence_embeddings_abstracts = compute_sentence_embeddings(text_list=abstracts)
reduced_embeddings_abstracts = reduce_dimensionality(
    embeddings=sentence_embeddings_abstracts,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_embeddings_abstracts,
    color_by=df['main_collection_area'],
    title='Sentence-BERT Embeddings (T-SNE) - Abstracts',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_embeddings_abstracts = reduce_dimensionality(sentence_embeddings_abstracts, method='pca')
#plot_embeddings(reduced_embeddings_abstracts, df['main_collection_area'], 'Sentence-BERT Embeddings (PCA) - Abstracts', 'plasma')

In [None]:
# Abstracts: CLIP text embeddings reduced to 2-D with t-SNE and colored by
# `main_collection_area`.
clip_embeddings_abstracts = compute_clip_embeddings(text_list=abstracts)
reduced_clip_abstracts = reduce_dimensionality(
    embeddings=clip_embeddings_abstracts,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_clip_abstracts,
    color_by=df['main_collection_area'],
    title='CLIP Embeddings (T-SNE) - Abstracts',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_clip_abstracts = reduce_dimensionality(clip_embeddings_abstracts, method='pca')
#plot_embeddings(reduced_clip_abstracts, df['main_collection_area'], 'CLIP Embeddings (PCA) - Abstracts', 'plasma')

In [None]:
# READMEs: TF-IDF vectors reduced to 2-D with t-SNE and colored by
# `main_collection_area`.
readmes_tfidf = compute_tfidf(text_list=readmes)
reduced_readmes_tfids = reduce_dimensionality(
    embeddings=readmes_tfidf,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_readmes_tfids,
    color_by=df['main_collection_area'],
    title='TF-IDF Embeddings (T-SNE) - READMEs',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_readmes_tfids = reduce_dimensionality(readmes_tfidf, method='pca')
#plot_embeddings(reduced_readmes_tfids, df['main_collection_area'], 'TF-IDF Embeddings (PCA) - READMEs', 'plasma')

In [None]:
# READMEs: Sentence-BERT embeddings reduced to 2-D with t-SNE and colored by
# `main_collection_area`.
sentence_embeddings_readmes = compute_sentence_embeddings(text_list=readmes)
reduced_embeddings_readmes = reduce_dimensionality(
    embeddings=sentence_embeddings_readmes,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_embeddings_readmes,
    color_by=df['main_collection_area'],
    title='Sentence-BERT Embeddings (T-SNE) - READMEs',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_embeddings_readmes = reduce_dimensionality(sentence_embeddings_readmes, method='pca')
#plot_embeddings(reduced_embeddings_readmes, df['main_collection_area'], 'Sentence-BERT Embeddings (PCA) - READMEs', 'plasma')

In [None]:
# READMEs: CLIP text embeddings reduced to 2-D with t-SNE and colored by
# `main_collection_area`.
clip_embeddings_readmes = compute_clip_embeddings(text_list=readmes)
reduced_clip_readmes = reduce_dimensionality(
    embeddings=clip_embeddings_readmes,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_clip_readmes,
    color_by=df['main_collection_area'],
    title='CLIP Embeddings (T-SNE) - READMEs',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_clip_readmes = reduce_dimensionality(clip_embeddings_readmes, method='pca')
#plot_embeddings(reduced_clip_readmes, df['main_collection_area'], 'CLIP Embeddings (PCA) - READMEs', 'plasma')

In [None]:
# SOMEF descriptions: TF-IDF vectors reduced to 2-D with t-SNE and colored by
# `main_collection_area` from `df_somef`.
somef_tfidf = compute_tfidf(text_list=somef)
reduced_somef_tfids = reduce_dimensionality(
    embeddings=somef_tfidf,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_somef_tfids,
    color_by=df_somef['main_collection_area'],
    title='TF-IDF Embeddings (T-SNE) - Descriptions',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_somef_tfids = reduce_dimensionality(somef_tfidf, method='pca')
#plot_embeddings(reduced_somef_tfids, df_somef['main_collection_area'], 'TF-IDF Embeddings (PCA) - Descriptions', 'plasma')

In [None]:
# SOMEF descriptions: Sentence-BERT embeddings reduced to 2-D with t-SNE and
# colored by `main_collection_area` from `df_somef`.
sentence_embeddings_somef = compute_sentence_embeddings(text_list=somef)
reduced_embeddings_somef = reduce_dimensionality(
    embeddings=sentence_embeddings_somef,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_embeddings_somef,
    color_by=df_somef['main_collection_area'],
    title='Sentence-BERT Embeddings (T-SNE) - Descriptions',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_embeddings_somef = reduce_dimensionality(sentence_embeddings_somef, method='pca')
#plot_embeddings(reduced_embeddings_somef, df_somef['main_collection_area'], 'Sentence-BERT Embeddings (PCA) - Descriptions', 'plasma')

In [None]:
# SOMEF descriptions: CLIP text embeddings reduced to 2-D with t-SNE and
# colored by `main_collection_area` from `df_somef`.
clip_embeddings_somef = compute_clip_embeddings(text_list=somef)
reduced_clip_somef = reduce_dimensionality(
    embeddings=clip_embeddings_somef,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_clip_somef,
    color_by=df_somef['main_collection_area'],
    title='CLIP Embeddings (T-SNE) - Descriptions',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_clip_somef = reduce_dimensionality(clip_embeddings_somef, method='pca')
#plot_embeddings(reduced_clip_somef, df_somef['main_collection_area'], 'CLIP Embeddings (PCA) - Descriptions', 'plasma')

In [None]:
# GitHub repo titles: TF-IDF vectors reduced to 2-D with t-SNE and colored by
# `main_collection_area` from `df_complete`.
github_titles_tfidf = compute_tfidf(text_list=github_title)
reduced_github_titles_tfids = reduce_dimensionality(
    embeddings=github_titles_tfidf,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_github_titles_tfids,
    color_by=df_complete['main_collection_area'],
    title='TF-IDF Embeddings (T-SNE) - Titles',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_github_titles_tfids = reduce_dimensionality(github_titles_tfidf, method='pca')
#plot_embeddings(reduced_github_titles_tfids, df_complete['main_collection_area'], 'TF-IDF Embeddings (PCA) - Titles', 'plasma')

In [None]:
# GitHub repo titles: Sentence-BERT embeddings reduced to 2-D with t-SNE and
# colored by `main_collection_area` from `df_complete`.
sentence_embeddings_github_titles = compute_sentence_embeddings(text_list=github_title)
reduced_embeddings_github_titles = reduce_dimensionality(
    embeddings=sentence_embeddings_github_titles,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_embeddings_github_titles,
    color_by=df_complete['main_collection_area'],
    title='Sentence-BERT Embeddings (T-SNE) - Titles',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_embeddings_github_titles = reduce_dimensionality(sentence_embeddings_github_titles, method='pca')
#plot_embeddings(reduced_embeddings_github_titles, df_complete['main_collection_area'], 'Sentence-BERT Embeddings (PCA) - Titles', 'plasma')

In [None]:
# GitHub repo titles: CLIP text embeddings reduced to 2-D with t-SNE and
# colored by `main_collection_area` from `df_complete`.
clip_embeddings_github_titles = compute_clip_embeddings(text_list=github_title)
reduced_clip_github_titles = reduce_dimensionality(
    embeddings=clip_embeddings_github_titles,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_clip_github_titles,
    color_by=df_complete['main_collection_area'],
    title='CLIP Embeddings (T-SNE) - Titles',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_clip_github_titles = reduce_dimensionality(clip_embeddings_github_titles, method='pca')
#plot_embeddings(reduced_clip_github_titles, df_complete['main_collection_area'], 'CLIP Embeddings (PCA) - Titles', 'plasma')

In [None]:
# GitHub keywords: Sentence-BERT embeddings reduced to 2-D with t-SNE and
# colored by `main_collection_area` from `df_complete`.
# NOTE(review): compute_sentence_embeddings calls .strip() on every entry, so
# each element of `github_keywords` must be a string - confirm against the data.
sentence_embeddings_github_keywords = compute_sentence_embeddings(text_list=github_keywords)
reduced_embeddings_github_keywords = reduce_dimensionality(
    embeddings=sentence_embeddings_github_keywords,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_embeddings_github_keywords,
    color_by=df_complete['main_collection_area'],
    title='Sentence-BERT Embeddings (T-SNE) - Keywords',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_embeddings_github_keywords = reduce_dimensionality(sentence_embeddings_github_keywords, method='pca')
#plot_embeddings(reduced_embeddings_github_keywords, df_complete['main_collection_area'], 'Sentence-BERT Embeddings (PCA) - Keywords', 'plasma')

In [None]:
# GitHub keywords: CLIP text embeddings reduced to 2-D with t-SNE and colored
# by `main_collection_area` from `df_complete`.
clip_embeddings_github_keywords = compute_clip_embeddings(text_list=github_keywords)
reduced_clip_github_keywords = reduce_dimensionality(
    embeddings=clip_embeddings_github_keywords,
    method='tsne',
)
plot_embeddings(
    embeddings=reduced_clip_github_keywords,
    color_by=df_complete['main_collection_area'],
    title='CLIP Embeddings (T-SNE) - Keywords',
    color_map='plasma',
)
# PCA variant (disabled):
#reduced_clip_github_keywords = reduce_dimensionality(clip_embeddings_github_keywords, method='pca')
#plot_embeddings(reduced_clip_github_keywords, df_complete['main_collection_area'], 'CLIP Embeddings (PCA) - Keywords', 'plasma')