# Imports

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from transformers import CLIPTokenizer, CLIPModel
from sentence_transformers import SentenceTransformer
import Levenshtein
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [2]:
# Load the paper metadata: a JSON list with one dict per paper.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly;
# otherwise open() falls back to the platform's locale encoding and the load
# can fail (or mis-decode non-ASCII characters) on some machines.
with open('data/paper_title_abstract.json', 'r', encoding='utf-8') as f:
    papers_data = json.load(f)

In [3]:
# Peek at the first record to see which fields each paper carries.
papers_data[0]

{'paper_title': 'Dynamic Network Model from Partial Observations',
 'abstract': 'Can evolving networks be inferred and modeled without directly observing\ntheir nodes and edges? In many applications, the edges of a dynamic network\nmight not be observed, but one can observe the dynamics of stochastic cascading\nprocesses (e.g., information diffusion, virus propagation) occurring over the\nunobserved network. While there have been efforts to infer networks based on\nsuch data, providing a generative probabilistic model that is able to identify\nthe underlying time-varying network remains an open question. Here we consider\nthe problem of inferring generative dynamic network models based on network\ncascade diffusion data. We propose a novel framework for providing a\nnon-parametric dynamic network model--based on a mixture of coupled\nhierarchical Dirichlet processes-- based on data capturing cascade node\ninfection times. Our approach allows us to infer the evolving community\nstructur

In [4]:
# One row per paper record; the keys of each record dict become the columns.
df = pd.DataFrame(papers_data)

In [5]:
# Class balance check: number of papers per main collection area.
df['main_collection_area'].value_counts()

main_collection_area
Computer Vision                24874
Natural Language Processing     7863
Graphs                          7017
Reinforcement Learning          2516
Sequential                       424
Audio                             90
Name: count, dtype: int64

# Define the embeddings

TF-IDF

In [6]:
def compute_tfidf(text_list):
    """Return the pairwise cosine similarity of the TF-IDF vectors of ``text_list``.

    A fresh ``TfidfVectorizer`` (default settings) is fitted on ``text_list``
    and every text is compared against every other one.

    Args:
        text_list: iterable of raw text documents.

    Returns:
        The result of ``cosine_similarity`` on the fitted TF-IDF matrix, i.e. a
        square matrix with one row and one column per input text.

    NOTE(review): despite the name, this returns similarities rather than the
    TF-IDF vectors themselves, so the output size grows quadratically with the
    number of texts.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(text_list)
    return cosine_similarity(tfidf_matrix)

Sentence-BERT embedding

In [7]:
# Sentence-BERT encoder, loaded once here and shared by every call below.
model_bert = SentenceTransformer('all-MiniLM-L6-v2')


def compute_sentence_embeddings(text_list):
    """Encode ``text_list`` with the module-level ``model_bert`` encoder.

    Args:
        text_list: the texts to embed.

    Returns:
        Whatever ``model_bert.encode`` produces for ``text_list``.
    """
    sentence_vectors = model_bert.encode(text_list)
    return sentence_vectors



OpenAI CLIP embedding

In [8]:
# CLIP model and matching tokenizer, loaded once here and shared by every call below.
model_clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")


def compute_clip_embeddings(text_list, batch_size=64):
    """Embed texts with CLIP's text encoder, one mini-batch at a time.

    Tokenizing and encoding the whole corpus in a single forward pass needs
    memory proportional to the corpus size; processing ``batch_size`` texts at
    a time keeps peak memory bounded.

    Args:
        text_list: a sequence of strings (a single string is treated as a
            one-element list).
        batch_size: number of texts encoded per forward pass; must be >= 1.

    Returns:
        A 2-D numpy array with one row of text features per input text, in
        input order. Empty input yields an array with zero rows.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    # A bare string would otherwise be sliced character by character below.
    if isinstance(text_list, str):
        text_list = [text_list]
    else:
        text_list = list(text_list)

    if not text_list:
        # Nothing to encode: return a correctly shaped empty result instead of
        # letting the tokenizer / np.concatenate fail on empty input.
        # float32 assumes the model was loaded with its default dtype.
        return np.empty((0, model_clip.config.projection_dim), dtype=np.float32)

    feature_batches = []
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        for start in range(0, len(text_list), batch_size):
            chunk = text_list[start:start + batch_size]
            # Padding is applied per batch; truncation=True cuts over-long texts.
            inputs = tokenizer(chunk, padding=True, truncation=True, return_tensors="pt")
            features = model_clip.get_text_features(**inputs)
            feature_batches.append(features.cpu().numpy())
    return np.concatenate(feature_batches, axis=0)

# Calculate the embeddings

In [9]:
# Plain Python lists of the two text fields, in DataFrame row order.
# These are the inputs to every embedding function called below.
titles = df['paper_title'].tolist()
abstracts = df['abstract'].tolist()

: 

In [23]:
# NOTE(review): compute_tfidf returns a pairwise cosine-similarity matrix, so
# each result has one row AND one column per paper. Memory therefore grows
# quadratically with the corpus size — confirm it fits in RAM for the full data.
titles_tfidf = compute_tfidf(titles)
abstracts_tfidf = compute_tfidf(abstracts)

In [None]:
# Sentence-BERT embeddings for titles and abstracts (one embedding per paper).
sentence_embeddings_titles = compute_sentence_embeddings(titles)
sentence_embeddings_abstracts = compute_sentence_embeddings(abstracts)

In [None]:
# CLIP text embeddings for titles and abstracts.
# NOTE(review): compute_clip_embeddings tokenizes with truncation=True, so long
# abstracts are presumably cut at the tokenizer's maximum length — confirm that
# embedding only the leading part of each abstract is acceptable.
clip_embeddings_titles = compute_clip_embeddings(titles)
clip_embeddings_abstracts = compute_clip_embeddings(abstracts)

# Reduce dimensionality for the plot

In [2]:
def reduce_dimensionality(embeddings, method='pca', random_state=42):
    """Project ``embeddings`` down to two dimensions for plotting.

    Args:
        embeddings: 2-D array-like, one row per sample.
        method: ``'pca'`` (default) or ``'tsne'``.
        random_state: seed handed to the reducer so repeated runs give the same
            projection. The default of 42 matches the seed previously
            hard-coded for t-SNE.

    Returns:
        The reducer's ``fit_transform`` output: one 2-component row per sample.

    Raises:
        ValueError: if ``method`` is not one of the supported names. Previously
            an unknown method fell through both branches and failed with an
            UnboundLocalError on the return statement.
    """
    # Validate first so a typo fails fast with a clear message.
    if method not in ('pca', 'tsne'):
        raise ValueError(
            f"Unknown dimensionality reduction method {method!r}; expected 'pca' or 'tsne'"
        )

    if method == 'pca':
        # Seeded as well: PCA's randomized/arpack solvers are otherwise not reproducible.
        reducer = PCA(n_components=2, random_state=random_state)
    else:
        reducer = TSNE(n_components=2, random_state=random_state)
    return reducer.fit_transform(embeddings)

In [1]:
# 2-D projection of the TF-IDF similarity matrices using the default method (PCA).
# The cell that defines reduce_dimensionality must run first: the NameError saved
# with this cell comes from executing it before that definition existed in the kernel.
# NOTE(review): `reduced_titles_tfids` (sic) is the name the TF-IDF plotting cell
# reads; if the typo is fixed, rename it in both places together.
reduced_titles_tfids = reduce_dimensionality(titles_tfidf)
reduced_abstracts_tfidf = reduce_dimensionality(abstracts_tfidf)

NameError: name 'reduce_dimensionality' is not defined

In [None]:
# 2-D projection of the Sentence-BERT embeddings using the default method (PCA).
reduced_embeddings_titles = reduce_dimensionality(sentence_embeddings_titles)
reduced_embeddings_abstracts = reduce_dimensionality(sentence_embeddings_abstracts)

In [None]:
# 2-D projection of the CLIP embeddings using the default method (PCA).
# The CLIP plotting cell reads both names, so this cell has to run before it.
reduced_clip_titles = reduce_dimensionality(clip_embeddings_titles)
reduced_clip_abstracts = reduce_dimensionality(clip_embeddings_abstracts)

# Plot

In [3]:
def plot_embeddings(embeddings, color_by, title, color_map):
    """Scatter-plot 2-D embeddings, colored by a categorical label.

    Args:
        embeddings: array indexable as ``embeddings[:, 0]`` / ``embeddings[:, 1]``
            (x and y coordinates, one row per point).
        color_by: one label per point; converted with ``pd.Categorical``.
        title: figure title.
        color_map: matplotlib colormap (name or object) used for the categories.

    The legend is built from the category codes that actually occur, and each
    handle is paired with the label of that same code. Previously the handles
    came from the codes present while the labels came from all categories, so
    missing values (code -1) or unused categories shifted labels onto the
    wrong colors.
    """
    # Encode labels as integer codes; pandas uses -1 for missing values.
    categories = pd.Categorical(color_by)
    category_codes = categories.codes
    category_labels = categories.categories

    plt.figure(figsize=(15, 5))
    scatter = plt.scatter(embeddings[:, 0], embeddings[:, 1], c=category_codes, cmap=color_map)
    plt.title(title)

    # One legend entry per code present in the data, using the same
    # colormap + normalization the scatter applied to the points.
    present_codes = np.unique(category_codes)
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=scatter.cmap(scatter.norm(code)), markersize=10)
        for code in present_codes
    ]
    legend_labels = [
        "(missing)" if code < 0 else str(category_labels[code])
        for code in present_codes
    ]
    plt.legend(legend_elements, legend_labels, title="Category", loc="upper right")

    plt.show()

## Plot TF-IDF Embeddings

In [None]:
# TF-IDF projections, colored by research area.
# NOTE(review): the only area-like column inspected earlier in this notebook is
# 'main_collection_area' — confirm that df really has an 'area' column.
plot_embeddings(reduced_titles_tfids, df['area'], 'TF-IDF Embeddings (Titles) - Colored by Area', 'plasma')
plot_embeddings(reduced_abstracts_tfidf, df['area'], 'TF-IDF Embeddings (Abstracts) - Colored by Area', 'plasma')

## Plot Sentence-BERT Embeddings 

In [None]:
# Sentence-BERT projections, colored by research area.
# NOTE(review): confirm df has an 'area' column; only 'main_collection_area'
# is inspected earlier in this notebook.
plot_embeddings(reduced_embeddings_titles, df['area'], 'Sentence-BERT Embeddings (Titles) - Colored by Area', 'plasma')
plot_embeddings(reduced_embeddings_abstracts, df['area'], 'Sentence-BERT Embeddings (Abstracts) - Colored by Area', 'plasma')

## Plot CLIP Embeddings

In [4]:
# CLIP projections, colored by research area.
# Requires reduced_clip_titles / reduced_clip_abstracts: the NameError saved with
# this cell comes from running it before the CLIP reduction cell had been executed.
# NOTE(review): confirm df has an 'area' column; only 'main_collection_area'
# is inspected earlier in this notebook.
plot_embeddings(reduced_clip_titles, df['area'], 'CLIP Embeddings (Titles) - Colored by Area', 'plasma')
plot_embeddings(reduced_clip_abstracts, df['area'], 'CLIP Embeddings (Abstracts) - Colored by Area', 'plasma')

NameError: name 'reduced_clip_titles' is not defined