In [19]:
import json
import numpy as np
import pandas as pd
import torch
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
from transformers import CLIPModel, CLIPTokenizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from tabulate import tabulate

In [3]:
with open('../data/title_abstract_readme_somef_clean_parallel4.json', 'r') as f:
    papers_data = json.load(f)

In [4]:
df = pd.DataFrame(papers_data)
#print(df.shape)
df = df.dropna(how='any')
df = df.reset_index(drop=True)
print(df.shape)

(15549, 9)


In [5]:
github_title = df['github_repo_title'].tolist()
github_keywords = df['github_keywords'].tolist()

In [6]:
def compute_tfidf(text_list, batch_size=1):
    vectorizer = TfidfVectorizer()

    # First, fit the vectorizer on the entire text_list to ensure consistent feature space
    vectorizer.fit(text_list)
    embeddings = []

    # Process in batches
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        # Use the pre-fitted vectorizer to transform the batch
        batch_vectors = vectorizer.transform(batch)
        batch_cosine_sim = cosine_similarity(batch_vectors)
        embeddings.append(batch_cosine_sim)

    # Concatenate all the batch results
    return np.vstack(embeddings)

In [44]:
def preprocess_text(text):
    # Remove punctuation, numbers, and lower the text
    text = re.sub(f"[{string.punctuation}0-9]", " ", text.lower())
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

def compute_tfidf_no_batches(text_list):
    # Preprocess each text in the list
    text_list = [preprocess_text(text) for text in text_list]
    print(text_list[:10])
    
    # Define the TF-IDF vectorizer with tuning
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, ngram_range=(1, 2), stop_words='english', max_features=3000)

    
    vectors = vectorizer.fit_transform(text_list)
    return vectors.toarray()

In [45]:
tfidf_embeddings_no_batches = compute_tfidf_no_batches(github_title)

['pytorch scalablefhvae', 'ivus image segmentation icsm', 'bwgan pytorch', 'surface networks', 'multivariate gaussian distributions tensorflow', 'vmp svae', 'randomdepthwisecnn', 'temporal coherence based self supervised learning laparoscopic workflow analysis', 'interactive classification deep learning interpretation', 'pytorch implementation one shot unsupervised cross domain translation arxiv']


In [47]:
for t in tfidf_embeddings_no_batches[1]:
    print(t)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
