Researcher Profile Mining and Analysis

Step 3: Build Author Profile

In [None]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvi

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from keybert import KeyBERT
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os

In [None]:
# Ensure the NLTK resources used below are available:
#   - 'stopwords' feeds the stop-word set,
#   - 'punkt' / 'punkt_tab' back nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9+ loads the 'punkt_tab' tables instead of the pickled 'punkt' models;
# without this download word_tokenize raises LookupError on a fresh kernel.
# The extra download is harmless on older NLTK releases.
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Create the KeyBERT keyword extractor shared by the rest of the notebook,
# backed by the 'all-MiniLM-L6-v2' embedding model.
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Path of the workbook to mine; its sheet names are treated as researcher names.
excel_file = '/content/Task-1.xlsx'
# Read the sheet names inside a context manager so the file handle is closed
# immediately instead of staying open until the ExcelFile object is collected.
with pd.ExcelFile(excel_file) as workbook:
    researchers = workbook.sheet_names

In [None]:
# Stop-word set used by preprocess_text and extract_themes: a handful of
# generic words (presumably filler in paper abstracts) merged with NLTK's
# English stop-word list.
stop_words = {
    'abstract', 'paper', 'study', 'model', 'models', 'research',
    'approach', 'performance', 'results', 'propose', 'experiments',
} | set(stopwords.words('english'))

In [None]:
# Function to preprocess text
def preprocess_text(text):
    """Lower-case and tokenize ``text``, keeping only informative words.

    A token survives when it is purely alphabetic and is not a member of the
    module-level ``stop_words`` set.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Drop numbers, punctuation and mixed alphanumeric tokens.
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# Function to extract themes
def extract_themes(abstracts, top_n=5):
    """Return up to ``top_n`` theme terms describing a collection of abstracts.

    The abstracts are joined into one document, cleaned with
    ``preprocess_text`` and mined by two extractors: a TF-IDF vectorizer and
    the module-level KeyBERT model ``kw_model``. The two ranked term lists are
    merged alternately (KeyBERT first) so both sources are represented and the
    result is deterministic.

    Args:
        abstracts: Iterable of abstract strings.
        top_n: Maximum number of terms requested from each extractor and
            returned overall. Defaults to 5.

    Returns:
        A list of at most ``top_n`` unique terms, best-ranked first. The list
        is empty when no usable token remains after preprocessing.
    """
    from itertools import zip_longest

    # Combine abstracts
    corpus = ' '.join(abstracts)
    processed_corpus = preprocess_text(corpus)

    # Nothing left after cleaning: TfidfVectorizer would raise ValueError
    # ("empty vocabulary"), so report "no themes" instead.
    if not processed_corpus:
        return []

    # TF-IDF. Only one document is fitted, so the IDF factor is identical for
    # every term and the weights effectively rank terms by frequency.
    vectorizer = TfidfVectorizer(max_features=top_n)
    tfidf_matrix = vectorizer.fit_transform([processed_corpus])
    weights = tfidf_matrix.toarray()[0]
    # get_feature_names_out() is sorted alphabetically; reorder by weight so
    # the strongest term comes first.
    ranked = sorted(
        zip(vectorizer.get_feature_names_out(), weights),
        key=lambda pair: -pair[1],
    )
    tfidf_terms = [str(term) for term, _ in ranked]

    # BERT-based keyword extraction. The stop words must be passed as a list:
    # recent scikit-learn releases reject a set for CountVectorizer's
    # stop_words, and KeyBERT swallows that ValueError and returns [] -- which
    # silently removes every BERT keyword from the result.
    keywords = kw_model.extract_keywords(
        processed_corpus, top_n=top_n, stop_words=list(stop_words)
    )
    bert_terms = [kw[0] for kw in keywords]

    # Combine and deduplicate terms while preserving rank order. A plain
    # set() would make the surviving terms depend on string hash order.
    combined_terms = []
    for bert_term, tfidf_term in zip_longest(bert_terms, tfidf_terms):
        for term in (bert_term, tfidf_term):
            if term is not None and term not in combined_terms:
                combined_terms.append(term)

    return combined_terms[:top_n]

In [None]:
# Function to generate word cloud
def generate_wordcloud(text, researcher, output_dir='wordclouds'):
    """Render a word cloud of ``text`` and save it as a PNG file.

    Args:
        text: Text whose words are drawn in the cloud.
        researcher: Name shown in the figure title and, with spaces replaced
            by underscores, used in the file name.
        output_dir: Directory that receives the image. It is created when
            missing, so the function does not rely on another cell having
            prepared it. Defaults to ``'wordclouds'``.

    Returns:
        The path of the written image:
        ``<output_dir>/wordcloud_<researcher>.png``.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'wordcloud_{researcher.replace(" ", "_")}.png')

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Explicit figure/axes instead of the pyplot state machine, so exactly
    # this figure is saved and closed.
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f'Word Cloud for {researcher}')
        fig.savefig(output_path)
    finally:
        # Always release the figure, even if saving fails; this function is
        # called once per researcher and open figures accumulate in memory.
        plt.close(fig)
    return output_path

In [None]:
# Make sure the 'wordclouds' output folder exists before any image is saved;
# an already existing folder is not an error.
os.makedirs(name='wordclouds', exist_ok=True)

In [None]:
# Process each researcher: derive theme terms and a word cloud from the
# 'Abstract' column of every sheet, then store the summary in the workbook.
profile_data = []

# Open the workbook once and parse every sheet from the same handle instead of
# re-opening and re-reading the file for each researcher.
with pd.ExcelFile(excel_file) as workbook:
    for researcher in researchers:
        # Skip the 'Author_Profiles' sheet (the summary written by this notebook)
        if researcher == 'Author_Profiles':
            continue

        df = workbook.parse(sheet_name=researcher)

        # Check if 'Abstract' column exists before accessing it
        if 'Abstract' not in df.columns:
            print(f"Warning: 'Abstract' column not found in sheet '{researcher}'. Skipping.")
            continue

        # Drop missing cells and whitespace-only strings, which carry no
        # tokens and would make the extractors fail.
        abstracts = [
            abstract
            for abstract in df['Abstract'].dropna().astype(str)
            if abstract.strip()
        ]
        if not abstracts:
            print(f"No abstracts found for researcher: {researcher}")
            continue

        # Extract themes
        themes = extract_themes(abstracts)

        # Generate word cloud (the image is saved by the helper; its path is
        # not needed here)
        generate_wordcloud(' '.join(abstracts), researcher)

        profile_data.append([researcher, ', '.join(themes)])

# Save to summary sheet. The workbook handle above is closed by now, so the
# file can be re-opened in append mode; an existing sheet is replaced.
profile_df = pd.DataFrame(profile_data, columns=['Researcher', 'Top Research Themes'])
with pd.ExcelWriter(excel_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    profile_df.to_excel(writer, sheet_name='Author_Profiles', index=False)

print("Author profiles created and saved. Word clouds saved in 'wordclouds' directory.")

Author profiles created and saved. Word clouds saved in 'wordclouds' directory.


In [None]:
# Build the summary table from the collected [researcher, themes] rows and
# write it to the 'Author_Profiles' sheet of the source workbook, replacing
# that sheet if it is already present.
profile_df = pd.DataFrame(
    data=profile_data,
    columns=['Researcher', 'Top Research Themes'],
)
with pd.ExcelWriter(
    path=excel_file,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    profile_df.to_excel(excel_writer=writer, sheet_name='Author_Profiles', index=False)

print("Author profiles created and saved. Word clouds saved in 'wordclouds' directory.")

Author profiles created and saved. Word clouds saved in 'wordclouds' directory.


In [None]:
# Process each researcher: derive theme terms and a word cloud from the
# 'Abstract' column of every sheet, then show the resulting table.
profile_data = []

# Open the workbook once and parse every sheet from the same handle instead of
# re-opening and re-reading the file for each researcher.
with pd.ExcelFile(excel_file) as workbook:
    for researcher in researchers:
        # Skip the 'Author_Profiles' sheet (the summary written by this notebook)
        if researcher == 'Author_Profiles':
            continue

        df = workbook.parse(sheet_name=researcher)

        if 'Abstract' not in df.columns:
            print(f"Warning: 'Abstract' column not found in sheet '{researcher}'. Skipping.")
            continue

        # Drop missing cells and whitespace-only strings, which carry no
        # tokens and would make the extractors fail.
        abstracts = [
            abstract
            for abstract in df['Abstract'].dropna().astype(str)
            if abstract.strip()
        ]
        if not abstracts:
            print(f"No abstracts found for researcher: {researcher}")
            continue

        # Extract themes
        themes = extract_themes(abstracts)

        # Generate word cloud (the image is saved by the helper; its path is
        # not needed here)
        generate_wordcloud(' '.join(abstracts), researcher)

        profile_data.append([researcher, ', '.join(themes)])

# Create the profile DataFrame
profile_df = pd.DataFrame(profile_data, columns=['Researcher', 'Top Research Themes'])

# Display the profile DataFrame
print("Generated Author Profiles:")
display(profile_df)

Generated Author Profiles:


Unnamed: 0,Researcher,Top Research Themes
0,Songlin Yang,"linear, attention, transformers, span, training"
1,Samuel Cahyawijaya,"llms, languages, across, sea, language"
2,Xian-Ling Mao,"llms, critique, datasets, evaluation, proposed"
3,Kaushal Kumar Maurya,"nlg, llms, languages, two, language"
4,Bowei Zou,"knowledge, commonsense, reasoning, language, q..."
5,Sheng Shen,"llms, tasks, large, language, data"
6,xian-Ling Mao-,"llms, critique, datasets, evaluation, proposed"
7,Jiatong Shi,"speech, tasks, audio, singing, language"
8,Malte Ostendorff,"training, document, similarity, language, data"
9,Jingang Wang,"heat, temperature, asphalt, high, binders"
