In [8]:
#!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.11-cp312-cp312-win_amd64.whl.metadata (8.0 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.4-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-

In [9]:
###' ################################################################################
###'
###' IMPORT LIBRARIES
###'
###'



### pandas and numpy
import pandas as pd
import numpy as np
import spacy
import glob
import os
from pathlib import Path

### punctuation, stop words and English language model
from string import punctuation
from spacy.lang.en.stop_words import STOP_WORDS
from spellchecker import SpellChecker
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import en_core_web_sm
nlp = en_core_web_sm.load()
import scattertext as st
import re

### textblob
from textblob import TextBlob

### countvectorizer, tfidfvectorizer, embeddings
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.manifold import TSNE
import umap.umap_ as umap

import torch
import umap
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset


### tqdm
from tqdm import tqdm

### gensim
import gensim
from gensim import models

### PCA
import random
from adjustText import adjust_text

### plotting
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### kMeans and silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

### ignore warnings
import warnings
warnings.filterwarnings('ignore')

###time
from datetime import datetime

In [10]:
# Read the tokenized articles from the local data directory
# (the directory is created if it does not exist yet).
data_dir = Path(r"C:\Users\Hyemi\Python\TopicModeling\Data")
data_dir.mkdir(parents=True, exist_ok=True)
file_path = data_dir.joinpath("articles_tokenize.csv")
articles_full = pd.read_csv(file_path)

## 1. Embedding: Year Group 1 (2010–2016)

In [11]:
# Map a publication year onto one of four period labels
def classify_year_group(year):
    """Return the period label ("1" to "4") for a publication year.

    "1" covers 2010-2016, "2" covers 2017-2019 and "3" covers 2020-2021
    (all bounds inclusive). Any value that falls in none of those ranges,
    including earlier years, later years and NaN, is labelled "4".
    """
    periods = (
        (2010, 2016, "1"),
        (2017, 2019, "2"),
        (2020, 2021, "3"),
    )
    for first_year, last_year, label in periods:
        if first_year <= year <= last_year:
            return label
    return "4"

# Label every article with its year group
articles_full["Year_Group"] = articles_full["Year"].apply(classify_year_group)

# Load the tokenizer and the token-classification model from the same checkpoint
checkpoint_name = "xlm-roberta-large-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForTokenClassification.from_pretrained(checkpoint_name)

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Number of articles per year group (displayed as the cell output)
year_group_sizes = articles_full.groupby('Year_Group').size()
year_group_sizes.reset_index(name='count')

Unnamed: 0,Year_Group,count
0,1,5085
1,2,5764
2,3,7102
3,4,12175


In [16]:
# Fetch the CoNLL-2003 dataset.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally -- confirm this is acceptable for this source.
dataset = load_dataset(path="conll2003", trust_remote_code=True)

# Tokenization function (also aligns word-level NER tags to sub-word tokens)
def tokenize_function(examples):
    """Tokenize pre-split words and, when tags are present, align them to tokens.

    Parameters
    ----------
    examples : mapping
        A batch with a "tokens" entry (one list of words per example) and,
        optionally, an "ner_tags" entry (one tag id per word).

    Returns
    -------
    The tokenizer output (padded/truncated to 512 positions). If the batch
    carries "ner_tags", a "labels" entry is added with exactly one label per
    token position: the word's tag on the first sub-word of each word and
    -100 everywhere else (special tokens, padding, continuation sub-words).
    -100 is the index that PyTorch's cross-entropy loss ignores by default.

    Why: "ner_tags" has one entry per *word*, whereas the model emits one
    prediction per *token position*; without this alignment the label and
    logit sequences do not have matching lengths.
    """
    encodings = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
        max_length=512,
    )

    if "ner_tags" in examples:
        aligned_labels = []
        for row, word_tags in enumerate(examples["ner_tags"]):
            previous_word = None
            row_labels = []
            # word_ids() maps each token position to the index of the word it
            # came from (None for special/padding tokens); needs a fast tokenizer.
            for word_id in encodings.word_ids(batch_index=row):
                if word_id is None or word_id == previous_word:
                    row_labels.append(-100)
                else:
                    row_labels.append(word_tags[word_id])
                previous_word = word_id
            aligned_labels.append(row_labels)
        encodings["labels"] = aligned_labels

    return encodings

# Tokenize every split, one batch of examples at a time
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# Convert a tokenized split to a PyTorch-style dataset
class NERDataset(Dataset):
    """Row-indexable PyTorch view over a tokenized token-classification split.

    The wrapped object must support ``len()`` and integer indexing that
    returns one example as a mapping (column name -> value).

    Each item is a dict of tensors with "input_ids" and "attention_mask"
    (when present in the row) plus "labels". Labels come from the row's
    "labels" field when it exists and otherwise from "ner_tags".
    """

    # Model inputs copied from each row; everything else is dropped.
    _FEATURE_KEYS = ("input_ids", "attention_mask")

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Fetch the single row once instead of materializing whole columns
        # (and instead of relying on a dict-style ``.items()`` on the split).
        row = self.dataset[idx]
        item = {key: torch.tensor(row[key]) for key in self._FEATURE_KEYS if key in row}
        label_key = "labels" if "labels" in row else "ner_tags"
        item["labels"] = torch.tensor(row[label_key])  # Fine-tuning for token classification
        return item

# Wrap the train / validation splits for the Trainer
train_dataset, valid_dataset = (
    NERDataset(tokenized_datasets[split_name]) for split_name in ("train", "validation")
)

# Hyper-parameters and bookkeeping for the fine-tuning run
training_config = dict(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)
training_args = TrainingArguments(**training_config)

# Build the Trainer around the token-classification model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning
trainer.train()

# Persist model and tokenizer side by side in one directory
fine_tuned_dir = "xlm-roberta-large-finetuned-conll03"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

print("Fine-tuning complete. Model saved.")

# Load the saved weights again through AutoModel for embedding extraction
fine_tuned_model = AutoModel.from_pretrained(fine_tuned_dir)

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [5]:
# Function to generate one embedding vector per text, in mini-batches
def batch_get_embeddings(texts, model, tokenizer, batch_size=32):
    """Embed ``texts`` with ``model`` and return a 2-D NumPy array.

    Parameters
    ----------
    texts : sequence of str
        Texts to embed; any iterable is accepted and materialized as a list.
    model : torch.nn.Module
        Called as ``model(**inputs)``; its output must expose
        ``last_hidden_state``.
    tokenizer : callable
        Called with a list of texts plus padding/truncation options and
        expected to return a mapping of tensors.
    batch_size : int, default 32
        Number of texts per forward pass.

    Returns
    -------
    numpy.ndarray
        One row per text, taken from position 0 of ``last_hidden_state``.
        For empty input an array with zero rows is returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    if not texts:
        # np.vstack([]) would raise; return an empty matrix instead. The width
        # is taken from model.config.hidden_size when available, else 0.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Run the inputs on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Inference should not use dropout; switch to eval mode and restore the
    # caller's mode afterwards.
    was_training = model.training
    model.eval()

    embeddings = []
    try:
        # Process texts in batches
        for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
            batch_texts = texts[i:i + batch_size]

            # Tokenize batch
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            # Generate embeddings
            with torch.no_grad():
                outputs = model(**inputs)

            # Keep the hidden state of the first token of every sequence
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(batch_embeddings)
    finally:
        model.train(was_training)

    return np.vstack(embeddings)

In [None]:
# Select the articles of Year_Group "1". The explicit copy makes `df` an
# independent frame, so adding columns below does not write into a slice of
# articles_full.
df = articles_full[articles_full["Year_Group"] == "1"].copy()
df["Embeddings"] = list(batch_get_embeddings(df["Abstract_join"].tolist(), fine_tuned_model, tokenizer))

# Convert to NumPy array (one row per article)
embeddings_matrix = np.vstack(df["Embeddings"].values)

# Report as soon as the embeddings exist, before the dimensionality reduction
print("Embedding generation complete. Shape:", embeddings_matrix.shape)

# Reduce Dimensionality with UMAP
umap_model = umap.UMAP(n_components=2, random_state=42)
embeddings_2d = umap_model.fit_transform(embeddings_matrix)

Generating Embeddings:   2%|█                                                          | 3/159 [00:55<49:52, 19.18s/it]

In [None]:
#import umap.umap_ as umap
#umap_model = umap.UMAP(n_components=2, random_state=42)
#embeddings_2d = umap_model.fit_transform(embeddings_matrix)

In [None]:
# Function to plot the Within-Cluster Sum of Squares (WCSS) for different cluster sizes
def plot_elbow_method(embeddings_matrix, max_clusters=9):
    """Fit K-Means for k = 2 .. max_clusters and plot the inertia of each fit.

    Parameters
    ----------
    embeddings_matrix : array-like
        Data passed unchanged to ``KMeans.fit``.
    max_clusters : int, default 9
        Largest number of clusters to try (inclusive). Values below 2 yield
        an empty plot because no k is evaluated.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # The same k values drive the fits, the x-axis and the ticks, so the axis
    # never shows a k that was not evaluated.
    cluster_counts = list(range(2, max_clusters + 1))

    wcss = []  # WCSS (KMeans inertia_) for each number of clusters
    for k in cluster_counts:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(embeddings_matrix)
        wcss.append(kmeans.inertia_)  # Sum of squared distances to the closest centroid

    # Plot Elbow Graph
    plt.figure(figsize=(8, 5))
    plt.plot(cluster_counts, wcss, marker='o', linestyle='-', color='b')
    plt.xlabel("Number of Clusters")
    plt.ylabel("WCSS (Within-Cluster Sum of Squares)")
    plt.title("Elbow Method for Optimal k")
    plt.xticks(cluster_counts)
    plt.grid()
    plt.show()
    
# Draw the elbow curve for the 2-D projection to help choose the number of clusters
plot_elbow_method(embeddings_2d, max_clusters=9)

In [None]:
# Cluster the 2-D UMAP projection with K-Means
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(embeddings_2d)

# Keep each article's cluster label next to its other columns
df["Cluster"] = clusters

# Scatter the projection, coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    hue=df["Cluster"],
    palette="tab10",
    alpha=0.6,
    ax=ax,
)
ax.set_title("Word Clusters Based on Abstract Embeddings")
ax.set_xlabel("UMAP Dimension 1")
ax.set_ylabel("UMAP Dimension 2")
ax.legend(title="Cluster")
plt.show()

In [None]:
import umap.umap_ as umap

umap_3d = umap.UMAP(n_components=3, random_state=42)
embeddings_3d = umap_3d.fit_transform(embeddings_matrix)

# Set number of clusters
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(embeddings_3d)  # Use 3D embeddings

# Create 3D scatter plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot
scatter = ax.scatter(
    embeddings_3d[:, 0], embeddings_3d[:, 1], embeddings_3d[:, 2], 
    c=df["Cluster"], cmap="tab10", alpha=0.6
)

# Labels and Title
ax.set_title("Word Clusters Based on Abstract Embeddings (3D)")
ax.set_xlabel("UMAP Dimension 1")
ax.set_ylabel("UMAP Dimension 2")
ax.set_zlabel("UMAP Dimension 3")

# Add legend
legend1 = ax.legend(*scatter.legend_elements(), title="Cluster")
ax.add_artist(legend1)

# Show plot
plt.show()

In [None]:
# Write the Year_Group "1" articles, including their embeddings, to CSV
data_dir = Path(r"C:\Users\Hyemi\Python\TopicModeling\Data")
data_dir.mkdir(parents=True, exist_ok=True)

file_path = data_dir / "articles_embedding_1.csv"

# A CSV cell holding a NumPy array is written via str(array). NumPy abbreviates
# arrays longer than its print threshold (1000 elements by default) with "...",
# which would silently drop most of a long embedding vector. Serialize each
# vector as a complete Python list literal instead; the in-memory `df` is left
# untouched.
df_export = df.copy()
if "Embeddings" in df_export.columns:
    df_export["Embeddings"] = df_export["Embeddings"].apply(
        lambda vector: str(np.asarray(vector, dtype=float).tolist())
    )

df_export.to_csv(file_path, index=False)