In [1]:
### import importlib
import sys
import os
import nltk
import pandas as pd
import networkx as nx
from textblob import TextBlob

# Add the 'src' directory to the system path
sys.path.append(os.path.abspath('../src'))
src_path = os.path.abspath('../src')

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Import classes from the modules using their correct filenames
from DataLoaderClass import DataLoader

In [3]:
# Initialize paths
BIB_FILE_PATH = '../examples/EX1_POWER_SYSTEM_FPGA_FREQUENCY_ESTIMATORS/index.bib'
PDF_FOLDER_PATH = '../examples/EX1_POWER_SYSTEM_FPGA_FREQUENCY_ESTIMATORS/files'

In [4]:
# Step 1: Data Loading and Processing
loader = DataLoader(BIB_FILE_PATH, PDF_FOLDER_PATH)
processed_data = loader.load_and_process()


Matching process completed.
Total references matched: 31 out of 31
Unmatched References: 0

Unmatched PDF Folders: 0


In [5]:
# -------------------------------------------------------------- #
# -- EDA M4 :: Quotes & Cites ---------------------------------- #
# -------------------------------------------------------------- #
from eda.m4_quotes_analysis import Processor
from eda.m4_quotes_analysis import Visualizer
from eda.m4_quotes_analysis import Reporter
# -------------------------------------------------------------- #

# Data
data = processed_data

# Process data
processor = Processor(data)
visualizer = Visualizer()
reporter = Reporter()

In [6]:
#################################################################################
### Function 1 => Most Frequent Quotes (Table & Barplot)
#################################################################################

In [49]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import pickle

# Load spaCy language model
nlp = spacy.load('en_core_web_sm')

# Define the text cleaning function
def clean_text(text):
    """
    Clean the text by removing unwanted characters or patterns.
    """
    import re
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    text = re.sub(r'\W+', ' ', text)  # Remove non-alphanumeric characters
    return text.strip()

# Function to extract quotes using spaCy
def extract_quotes_spacy(text, min_length=10, max_length=300):
    """
    Extract quotes using spaCy's sentence segmentation and filter by length.
    """
    doc = nlp(text)
    quotes = []
    for sent in doc.sents:
        sentence_text = sent.text.strip()
        if min_length <= len(sentence_text) <= max_length:
            quotes.append(sentence_text)
    return quotes

# Function to cluster sentences and save the model
def do_cluster_sentences(sentences, n_clusters=5, save_model=True):
    """
    Cluster sentences into n_clusters using TF-IDF and KMeans. Save the model if required.
    """
    if not sentences:
        raise ValueError("The sentences list is empty. Provide valid input.")
    
    vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
    X = vectorizer.fit_transform(sentences)
    
    model = KMeans(n_clusters=n_clusters, random_state=42)
    model.fit(X)
    
    # Save the model and vectorizer
    if save_model:
        with open("tfidf_vectorizer.pkl", "wb") as f:
            pickle.dump(vectorizer, f)
        with open("kmeans_model.pkl", "wb") as f:
            pickle.dump(model, f)
    
    clusters = {i: [] for i in range(n_clusters)}
    for idx, label in enumerate(model.labels_):
        clusters[label].append(sentences[idx])
    
    return clusters, model, vectorizer

# Function to predict the cluster for a new quote
def predict_quote_cluster(quote, model, vectorizer):
    """
    Predict the cluster for a new quote using the trained model and vectorizer.
    """
    quote_vector = vectorizer.transform([quote])
    cluster_id = model.predict(quote_vector)[0]
    return cluster_id

# Example Usage
all_quotes = []

# Extract quotes
for entry in processed_data[:5]:  # Analyze the first 5 entries
    raw_text = entry.get("plain_text", "")
    cleaned_text = clean_text(raw_text)
    quotes = extract_quotes_spacy(cleaned_text)
    all_quotes.extend(quotes)

# Perform clustering and save results
clusters, kmeans_model, tfidf_vectorizer = do_cluster_sentences(all_quotes, n_clusters=3)

# Save clusters to a CSV file
csv_data = []
for cluster_id, cluster_sentences in clusters.items():
    for sentence in cluster_sentences:
        csv_data.append({"Cluster": cluster_id, "Quote": sentence})

df = pd.DataFrame(csv_data)
df.to_csv("clusters.csv", index=False)

# Predict cluster for a new quote
new_quote = "FPGA Kalman"
predicted_cluster = predict_quote_cluster(new_quote, kmeans_model, tfidf_vectorizer)
similar_quotes = clusters[predicted_cluster]


New Quote belongs to Cluster 1
Similar Quotes in this Cluster: ['xxxiiipart', '1 machine rotor speeds 2278 2 center of inertia 2478 3 applications of the rocop 2569 power system model 2619 1 introduction', 'i e v v θ v', 'i e v v jvdq', 'ϑ t 1 4 dt dt2to properly interpret 1 2 is not always trivial if the angular frequency isconstant the definition is also consistent with the intuition in fact if ϑ is ϑ t ω t θ 1 5 o owhere ω']


In [50]:

print(" ")
print(" new_quote ")
print(new_quote)
print(" ")
print(f"New Quote belongs to Cluster {predicted_cluster}")
print(f"Similar Quotes in this Cluster: {similar_quotes[:5]}")

 
 new_quote 
The frequency variations in power systems require dynamic control.
 
New Quote belongs to Cluster 1
Similar Quotes in this Cluster: ['xxxiiipart', '1 machine rotor speeds 2278 2 center of inertia 2478 3 applications of the rocop 2569 power system model 2619 1 introduction', 'i e v v θ v', 'i e v v jvdq', 'ϑ t 1 4 dt dt2to properly interpret 1 2 is not always trivial if the angular frequency isconstant the definition is also consistent with the intuition in fact if ϑ is ϑ t ω t θ 1 5 o owhere ω']


In [8]:
#################################################################################
### Function 2 => Quotes by Year (Table & Line Chart)
#################################################################################

### **Function 2: Quotes by Year**

#### **Overview**:
This analysis examines how quotes are distributed over time, providing insights into the evolution of popular ideas and recurring themes in the dataset. By grouping quotes by year and counting their occurrences, we can track the rise or decline of specific quotes.

#### **Interpretation**:
- **Emerging Quotes**: Quotes that appear frequently in recent years may indicate emerging ideas or trends in the field.
- **Declining Quotes**: Quotes that were more common in earlier years but less so now might represent outdated concepts or fading interest.
- **Consistent Quotes**: Quotes with steady usage across years may highlight foundational or widely accepted principles.

In [9]:
# Step 1: Analyze quotes by year
quotes_by_year = processor.get_quotes_by_year()

# Step 2: Visualize the trends of quotes over time
Visualizer.plot_quote_trends(
    df=quotes_by_year,
    title="Quote Trends Over Time",
    filename="quote_trends"
)

# Step 3: Save the table
Reporter.save_to_csv(quotes_by_year, "quotes_by_year")

# Step 4: Display the table
quotes_by_year.head(10)

KeyError: 'Quote'

In [None]:
#################################################################################
### Function 3 => Quotes Context Analysis (Table)
#################################################################################

### **Function 3: Quotes Context Analysis**

#### **Overview**:
This analysis extracts the context in which specific quotes appear, including sentences or paragraphs before and after the quote. It helps understand how quotes are used and their relevance to the surrounding discussion.

#### **Interpretation**:
- **Contextual Insights**: Analyze the sentences around a quote to better understand its role in the text.
- **Relevance**: Identify how quotes are used to support arguments or highlight key ideas.
- **Metadata**: Connect quotes to their associated metadata, such as title, authors, and year, for a richer understanding.

In [None]:


# Step 1: Define quotes to analyze
quotes_to_analyze = [
    "Frequency is a challenging parameter to estimate.",
    "Power systems require stability and robustness."
]

# Step 2: Extract contexts for selected quotes
quote_contexts = processor.get_quote_contexts(quotes_to_analyze, context_window=2)

# Step 3: Save the table
Reporter.save_to_csv(quote_contexts, "quote_contexts")

# Step 4: Display the table
quote_contexts.head(10)

In [None]:
#################################################################################
### Sentiment Analysis of Quote Contexts
#################################################################################


In [None]:
# Step 1: Extract contexts for selected quotes
contexts = processor.get_quote_contexts(
    quotes_to_analyze=["Frequency is a challenging parameter to estimate."],
    context_window=2
)

# Step 2: Analyze sentiment of the contexts
sentiment_df = processor.analyze_sentiment_of_contexts(contexts)

# Step 3: Visualize sentiment distribution
Visualizer.plot_sentiment_distribution(
    df=sentiment_df,
    title="Sentiment Distribution of Quote Contexts",
    filename="quote_sentiment_distribution"
)

# Step 4: Save the table
Reporter.save_to_csv(sentiment_df, "quote_sentiment_analysis")

# Step 5: Display the sentiment analysis table
sentiment_df.head(10)

In [None]:
#################################################################################
### Theme Extraction from Quote Contexts
#################################################################################

In [None]:
# Step 1: Extract contexts for selected quotes
contexts = processor.get_quote_contexts(
    quotes_to_analyze=["Frequency is a challenging parameter to estimate."],
    context_window=2
)

# Step 2: Extract themes (keywords) from the contexts
themes_df = processor.extract_themes_from_contexts(contexts, top_n=10)

# Step 3: Visualize themes with a word cloud
Visualizer.plot_word_cloud_from_themes(
    df=themes_df,
    title="Themes from Quote Contexts",
    filename="quote_context_themes"
)

# Step 4: Save the table
Reporter.save_to_csv(themes_df, "quote_context_themes")

# Step 5: Display the themes table
themes_df


In [None]:
themes_df