In [1]:
### import importlib
# --- Standard library ---
import os
import re
import sys

# --- Third-party ---
import networkx as nx
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from textblob import TextBlob

# NLTK resources used by this notebook. Each package is requested exactly once
# (the previous version downloaded 'punkt' and 'stopwords' twice).
for nltk_package in ('words', 'stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_package)

# English word list used later for vocabulary-based word splitting.
english_vocab = set(words.words())

# spaCy pipeline used for lemmatization.
nlp = spacy.load("en_core_web_sm")

# Add the 'src' directory to the system path.
# The membership check keeps sys.path from growing when the cell is re-run.
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    sys.path.append(src_path)

[nltk_data] Downloading package words to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jorge.mayorga/nltk_data...
[nltk_data]   Package stopwords i

True

In [2]:
# Import classes from the modules using their correct filenames
from DataLoaderClass import DataLoader

In [3]:
# Initialize paths
# Both paths are relative to the notebook's working directory and are passed
# to DataLoader in the next cell.
# BibTeX index for the example project.
BIB_FILE_PATH = '../examples/EX2_POWER_SYSTEM_FPGA_FREQUENCY_ESTIMATORS/index.bib'
# Folder handed to DataLoader together with the .bib file
# (presumably one sub-folder of PDFs per reference — confirm against DataLoader).
PDF_FOLDER_PATH = '../examples/EX2_POWER_SYSTEM_FPGA_FREQUENCY_ESTIMATORS/files'

In [4]:
# Step 1: Data Loading and Processing
# Build the loader from the two paths configured above.
loader = DataLoader(BIB_FILE_PATH, PDF_FOLDER_PATH)
# `processed_data` is consumed by the EDA cell below (as `data`) and is later
# reassigned by the document-processing cell.
# NOTE(review): downstream code treats it as a list of dicts with
# 'plain_text' and 'bibliographic_metadata' keys — confirm against DataLoader.
processed_data = loader.load_and_process()


Matching process completed.
Total references matched: 53 out of 53
Unmatched References: 0

Unmatched PDF Folders: 0


In [5]:
# -------------------------------------------------------------- #
# -- EDA M4 :: Quotes & Cites ---------------------------------- #
# -------------------------------------------------------------- #
from eda.m4_quotes_analysis import Processor, Reporter, Visualizer
# -------------------------------------------------------------- #

# Alias for the loaded documents; later cells read `data`.
data = processed_data

# One instance of each EDA helper: the processor wraps the data,
# the visualizer and reporter are created without arguments.
processor = Processor(data)
visualizer = Visualizer()
reporter = Reporter()

In [6]:
#################################################################################
### Function 1 => Most Frequent Quotes (Table & Barplot)
#################################################################################

In [7]:
def split_long_word_with_vocab(word, vocabulary):
    """
    Greedily break a (possibly concatenated) word into vocabulary entries.

    The word is lowercased, then repeatedly stripped of the longest prefix
    that is contained in ``vocabulary``. As soon as no prefix of the remaining
    text is in the vocabulary, that remainder is emitted unchanged and the
    split stops.

    Args:
        word (str): Word to split.
        vocabulary: Container supporting ``in`` for string lookups.

    Returns:
        list[str]: The pieces in order; an empty list for an empty word.
    """
    pieces = []
    remaining = word.lower()
    while remaining:
        # Longest-first scan over all non-empty prefixes of the remainder.
        prefix = next(
            (
                remaining[:end]
                for end in range(len(remaining), 0, -1)
                if remaining[:end] in vocabulary
            ),
            None,
        )
        if prefix is None:
            # Nothing matches: keep the leftover text as a single piece.
            pieces.append(remaining)
            break
        pieces.append(prefix)
        remaining = remaining[len(prefix):]
    return pieces

def clean_and_split_text_with_vocab(plain_text, vocabulary):
    """
    Turn raw text into a list of lemmas.

    Pipeline:
      1. Lowercase and replace every non-letter, non-whitespace character
         with a space.
      2. Tokenize with NLTK and drop English stop words and 1-character tokens.
      3. Tokens longer than 10 characters are treated as concatenations and
         split with ``split_long_word_with_vocab``.
      4. Lemmatize with the module-level spaCy pipeline ``nlp``, keeping only
         alphabetic, non-stop tokens.
      5. Drop URL/DOI noise lemmas.

    Args:
        plain_text (str): Text to clean.
        vocabulary: Container of known words used for splitting.

    Returns:
        list[str]: Lemmas in document order.
    """
    # Step 1: letters and whitespace only, all lowercase.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', plain_text.lower())

    # Step 2: tokenize, then filter stop words and single characters.
    english_stops = set(stopwords.words('english'))
    kept_tokens = [
        tok
        for tok in word_tokenize(letters_only)
        if len(tok) > 1 and tok not in english_stops
    ]

    # Step 3: long tokens are assumed to be several words glued together.
    pieces = []
    for tok in kept_tokens:
        if len(tok) <= 10:
            pieces.append(tok)
        else:
            pieces.extend(split_long_word_with_vocab(tok, vocabulary))

    # Steps 4 and 5: lemmatize with spaCy, then filter the noise lemmas.
    noise_words = {"doi", "org", "http", "https", "www"}
    lemmas = [
        tok.lemma_
        for tok in nlp(" ".join(pieces))
        if tok.is_alpha and not tok.is_stop
    ]
    return [lemma for lemma in lemmas if lemma not in noise_words]

def process_document(doc, vocabulary):
    """
    Process a single document in place.

    - Cleans and tokenizes ``doc['plain_text']`` into ``doc['words']``.
    - Splits ``doc['bibliographic_metadata']['author']`` into ``doc['authors']``.

    The author string is split on commas and on the literal " and "; each
    fragment is stripped. Fragments separated by a comma therefore end up as
    separate entries. Empty fragments are dropped, so a missing or empty
    author field yields ``[]`` instead of ``['']``.

    Args:
        doc (dict): Document record. It is mutated and also returned.
        vocabulary: Container of known words, forwarded to
            ``clean_and_split_text_with_vocab``.

    Returns:
        dict: The same ``doc`` object with 'words' and 'authors' added.
    """
    # `or ''` also covers a key that is present but holds None.
    plain_text = doc.get('plain_text') or ''
    # Named `tokens` to avoid shadowing the imported nltk `words` corpus.
    tokens = clean_and_split_text_with_vocab(plain_text, vocabulary)

    # Tolerate a missing/None metadata dict and a missing/None author entry.
    metadata = doc.get('bibliographic_metadata') or {}
    authors_raw = metadata.get('author') or ''
    authors = [
        name
        for part in authors_raw.split(",")
        for name in (piece.strip() for piece in part.split(" and "))
        if name  # skip empty fragments (empty field, trailing comma, ...)
    ]

    # Add the processed data to the document
    doc['words'] = tokens
    doc['authors'] = authors

    return doc


def process_documents(data, vocabulary):
    """
    Run ``process_document`` over every document in ``data``.

    Args:
        data: Iterable of document records.
        vocabulary: Container of known words, passed to each call.

    Returns:
        list: One result per input document, in input order.
    """
    results = []
    for document in data:
        results.append(process_document(document, vocabulary))
    return results

In [None]:
# Process the documents
# Keep the vocabulary as a set: split_long_word_with_vocab performs one
# `substring in vocabulary` check per candidate prefix, and a list would make
# each of those checks a linear scan over the whole word list.
no_one_letter_english_vocab = {word for word in english_vocab if len(word) > 1}
processed_data = process_documents(data, no_one_letter_english_vocab)

In [None]:
def flatten_metadata(docs):
    """
    Flatten the bibliographic_metadata into individual columns for each document.

    For every document a new dict is built that contains all top-level keys
    except 'bibliographic_metadata', plus every key of that metadata dict.
    When a key exists in both places, the metadata value wins.

    The input documents are not modified, so the function can be called
    repeatedly on the same data with the same result.

    Args:
        docs: Iterable of document dicts.

    Returns:
        list[dict]: One flattened dict per input document, in input order.
    """
    flattened_docs = []
    for doc in docs:
        # A missing or None metadata entry is treated as "no metadata".
        metadata = doc.get("bibliographic_metadata") or {}
        # Copy the top-level fields instead of popping from the caller's dict.
        top_level = {
            key: value
            for key, value in doc.items()
            if key != "bibliographic_metadata"
        }
        flattened_docs.append({**top_level, **metadata})  # metadata overrides
    return flattened_docs

# Flatten the metadata in processed_data
# `flattened_data` is a list of dicts: each document's own keys merged with
# the keys of its 'bibliographic_metadata' dict.
flattened_data = flatten_metadata(processed_data)

# Convert to a pandas DataFrame for tabular representation
# (pandas is already imported as `pd` in the first cell; this re-import is
# redundant but harmless.)
import pandas as pd
# One row per document, one column per distinct key found in the dicts.
df = pd.DataFrame(flattened_data)


In [None]:
# Tabular view of the flattened documents (one row per document).
df = pd.DataFrame(flattened_data)

# Expand the list-valued columns so every (author, word) pairing of a
# document becomes its own row: first by author, then by word.
exploded_df = df.explode("authors")
exploded_df = exploded_df.explode("words")

# Number of rows per (author, word) pair, as a long table with a "count" column.
pair_groups = exploded_df.groupby(["authors", "words"])
co_occurrence_counts = pair_groups.size().reset_index(name="count")

# Wide form: authors as rows, words as columns, 0 where a pair never occurs.
co_occurrence_matrix = co_occurrence_counts.pivot_table(
    values="count",
    index="authors",
    columns="words",
    fill_value=0,
)

In [None]:
# Recommend related words based on co-occurrence
def recommend_words(author_name, input_word, co_occurrence_matrix, top_n=5):
    """
    Recommend words related to an input word for a given author.

    Every word is represented by its column of the co-occurrence matrix,
    i.e. its counts across all authors. The input word's vector is compared
    with every word vector using cosine similarity. Only words that the given
    author has a non-zero count for are kept as candidates, and the input
    word itself is excluded.

    The previous implementation compared the author's row (length n_words)
    with the transposed matrix (rows of length n_authors); those shapes do not
    share a feature dimension, and ``input_word`` did not influence the scores.

    Args:
        author_name (str): Name of the author (row label of the matrix).
        input_word (str): The word for which to find related words
            (column label of the matrix).
        co_occurrence_matrix (pd.DataFrame): Authors x words count matrix.
        top_n (int): Number of related words to return.

    Returns:
        pd.Series: Up to ``top_n`` words used by the author, indexed by word,
        with their cosine similarity to ``input_word``, highest first.

    Raises:
        ValueError: If the author or the word is not in the matrix.
    """
    # Ensure the author exists
    if author_name not in co_occurrence_matrix.index:
        raise ValueError(f"Author '{author_name}' not found in co-occurrence matrix.")

    # Ensure the word exists
    if input_word not in co_occurrence_matrix.columns:
        raise ValueError(f"Word '{input_word}' not found in co-occurrence matrix.")

    # Rows = words, columns = authors, so every word vector has n_authors entries.
    word_vectors = co_occurrence_matrix.T
    # Double brackets keep the selection 2-D: shape (1, n_authors).
    target_vector = word_vectors.loc[[input_word]].values

    # Similarity of the input word to every word; both operands now share the
    # same feature dimension (n_authors).
    similarity_scores = cosine_similarity(target_vector, word_vectors.values).flatten()
    related_words = pd.Series(similarity_scores, index=co_occurrence_matrix.columns)

    # Make the recommendation author-specific: keep only words this author used.
    used_by_author = (co_occurrence_matrix.loc[author_name] > 0).values
    candidates = related_words[used_by_author]

    # Exclude the input word itself (it may already be absent if the author
    # never used it) and return the top N related words. A stable sort keeps
    # tied words in matrix column order.
    candidates = candidates.drop(index=input_word, errors="ignore")
    return candidates.sort_values(ascending=False, kind="stable").head(top_n)


In [None]:
# Example usage
# Both values must exist in the matrix: recommend_words raises ValueError when
# the author is not a row label or the word is not a column label.
author_name = "Chughtai"
input_word = "frequency"
# Uses the default top_n=5.
recommended_words = recommend_words(author_name, input_word, co_occurrence_matrix)
print(recommended_words)

In [None]:
# Features: the author x word co-occurrence matrix itself.
X = co_occurrence_matrix
# Target: for each author, the column label (word) with the highest count.
# Simplified example — note that the target is derived directly from X.
y = X.idxmax(axis=1)

# Hold out 20% of the authors for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a Random Forest on the training split.
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

# Mean accuracy on the held-out split.
score = clf.score(X_test, y_test)
print(f"Model Accuracy: {score:.2f}")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Author-to-author cosine similarity; row/column order follows
# co_occurrence_matrix.index.
author_similarity = cosine_similarity(co_occurrence_matrix)

# NOTE: this definition replaces the earlier
# recommend_words(author_name, input_word, co_occurrence_matrix, top_n)
# from this point of the notebook on.
def recommend_words(author_name, top_n=5):
    """
    Recommend words for an author from the most similar authors' word counts.

    The ``top_n`` authors with the highest similarity to ``author_name`` are
    selected, their rows of the co-occurrence matrix are summed, and the
    ``top_n`` words with the largest totals are returned. ``top_n`` therefore
    controls both the number of authors pooled and the number of words returned.

    Reads the module-level ``co_occurrence_matrix`` and ``author_similarity``.

    Args:
        author_name (str): Row label of the author in the matrix.
        top_n (int): Number of similar authors to pool and of words to return.

    Returns:
        pd.Series: Summed counts of the top words, highest first.
    """
    row_position = co_occurrence_matrix.index.get_loc(author_name)
    # Negating the scores makes argsort return positions in descending order.
    ranked_authors = np.argsort(-author_similarity[row_position])
    nearest_authors = ranked_authors[:top_n]
    pooled_counts = co_occurrence_matrix.iloc[nearest_authors].sum(axis=0)
    ranked_words = pooled_counts.sort_values(ascending=False)
    return ranked_words.head(top_n)

# Example usage
print(recommend_words("Chughtai"))
