# ELMR Book Through LDA-BERT Pipeline
This notebook puts the ELMR book through the LDA-BERT pipeline, generates compressed versions of the book and conducts cosine similarity

In [None]:
import importlib
from pipeline import pipeline_final

# Reload the pipeline module
importlib.reload(pipeline)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from pipeline.pipeline_final import pipeline
import numpy as np

# === MongoDB Connection ===
uri = "mongodb+srv://haigbedros:xqLlcSm2DP7VMGiF@cluster0.kfjsw.mongodb.net/?appName=Cluster0"
mongo_client = MongoClient(uri)

# === Access the Elmr Database and Collection ===
elmr_db = mongo_client["elmr"]
elmr_collection = elmr_db["elmr_book"]

# === Load the Elmr data into a pandas DataFrame ===
elmr_cursor = elmr_collection.find({}, {"text": 1, "_id": 0})  # Only pull 'text' field
elmr_df = pd.DataFrame(list(elmr_cursor))

# === Clean and Inspect Data ===
elmr_df = elmr_df.dropna(subset=['text'])  # Drop missing text rows
print(f"Number of documents: {len(elmr_df)}")
print(f"Columns: {elmr_df.columns.tolist()}")
print("\nFirst few rows:")
print(elmr_df.head())

# === Run the Pipeline ===
print("\nRunning pipeline...")
results = pipeline(
    dataframe=elmr_df,
    text_column='text',
    n_topics=5,        # Number of topics
    threshold=0.01,     # BERT similarity threshold
    top_n_words=10     # Top words per topic
)

# === Analyze Topic Assignments ===
print("\nAnalyzing topic assignments...")
topic_assignments = results['topic_assignments']
topic_counts = pd.Series(topic_assignments).value_counts().sort_index()
print("\nTopic distribution:")
print(topic_counts)

# === Top Words for Each Topic ===
print("\nTop words for each topic:")
lda_model = results['lda_model']
lda_model.print_topics(n_top_words=10)

# === Analyze BERT Similarity Results ===
print("\nAnalyzing BERT similarity results...")
query_results = results['query_results']

for query_id, result in query_results.items():
    print(f"\n{query_id}:")
    print(f"Number of relevant documents: {sum(result['relevance'])}")
    flat_sims = np.concatenate(result['similarities']) if result['similarities'] else np.array([])
    avg_sim = flat_sims.mean() if len(flat_sims) > 0 else 0.0
    print(f"Average similarity: {avg_sim:.4f}")

    # Plot similarity distribution (optional)
    plt.figure(figsize=(10, 6))
    sns.histplot(result['similarities'], bins=50)
    plt.title(f'Similarity Distribution for {query_id}')
    plt.xlabel('Similarity Score')
    plt.ylabel('Count')
    plt.close()

# === Create and Save Results DataFrame ===
print("\nCreating results DataFrame...")
results_df = pd.DataFrame({'text': results['filtered_texts']})
results_df['topic'] = results['topic_assignments']

# Add similarity scores
for query_id, result in query_results.items():
    results_df[f'{query_id}_similarity'] = 0.0
    results_df[f'{query_id}_relevant'] = False

    for i in range(len(results_df)):
        sims = np.array(result['similarities'][i])
        flags = np.array(result['relevance'][i])

        if np.any(flags):
            relevant_sims = sims[flags]
            avg_sim = relevant_sims.mean()
            is_relevant = True
        else:
            avg_sim = 0.0
            is_relevant = False

        results_df.at[i, f'{query_id}_similarity'] = avg_sim
        results_df.at[i, f'{query_id}_relevant'] = is_relevant

# Add embeddings
embeddings = results['embeddings']
for dim in range(embeddings.shape[1]):
    results_df[f'embedding_{dim}'] = embeddings[:, dim]

# Save results to CSV
results_df.to_csv('/Users/haigbedros/Desktop/MSDS/Capstone/CODE/ml-models-information-filtering/notebooks/new_adjusted_elmr_pipelined.csv', index=False)
print("\nResults saved to notebooks/adjusted_elmr_pipelined.csv")


Number of documents: 1
Columns: ['text']

First few rows:
                                                text
0  CHAPMAN & HALL/CRC \nTexts in Statistical Scie...

Running pipeline...
Topic 0:  model | data | linear | fit | response | models | effects | value | plot | test
Topic 1:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical
Topic 2:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical
Topic 3:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical
Topic 4:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical
Topic 0:  model | data | linear | fit | response | models | effects | value | plot | test
Topic 1:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical
Topic 2:  extras | chromosome | circumference

Batches: 100%|██████████| 157/157 [00:09<00:00, 15.74it/s]
Batches: 100%|██████████| 157/157 [00:06<00:00, 25.85it/s]
Batches: 100%|██████████| 157/157 [00:05<00:00, 29.59it/s]
Batches: 100%|██████████| 157/157 [00:05<00:00, 28.44it/s]
Batches: 100%|██████████| 157/157 [00:04<00:00, 33.51it/s]



Analyzing topic assignments...

Topic distribution:
0    1
Name: count, dtype: int64

Top words for each topic:
Topic 0:  model | data | linear | fit | response | models | effects | value | plot | test
Topic 1:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical
Topic 2:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical
Topic 3:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical
Topic 4:  extras | chromosome | circumference | circumstance | citations | city | clark | 01752 | classic | classical

Analyzing BERT similarity results...

query_0:
Number of relevant documents: [0 0 1 ... 1 1 0]
Average similarity: 0.0547

query_1:
Number of relevant documents: [0 0 1 ... 1 1 0]
Average similarity: 0.0547

query_2:
Number of relevant documents: [0 0 1 ... 1 1 0]
Average similarity: 0.0547

query_3:
Number of relevant doc

  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{dim}'] = embeddings[:, dim]
  results_df[f'embedding_{di

In [14]:
# === Word Count Comparison ===

# Total word count before filtering (raw MongoDB data)
raw_word_count = elmr_df['text'].dropna().apply(lambda x: len(x.split())).sum()
print(f"Total word count (raw elmr_df): {raw_word_count}")

# Total word count after BERT+LDA filtering
processed_word_count = results_df['text'].dropna().apply(lambda x: len(x.split())).sum()
print(f"Total word count (filtered results_df): {processed_word_count}")

# Word count difference
reduction = raw_word_count - processed_word_count
reduction_pct = (reduction / raw_word_count) * 100 if raw_word_count > 0 else 0

print(f"\nReduction: {reduction} words")
print(f"Reduction %: {reduction_pct:.2f}%")


Total word count (raw elmr_df): 97469
Total word count (filtered results_df): 56518

Reduction: 40951 words
Reduction %: 42.01%


In [15]:
import pandas as pd

# Load the processed ELMR DataFrame
processed_csv_path = "/Users/haigbedros/Desktop/MSDS/Capstone/CODE/ml-models-information-filtering/notebooks/new_adjusted_elmr_pipelined.csv"
processed_df = pd.read_csv(processed_csv_path)

# Check the DataFrame info
print("Processed ELMR DataFrame Info:")
print(processed_df.info())
print("\nFirst few rows:")
print(processed_df.head())

# Check if embedding columns exist
embedding_columns = [col for col in processed_df.columns if col.startswith('embedding')]
print("\nEmbedding Columns Found:")
print(embedding_columns)

# Quick check how many embeddings
print(f"\nTotal number of embedding columns: {len(embedding_columns)}")


Processed ELMR DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Columns: 396 entries, text to embedding_383
dtypes: bool(5), float64(389), int64(1), object(1)
memory usage: 3.2+ KB
None

First few rows:
                                                text  topic  \
0  “ to purchase your own copy of this or any of ...      0   

   query_0_similarity  query_0_relevant  query_1_similarity  query_1_relevant  \
0            0.085992              True            0.085992              True   

   query_2_similarity  query_2_relevant  query_3_similarity  query_3_relevant  \
0            0.085992              True            0.085992              True   

   ...  embedding_374  embedding_375  embedding_376  embedding_377  \
0  ...       0.016137      -0.000208       0.004917        0.00203   

   embedding_378  embedding_379  embedding_380  embedding_381  embedding_382  \
0      -0.021563      -0.004101       0.041452       -0.01009      -0.012337   

   embe

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

# Load files
processed_df = pd.read_csv("/Users/haigbedros/Desktop/MSDS/Capstone/CODE/ml-models-information-filtering/notebooks/elmr_compl_with_embeddings.csv")
raw_df = pd.read_csv("/Users/haigbedros/Desktop/MSDS/Capstone/CODE/ml-models-information-filtering/notebooks/new_adjusted_elmr_pipelined.csv")

# Extract embedding columns
processed_embeddings = processed_df[[col for col in processed_df.columns if col.startswith('embedding')]].values
raw_embeddings = raw_df[[col for col in raw_df.columns if col.startswith('embedding')]].values

# Normalize (important for cosine similarity)
processed_embeddings_norm = normalize(processed_embeddings)
raw_embeddings_norm = normalize(raw_embeddings)

# Compute cosine similarity (1 processed vs all raw)
cosine_similarities = cosine_similarity(processed_embeddings_norm, raw_embeddings_norm)

# Flatten result (because processed is 1 row)
cosine_similarities = cosine_similarities.flatten()

# Top 5 most similar
top_5_indices = np.argsort(cosine_similarities)[::-1][:5]
print("Top 5 matching indices in raw set:", top_5_indices)
print("Top 5 cosine similarity scores:", cosine_similarities[top_5_indices])


Top 5 matching indices in raw set: [0]
Top 5 cosine similarity scores: [0.36831485]
