# Naive Bayes with Bigrams

In [None]:
import pandas as pd

# Read the MARR book CSV (the export that also carries all the embeddings).
marr_df = pd.read_csv('local_path/marr_compl_with_embeddings.csv')

# The book text is taken from the 'text' cell of the first row only.
full_text = marr_df['text'].iloc[0]

# Plain whitespace tokenisation -- deliberately no NLTK here.
words = full_text.split()

# Number of words that go into each chunk.
chunk_size = 500

# Walk the word list in steps of `chunk_size` and glue every window back
# into a single space-separated string; the last chunk may be shorter.
chunk_starts = range(0, len(words), chunk_size)
chunks = [' '.join(words[start:start + chunk_size]) for start in chunk_starts]

# One row per chunk, plus placeholder labels that alternate 0, 1, 0, 1, ...
# by row position (dummy values for now).
chunked_df = pd.DataFrame({'text': chunks}).assign(
    label=lambda frame: [row % 2 for row in range(len(frame))]
)

print(chunked_df.head())
print(f"Total chunks created: {len(chunked_df)}")

# Persist the chunked frame so later cells can reload it from disk.
chunked_df.to_csv('local_path/marr_chunked_500.csv', index=False)


                                                text  label
0  Springer Texts in Statistics Series Editors: G...      0
1  plots and marginal model plots. The book conta...      1
2  and then overcome or deal with problems with a...      0
3  Brad Barney and Charles Lindsey, wrote the SAS...      1
4  ...... 36 2.7.4 Prediction Intervals for the A...      0
Total chunks created: 248


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Reload the chunked text (one row per 500-word chunk, with a 'label' column).
chunked_df = pd.read_csv('local_path/marr_chunked_500.csv')

print(chunked_df.head())

# Raw chunk text and the label stored alongside it.
X_text = chunked_df['text']
y = chunked_df['label']

# Split the RAW TEXT first. Fitting the vectorizer before the split would let
# the bigram vocabulary (and the max_features cut-off) be learned from the
# held-out chunks as well, leaking test information into the features.
# test_size / random_state are unchanged, so the same rows land in each split.
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42)

# Bigram count features: vocabulary is learned from the training chunks only,
# then the fitted vocabulary is applied unchanged to the test chunks.
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=5000)
X_train = vectorizer.fit_transform(X_text_train)
X_test = vectorizer.transform(X_text_test)

# Full-corpus matrix kept under its original name for anything that still
# refers to `X`; it uses the train-only vocabulary and is not used for fitting.
X = vectorizer.transform(X_text)

# Train Naive Bayes on the training counts.
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict on the held-out chunks.
y_pred = nb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


                                                text  label
0  Springer Texts in Statistics Series Editors: G...      0
1  plots and marginal model plots. The book conta...      1
2  and then overcome or deal with problems with a...      0
3  Brad Barney and Charles Lindsey, wrote the SAS...      1
4  ...... 36 2.7.4 Prediction Intervals for the A...      0
Accuracy: 0.36

Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.33      0.38        30
           1       0.29      0.40      0.33        20

    accuracy                           0.36        50
   macro avg       0.37      0.37      0.36        50
weighted avg       0.39      0.36      0.36        50



# Naive Bayes Classifying After LDA-BERT Pipeline
Here we first run the chunked dataset through the pipeline.

In [None]:
import pandas as pd
from pipeline import pipeline_final # adjust this import path if needed

# Load the chunked Marr data
chunked_df = pd.read_csv('local_path/marr_chunked_500.csv')

# Run the project pipeline on the chunk text. Its return value is used below
# as a mapping with the keys 'filtered_texts', 'topic_assignments' and
# 'embeddings'.
results = pipeline_final.pipeline(
    dataframe=chunked_df,
    text_column='text',
    n_topics=5,
    bert_model="all-MiniLM-L6-v2",
    threshold=0.3,
    top_n_words=5
)

# Per-chunk outputs next to the original text.
results_df = pd.DataFrame({
    'text': chunked_df['text'],
    'filtered_text': results['filtered_texts'],
    'topic': results['topic_assignments']
})

# Add each embedding dimension as a separate column.
# All columns are built first and attached with a single concat: inserting
# them one by one in a loop fragments the frame and makes pandas emit a
# PerformanceWarning for every column.
# `embeddings` is indexed as a 2-D array (rows = chunks, columns = dimensions).
embeddings = results['embeddings']
embedding_df = pd.DataFrame(
    {f'embedding_{i}': embeddings[:, i] for i in range(embeddings.shape[1])},
    index=results_df.index,
)
results_df = pd.concat([results_df, embedding_df], axis=1)

# Attach the existing labels (kept as the last column).
results_df['label'] = chunked_df['label']

# Save to CSV.
# NOTE(review): this previously wrote to
# 'local_path/ml-models-information-filtering/notebooks/...', while the
# classifier cell reads 'local_path/marr_chunked_500_pipelined.csv'. The write
# path now matches the reader so a fresh Run All uses this cell's output --
# confirm this is the intended location.
results_df.to_csv('local_path/marr_chunked_500_pipelined.csv', index=False)

print("Saved marr_chunked_500_pipelined.csv with embeddings per chunk.")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/haigbedros/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic 0:  regression | model | data | value | 10
Topic 1:  model | plot | figure | log | data
Topic 2:  log | model | figure | residuals | price
Topic 3:  regression | rss | aic | model | sum
Topic 4:  model | regression | 10 | residuals | data
Topic 0:  regression | model | data | value | 10
Topic 1:  model | plot | figure | log | data
Topic 2:  log | model | figure | residuals | price
Topic 3:  regression | rss | aic | model | sum
Topic 4:  model | regression | 10 | residuals | data
Topic 0:  regression | model | data | value | 10
Topic 1:  model | plot | figure | log | data
Topic 2:  log | model | figure | residuals | price
Topic 3:  regression | rss | aic | model | sum
Topic 4:  model | regression | 10 | residuals | data
Topic 0:  regression | model | data | value | 10
Topic 1:  model | plot | figure | log | data
Topic 2:  log | model | figure | residuals | price
Topic 3:  regression | rss | aic | model | sum
Topic 4:  model | regression | 10 | residuals | data
Topic 0:  regression

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.84s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.21it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.50it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 32.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.38it/s]
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.69it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 23.70it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 22.68it/s]
Batches: 1

✅ Saved marr_chunked_500_pipelined.csv with embeddings per chunk.



  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embedding_{i}'] = results['embeddings'][:, i]
  results_df[f'embeddin

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load the Marr chunked + pipelined data
df = pd.read_csv('local_path/marr_chunked_500_pipelined.csv')

# Confirm the columns
print(df.columns)

# Select the embedding columns (one column per embedding dimension).
embedding_cols = [col for col in df.columns if col.startswith('embedding_')]
if not embedding_cols:
    # Fail early with a clear message instead of fitting on an empty matrix.
    raise ValueError("No 'embedding_*' columns found in the pipelined CSV.")

# Features and labels.
# Use the 'label' column persisted with the embeddings rather than
# regenerating an alternating 0/1 pattern here: the values are the same for
# the current dummy labels, but real labels would otherwise be ignored.
X = df[embedding_cols]
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Logistic Regression classifier; max_iter raised to 1000 iterations.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Index(['text', 'filtered_text', 'topic', 'embedding_0', 'embedding_1',
       'embedding_2', 'embedding_3', 'embedding_4', 'embedding_5',
       'embedding_6',
       ...
       'embedding_375', 'embedding_376', 'embedding_377', 'embedding_378',
       'embedding_379', 'embedding_380', 'embedding_381', 'embedding_382',
       'embedding_383', 'label'],
      dtype='object', length=388)
Accuracy: 0.38

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.03      0.06        30
           1       0.38      0.90      0.54        20

    accuracy                           0.38        50
   macro avg       0.36      0.47      0.30        50
weighted avg       0.35      0.38      0.25        50

