In [None]:
# %pip install tensorflow  # uncomment to install TensorFlow into this kernel's environment

In [None]:
import numpy as np
import pandas as pd
import gensim
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import CoherenceModel
import nltk
# Fetch the NLTK data used by the preprocessing below: tokenizer models for
# word_tokenize and the English stopword list.
nltk.download('punkt')
# Newer NLTK releases load the word_tokenize data from the 'punkt_tab' resource
# rather than 'punkt'; requesting both keeps tokenization working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the input table; the 'Body' column is the text that gets modelled.
df = pd.read_csv('sectionized_data.csv')

# Preprocessing function
def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        # Memoised on the function object so later calls skip the corpus read.
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Tokenize ``text`` and keep its lowercase, alphanumeric, non-stopword tokens.

    Args:
        text: The raw document text (a ``str``).

    Returns:
        list[str]: Tokens in their original order after lowercasing, dropping
        every token that is not purely alphanumeric (punctuation, tokens with
        apostrophes or hyphens, ...) and dropping English stopwords.
    """
    stop_words = _english_stop_words()
    kept_tokens = []
    for raw_token in word_tokenize(text):
        token = raw_token.lower()
        # Same filter order as before: alphanumeric check first, then stopwords.
        if token.isalnum() and token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

# Apply preprocessing to each document in df['Body'].
# Missing bodies are replaced by '' first: astype(str) alone would turn NaN into
# the literal text 'nan', which would then be tokenized as a real word.
df['Body'] = df['Body'].fillna('').astype(str)
processed_docs = df['Body'].apply(preprocess_text)

# Create TaggedDocument objects for Doc2Vec; each document is tagged with its
# position in processed_docs, as a string.
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(processed_docs)]

# Train Doc2Vec model (100-dimensional document vectors)
doc2vec_model = gensim.models.Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Extract document embeddings. gensim 4 exposes the document vectors as `dv`;
# `docvecs` is the older, now-deprecated name and is used only as a fallback.
doc_vector_store = doc2vec_model.dv if hasattr(doc2vec_model, 'dv') else doc2vec_model.docvecs
document_embeddings = [doc_vector_store[str(i)] for i in range(len(tagged_data))]

# Stack the per-document embeddings into one array, one row per document
document_vectors = np.array(document_embeddings)

# Define the NTM-GloVe model using TensorFlow
class NTM_GloVe_TensorFlow:
    """Two-layer dense network mapping a document vector to a topic distribution.

    The network is ``Dense(hidden_units, relu) -> Dense(num_topics, softmax)``,
    compiled with Adam and categorical cross-entropy. Despite the name, nothing
    here is GloVe-specific: any fixed-width document vector works (this notebook
    feeds it Doc2Vec vectors).

    Args:
        num_topics: Width of the softmax output layer.
        input_dim: Width of the input vectors. Defaults to 100; it must match
            the width of the vectors passed to ``fit``/``transform``.
        hidden_units: Width of the hidden ReLU layer. Defaults to 512.
    """

    def __init__(self, num_topics, input_dim=100, hidden_units=512):
        self.num_topics = num_topics
        self.input_dim = input_dim
        self.hidden_units = hidden_units
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Keras model described in the class docstring."""
        model = Sequential([
            Dense(self.hidden_units, activation='relu', input_shape=(self.input_dim,)),
            Dense(self.num_topics, activation='softmax')
        ])
        model.compile(optimizer='adam', loss='categorical_crossentropy')
        return model

    def fit(self, X, y, epochs=10, batch_size=32):
        """Train the network on inputs ``X`` against one-hot targets ``y``.

        Args:
            X: Input vectors, one row per document.
            y: Target rows with ``num_topics`` columns.
            epochs: Number of training epochs (default 10).
            batch_size: Mini-batch size (default 32).
        """
        self.model.fit(X, y, epochs=epochs, batch_size=batch_size)

    def transform(self, X):
        """Return the model's softmax output for each row of ``X``."""
        return self.model.predict(X)

# Function to compute coherence score for a given number of topics
def _top_words_per_topic(topic_assignments, texts, num_topics, top_n=10):
    """Return the ``top_n`` most frequent tokens of the documents assigned to each topic."""
    from collections import Counter

    word_counts = [Counter() for _ in range(num_topics)]
    for topic_id, tokens in zip(topic_assignments, texts):
        word_counts[int(topic_id)].update(tokens)
    # Topics that received no tokens are dropped: an empty word list cannot be scored.
    return [
        [word for word, _ in counts.most_common(top_n)]
        for counts in word_counts
        if counts
    ]


def compute_coherence_score(num_topics, document_vectors, texts=None, dictionary=None, top_n=10):
    """Train a topic network with ``num_topics`` outputs and score its topics with c_v coherence.

    Each document is assigned to the topic with the highest predicted
    probability; a topic is then represented by the ``top_n`` most frequent
    tokens of its documents. These word lists are what ``CoherenceModel``
    receives as ``topics``. (The earlier version passed per-document one-hot
    rows of floats there, which are not word lists.)

    Args:
        num_topics: Number of output topics of the network.
        document_vectors: 2-D array with one embedding row per document.
        texts: Tokenized documents aligned with ``document_vectors``. Defaults
            to the module-level ``processed_docs``.
        dictionary: gensim ``Dictionary`` for ``texts``; built from ``texts``
            when omitted.
        top_n: Maximum number of words used to describe each topic.

    Returns:
        float: The c_v coherence reported by gensim's ``CoherenceModel``.

    Raises:
        ValueError: If no topic ends up with any word (e.g. all documents are empty).
    """
    if texts is None:
        texts = processed_docs
    texts = list(texts)

    # NOTE(review): the training targets are random topic ids, so the network has
    # no real signal to learn from. Kept as in the original pipeline; replacing
    # it needs a different (unsupervised) training objective.
    random_topic_ids = np.random.choice(num_topics, document_vectors.shape[0])
    random_targets = np.eye(num_topics)[random_topic_ids]

    # Train the topic network and get a topic distribution per document
    topic_network = NTM_GloVe_TensorFlow(num_topics)
    topic_network.fit(document_vectors, random_targets)
    document_topic_distributions = topic_network.transform(document_vectors)

    # Hard-assign every document to its most probable topic
    topic_assignments = np.argmax(document_topic_distributions, axis=1)

    topic_words = _top_words_per_topic(topic_assignments, texts, num_topics, top_n)
    if not topic_words:
        raise ValueError("No topic contains any word; cannot compute coherence.")

    if dictionary is None:
        dictionary = gensim.corpora.Dictionary(texts)

    coherence_model = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

# Inclusive bounds of the topic-count sweep
min_topics = 2
max_topics = 100

# Coherence score of every candidate topic count, in sweep order
coherence_scores = []
candidate_topic_counts = range(min_topics, max_topics + 1)
for num_topics in candidate_topic_counts:
    coherence_score = compute_coherence_score(num_topics, document_vectors)
    coherence_scores.append(coherence_score)

# The first candidate reaching the highest coherence wins
best_score = max(coherence_scores)
optimal_num_topics = min_topics + coherence_scores.index(best_score)

print("Optimal number of topics:", optimal_num_topics)

# Draw one random topic id per document and one-hot encode it as training target
random_topic_ids = np.random.choice(optimal_num_topics, document_vectors.shape[0])
document_vectors_onehot = np.eye(optimal_num_topics)[random_topic_ids]

# Fit the final network with the selected topic count
ntm_glove_tf_model = NTM_GloVe_TensorFlow(optimal_num_topics)
ntm_glove_tf_model.fit(document_vectors, document_vectors_onehot)

# Predicted topic distribution for every document
document_topic_distributions = ntm_glove_tf_model.transform(document_vectors)

# Show each document's distribution (documents are numbered from 1)
for i, doc_topic_distribution in enumerate(document_topic_distributions):
    print(f"Topic Distribution for Document {i + 1}:")
    print(doc_topic_distribution)


In [None]:
import matplotlib.pyplot as plt

# Draw one bar chart per document: topic index on x, predicted probability on y
for i, doc_topic_distribution in enumerate(document_topic_distributions):
    fig, ax = plt.subplots(figsize=(8, 4))
    topic_positions = range(len(doc_topic_distribution))
    ax.bar(topic_positions, doc_topic_distribution, color='skyblue')
    ax.set_xlabel('Topic')
    ax.set_ylabel('Probability')
    ax.set_title(f'Topic Distribution for Document {i + 1}')
    plt.show()


In [None]:
# Define a function to calculate coherence score
def calculate_coherence_score(model, documents, dictionary, coherence='c_v', document_vectors=None, top_n=10):
    """Score the topics of a trained topic network with gensim's ``CoherenceModel``.

    The network has no word-level weights (its first layer maps embedding
    dimensions to hidden units), so topics are derived from its predictions:
    every document is assigned to its most probable topic, and each topic is
    described by the ``top_n`` most frequent tokens of its documents. The
    earlier version iterated over first-layer weight rows and emitted the whole
    vocabulary for each of them, so all "topics" were identical.

    Args:
        model: Object whose ``transform(vectors)`` returns one topic
            distribution row per document.
        documents: Tokenized documents, aligned row by row with the vectors.
        dictionary: gensim ``Dictionary`` built from ``documents``.
        coherence: Coherence measure name passed to ``CoherenceModel``.
        document_vectors: Embeddings to feed to ``model``. When omitted, the
            module-level ``document_vectors`` is used.
        top_n: Maximum number of words used to describe each topic.

    Returns:
        float: The coherence value reported by ``CoherenceModel``.

    Raises:
        ValueError: If no topic ends up with any word.
    """
    from collections import Counter

    if document_vectors is None:
        # Fall back to the notebook-level embeddings so existing three-argument
        # calls keep working.
        document_vectors = globals()['document_vectors']

    documents = list(documents)
    topic_distributions = np.asarray(model.transform(document_vectors))
    topic_assignments = np.argmax(topic_distributions, axis=1)

    # Count token frequencies per assigned topic
    word_counts = [Counter() for _ in range(topic_distributions.shape[1])]
    for topic_id, tokens in zip(topic_assignments, documents):
        word_counts[int(topic_id)].update(tokens)

    # Topics without any token are skipped: an empty word list cannot be scored.
    topic_words = [
        [word for word, _ in counts.most_common(top_n)]
        for counts in word_counts
        if counts
    ]
    if not topic_words:
        raise ValueError("No topic contains any word; cannot compute coherence.")

    # Compute coherence score
    coherence_model = CoherenceModel(topics=topic_words, texts=documents, dictionary=dictionary, coherence=coherence)
    return coherence_model.get_coherence()

# Build the token <-> id mapping over the preprocessed corpus
dictionary = gensim.corpora.Dictionary(processed_docs)

# Score the trained model's topics and report the result
coherence_score = calculate_coherence_score(
    model=ntm_glove_tf_model,
    documents=processed_docs,
    dictionary=dictionary,
)
print("Coherence Score:", coherence_score)



In [None]:
# For every document, keep the index of its highest-probability topic
dominant_topics = [
    np.argmax(doc_topic_distribution)
    for doc_topic_distribution in document_topic_distributions
]

# Report the dominant topic per document (documents are numbered from 1)
for i, dominant_topic in enumerate(dominant_topics):
    print(f"Document {i + 1}: Dominant Topic {dominant_topic}")