In [8]:
!pip install bertopic sentence-transformers scikit-learn nltk




In [9]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import download


In [10]:
# Base name (without the .txt extension) of the plain-text file to analyse.
FILE_NAME = "atomic_habbit"

# Read the whole file once. The context manager closes the handle, read-only
# mode is sufficient because nothing is written back, and the explicit UTF-8
# encoding matches how load_and_preprocess opens the same file.
with open(f"{FILE_NAME}.txt", "r", encoding="utf-8") as source_file:
    text = source_file.read()


In [11]:
# Fetch the NLTK resources used by preprocess_text (tokenizer data, stopword
# list, WordNet). 'punkt_tab' is the tokenizer data that newer NLTK releases
# load for word_tokenize; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    download(resource)

def preprocess_text(text):
    """Normalise raw text into one space-separated string of lemmas.

    Steps, in order: remove square-bracketed spans (within a single line),
    lowercase, tokenize with NLTK, keep only purely alphabetic tokens that are
    not English stopwords, lemmatize each remaining token with WordNet, and
    join the results with single spaces.

    Args:
        text: The raw input string.

    Returns:
        The cleaned text as a single string ('' when no token survives).
    """
    # Strip bracketed spans, then fold case.
    cleaned = re.sub(r'\[.*?\]', '', text).lower()

    words = word_tokenize(cleaned)

    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = []
    for token in words:
        # Skip punctuation/numbers (non-alphabetic tokens) and stopwords.
        if not token.isalpha() or token in english_stopwords:
            continue
        lemmas.append(wordnet.lemmatize(token))

    return ' '.join(lemmas)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jhabarsinghbhati/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jhabarsinghbhati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jhabarsinghbhati/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def load_and_preprocess(file_path, split_sentences=True):
    """Read a UTF-8 text file and return it as a list of preprocessed documents.

    Topic modelling clusters documents, so a corpus containing the whole file
    as a single document gives the model nothing to cluster. By default the
    text is therefore split into sentences and each sentence is preprocessed
    into its own document.

    Args:
        file_path: Path of the text file to read.
        split_sentences: When True (default), return one document per
            sentence, dropping sentences that are empty after preprocessing.
            When False, return the whole file as a single-element list.

    Returns:
        A list of preprocessed strings.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_text = file.read()

    if not split_sentences:
        return [preprocess_text(raw_text)]

    # Local import: only this code path needs the sentence splitter.
    from nltk.tokenize import sent_tokenize

    docs = (preprocess_text(sentence) for sentence in sent_tokenize(raw_text))
    # Sentences made up only of stopwords/punctuation preprocess to '' and
    # carry no signal, so they are left out.
    return [doc for doc in docs if doc]


In [15]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

def perform_topic_modeling(docs, model_name="all-MiniLM-L6-v2", n_gram_range=(1, 2), min_topic_size=10):
    """Fit a BERTopic model on ``docs`` using a SentenceTransformer embedder.

    Args:
        docs: The documents passed to ``BERTopic.fit_transform``.
        model_name: Name handed to ``SentenceTransformer`` to build the
            embedding model.
        n_gram_range: Forwarded to ``BERTopic`` as ``n_gram_range``.
        min_topic_size: Forwarded to ``BERTopic`` as ``min_topic_size``.

    Returns:
        A ``(topic_model, topics, probabilities)`` tuple: the fitted model and
        the two values returned by ``fit_transform``.
    """
    bertopic_options = {
        "embedding_model": SentenceTransformer(model_name),
        "n_gram_range": n_gram_range,
        "min_topic_size": min_topic_size,
    }
    fitted_model = BERTopic(**bertopic_options)

    assignments, probs = fitted_model.fit_transform(docs)
    return fitted_model, assignments, probs


AttributeError: module 'openai' has no attribute 'OpenAI'

In [None]:
def main():
    """Run the pipeline: load the text, fit the topic model, report and save it.

    Reads ``{FILE_NAME}.txt``, fits the model via ``perform_topic_modeling``,
    prints the topic overview table, saves the model to ``bertopic_model`` and
    finally shows the topic visualisation.
    """
    file_path = f"{FILE_NAME}.txt"
    docs = load_and_preprocess(file_path)

    topic_model, topics, probabilities = perform_topic_modeling(docs)

    print(topic_model.get_topic_info())

    # Save before plotting so that a failure while rendering the figure does
    # not discard the fitted model.
    topic_model.save("bertopic_model")

    # visualize_topics() only returns the figure; inside a function its value
    # is not auto-displayed by the notebook, so show it explicitly.
    figure = topic_model.visualize_topics()
    figure.show()
    
# Run the whole pipeline when this cell (or the exported script) is executed
# directly rather than imported as a module.
if __name__ == "__main__":
    main()
