In [2]:
import pandas as pd
import numpy as np
import os
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle

# Load and preprocess the data
def load_and_preprocess(file_path):
    df = pd.read_csv(file_path)
    df['content_text'] = df['content_text'].str.lower()
    df['content_text'] = df['content_text'].str.replace(r'@[\w]+', '', regex=True)
    df['content_text'] = df['content_text'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
    df['content_text'] = df['content_text'].str.replace(r'\s+', ' ', regex=True)
    df['content_text'] = df['content_text'].str.strip()
    df['content_text'] = df['content_text'].str.replace(r'["\']', '', regex=True)
    return df

# Tokenize, remove stopwords, and lemmatize
def preprocess_text(text, stop_words, lemmatizer):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

# Prepare text for LDA (split into train/test, create dictionary and corpus)
def prepare_lda_data(df, prompt_id, stop_words, lemmatizer):
    df['processed_texts'] = df['content_text'].apply(lambda x: preprocess_text(x, stop_words, lemmatizer))
    train_df = df[df['prompt_id'] != prompt_id]
    
    dictionary = corpora.Dictionary(train_df['processed_texts'])
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in train_df['processed_texts']]
    return train_df, dictionary, corpus

# Train the LDA model
def train_lda(corpus, dictionary, num_topics, alpha='auto', passes=100, iterations=400):
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, 
                         random_state=42, alpha=alpha, passes=passes, iterations=iterations)
    return lda_model

# Evaluate topics using coherence scores
def evaluate_topics(lda_model, texts, dictionary, coherence_metric='c_npmi'):
    coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence=coherence_metric)
    overall_coherence = coherence_model.get_coherence()
    return overall_coherence

# Grid search for best LDA model parameters
def grid_search_lda(corpus, dictionary, texts, num_topics_grid, alpha_grid, coherence_metric='c_npmi'):
    best_model = None
    best_score = -np.inf
    best_params = {}
    
    for num_topics in num_topics_grid:
        for alpha in alpha_grid:
            print(f"Training LDA with num_topics={num_topics} and alpha={alpha}...")
            lda_model = train_lda(corpus, dictionary, num_topics=num_topics, alpha=alpha)
            coherence_score = evaluate_topics(lda_model, texts, dictionary, coherence_metric)
            print(f"Coherence Score: {coherence_score}")
            
            # Check if current model is better than the previous best
            if coherence_score > best_score:
                best_score = coherence_score
                best_model = lda_model
                best_params = {'num_topics': num_topics, 'alpha': alpha}
    
    print(f"Best Coherence Score: {best_score} with parameters {best_params}")
    return best_model, best_params, best_score

# Save LDA model, dictionary, and coherence scores
def save_model_and_scores(lda_model, dictionary, model_dir, dict_dir, coherence_df, prompt_id):
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(dict_dir, exist_ok=True)
    lda_model.save(os.path.join(model_dir, f"lda_model_{prompt_id}.model"))
    dictionary.save(os.path.join(dict_dir, f"dictionary_{prompt_id}.dict"))
    coherence_df.to_csv(f'topic_coherence_scores_{prompt_id}.csv', index=False)

# Get topic distributions for test data and save highest topic
def get_topic_distributions(df, lda_model, dictionary, prompt_id):
    df['processed_texts'] = df['content_text'].apply(lambda x: preprocess_text(x, stop_words, lemmatizer))
    topic_distributions = []
    highest_topic_scores = []

    for text in df['processed_texts']:
        new_bow = dictionary.doc2bow(text)
        topic_distribution = lda_model.get_document_topics(new_bow)
        topic_distributions.append(topic_distribution)
        highest_topic_scores.append(max(topic_distribution, key=lambda x: x[1])[1])

    df['highest_topic'] = highest_topic_scores
    return df

# Combine with handcrafted features
def combine_with_handcrafted(final_df, handcrafted_file):
    handcrafted_df = pd.read_csv(handcrafted_file)
    final_df.rename(columns={'essay_id': 'item_id'}, inplace=True)
    merged_df = handcrafted_df.merge(final_df[['item_id', 'highest_topic']], on='item_id')
    return merged_df

# Main function to run each step
def main():
    # Load and preprocess
    file_path = 'combined_data.csv'
    df = load_and_preprocess(file_path)

    # Set parameters and stop words
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    prompt_id = 1
    
    # Grid search parameters
    num_topics_grid = [6, 8, 9, 10]  # Define your topic numbers for grid search
    alpha_grid = ['symmetric', 'asymmetric', 'auto']  # Define your alpha values
    
    # Prepare LDA data
    train_df, dictionary, corpus = prepare_lda_data(df, prompt_id, stop_words, lemmatizer)

    # Grid search for best model
    best_lda_model, best_params, best_score = grid_search_lda(corpus, dictionary, train_df['processed_texts'], num_topics_grid, alpha_grid)
    
    # Evaluate and print final coherence score with best parameters
    final_coherence = evaluate_topics(best_lda_model, train_df['processed_texts'], dictionary)
    print(f"Best model coherence score: {final_coherence}")

    # Save best model and coherence scores
    coherence_df = pd.DataFrame({
        'Topic': range(best_params['num_topics']),
        'Coherence Score': evaluate_topics(best_lda_model, train_df['processed_texts'], dictionary, coherence_metric='c_npmi'),
        'word_dist': best_lda_model.print_topics(num_words=10)
    })
    save_model_and_scores(best_lda_model, dictionary, "lda_models", "dictionaries", coherence_df, prompt_id)

    # Get topic distributions and save highest topic
    final_df = get_topic_distributions(df, best_lda_model, dictionary, prompt_id)

    # Combine with handcrafted features and save final merged dataset
    merged_df = combine_with_handcrafted(final_df, 'hand_crafted_v3.csv')
    merged_df.to_csv(f'final_{prompt_id}.csv', index=False)

# Run the main function
main()


Training LDA with num_topics=6 and alpha=symmetric...
Coherence Score: 0.09198450362067177
Training LDA with num_topics=6 and alpha=asymmetric...
Coherence Score: 0.09468709841055484
Training LDA with num_topics=6 and alpha=auto...
Coherence Score: 0.09508168361344975
Training LDA with num_topics=8 and alpha=symmetric...
Coherence Score: 0.1006117632666429
Training LDA with num_topics=8 and alpha=asymmetric...
Coherence Score: 0.1134339642074523
Training LDA with num_topics=8 and alpha=auto...
Coherence Score: 0.09966981054115416
Training LDA with num_topics=9 and alpha=symmetric...
Coherence Score: 0.09228134002104992
Training LDA with num_topics=9 and alpha=asymmetric...
Coherence Score: 0.09140630395775151
Training LDA with num_topics=9 and alpha=auto...
Coherence Score: 0.09997840204222881
Training LDA with num_topics=10 and alpha=symmetric...
Coherence Score: 0.09168175684418124
Training LDA with num_topics=10 and alpha=asymmetric...
Coherence Score: 0.08112771732938608
Training L

NameError: name 'stop_words' is not defined

In [None]:
import pandas as pd
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score

# Load and preprocess the data
def load_and_preprocess(file_path):
    df = pd.read_csv(file_path)
    df['content_text'] = df['content_text'].str.lower()
    df['content_text'] = df['content_text'].str.replace(r'@[\w]+', '', regex=True)
    df['content_text'] = df['content_text'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
    df['content_text'] = df['content_text'].str.replace(r'\s+', ' ', regex=True)
    df['content_text'] = df['content_text'].str.strip()
    df['content_text'] = df['content_text'].str.replace(r'["\']', '', regex=True)
    return df

# Tokenize, remove stopwords, and lemmatize
def preprocess_text(text, stop_words, lemmatizer):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(tokens)

# Prepare BERTopic model and fit it
def train_bertopic(df, prompt_id, stop_words, lemmatizer):
    df['processed_texts'] = df['content_text'].apply(lambda x: preprocess_text(x, stop_words, lemmatizer))
    train_df = df[df['prompt_id'] != prompt_id]

    # Initialize BERTopic with custom vectorizer (optional, for better control)
    vectorizer_model = CountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer_model)

    topics, probabilities = topic_model.fit_transform(train_df['processed_texts'])
    return topic_model, topics, probabilities

# Evaluate topics using silhouette score
def evaluate_topics_bertopic(probabilities):
    silhouette_avg = silhouette_score(probabilities, probabilities.argmax(axis=1))
    return silhouette_avg

# Save BERTopic model
def save_bertopic_model(topic_model, model_dir, prompt_id):
    os.makedirs(model_dir, exist_ok=True)
    topic_model.save(os.path.join(model_dir, f"bertopic_model_{prompt_id}"))

# Get topic distributions for test data and save highest topic
def get_topic_distributions(df, topic_model, prompt_id):
    df['processed_texts'] = df['content_text'].apply(lambda x: preprocess_text(x, stop_words, lemmatizer))
    topics, probabilities = topic_model.transform(df['processed_texts'])
    
    highest_topic_scores = probabilities.max(axis=1)
    df['highest_topic'] = highest_topic_scores
    return df

# Combine with handcrafted features
def combine_with_handcrafted(final_df, handcrafted_file):
    handcrafted_df = pd.read_csv(handcrafted_file)
    final_df.rename(columns={'essay_id': 'item_id'}, inplace=True)
    merged_df = handcrafted_df.merge(final_df[['item_id', 'highest_topic']], on='item_id')
    return merged_df

# Main function to run each step
def main():
    # Load and preprocess
    file_path = 'combined_data.csv'
    df = load_and_preprocess(file_path)

    # Set parameters and stop words
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    prompt_id = 1

    # Train BERTopic model
    topic_model, topics, probabilities = train_bertopic(df, prompt_id, stop_words, lemmatizer)
    
    # Evaluate topic coherence
    silhouette_avg = evaluate_topics_bertopic(probabilities)
    print(f"Silhouette Score for BERTopic model: {silhouette_avg}")

    # Save BERTopic model
    save_bertopic_model(topic_model, "bertopic_models", prompt_id)

    # Get topic distributions and save highest topic
    final_df = get_topic_distributions(df, topic_model, prompt_id)

    # Combine with handcrafted features and save final merged dataset
    merged_df = combine_with_handcrafted(final_df, 'hand_crafted_v3.csv')
    merged_df.to_csv(f'final_{prompt_id}.csv', index=False)

# Run the main function
main()
