# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available, then build the stop-word
# list: NLTK's English list followed by a hand-maintained set of extra words.
# The result stays a plain list because it is later passed to TfidfVectorizer.
# NOTE(review): many of the extra words look like entries NLTK's English list
# already provides -- confirm whether this extension is still needed.
nltk.download('stopwords')
stop_words = stopwords.words('english') + [
    "you'd", 'a', 'of', 'then', "hadn't", "should've", 'because', 'she',
    'shan', 'while', "wouldn't", 'before', 'very', 'by', 'an', 'about',
    'nor', 'hasn', "aren't", 'his', 're', 'myself', 'after', 'where', 'my',
    'will', 'your', 'ain', "mightn't", 'didn', 'to', 'isn', 'more', 't',
    'both', 'hers', 'd', 'at', 'you', "shouldn't", 'ours', 'further', 'he',
    'am', 'does', 'its', 'there', 'under', 'such', 'mustn', 'just', 'once',
    'between', 'is', 'few', 'weren', 'theirs', 'were', 'him', "weren't",
    'their', 've', "you'll", 'now', 'from', 'as', 'has', 'are', 'who',
    'into', 'herself', 'the', 'don', 'if', 'haven', 's', 'should', 'what',
    'when', 'was', 'most', 'why', 'doing', 'hadn', 'yourselves', 'only',
    "didn't", 'through', 'which', 'not', 'and', 'how', "it's", 'in', 'some',
    'did', 'or', 'ourselves', 'do', 'any', 'other', 'y', "mustn't",
    "you're", 'during', 'all', 'but', "doesn't", 'have', 'himself', 'them',
    'over', 'no', 'yourself', 'been', 'whom', 'up', 'shouldn', 'each',
    'against', 'yours', 'that', 'couldn', 'again', 'me', 'aren', 'with',
    'i', 'down', 'it', 'be', 'had', "she's", 'below', "isn't", 'we', 'm',
    "shan't", 'too', 'her', 'our', 'won', 'll', 'themselves', 'o', 'doesn',
    'these', 'off', 'ma', "you've", 'on', 'own', 'for', 'wasn', 'those',
    'itself', "haven't", "won't", 'being', "wasn't", 'mightn', 'they',
    "hasn't", "couldn't", 'until', 'above', 'than', 'having', 'so', 'needn',
    'wouldn', 'this', 'can', "needn't", 'here', "that'll", 'out', "don't",
    'same',
]

# Number of reviews parsed per chunk; adjust the chunk size as needed
chunksize = 10000
# Number of LDA topics fitted for each chunk
num_topics = 10
# Number of highest-weighted words reported (and plotted) per topic
num_top_words = 10


def plot_word_cloud(words):
    """Render the given words as a word cloud and display it.

    Args:
        words: Iterable of strings; they are joined with single spaces and
            handed to ``WordCloud.generate``.
    """
    wordcloud = WordCloud(background_color='white').generate(' '.join(words))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


# Load the dataset in chunks. Using the reader as a context manager closes the
# underlying file handle once iteration finishes or an error is raised.
with pd.read_json(r'c:\Users\kasam\OneDrive\Desktop\yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json', lines=True, chunksize=chunksize) as reviews_chunks:
    # Process each chunk
    # NOTE(review): a separate vectorizer and LDA model is fitted for every
    # chunk, so topics are printed per chunk and are not comparable across
    # chunks -- confirm this is intended rather than one model over all
    # 1-star reviews.
    for chunk in reviews_chunks:
        # Filter for only 1-star reviews
        chunk = chunk[chunk['stars'] == 1]

        # A chunk without any 1-star review would make the vectorizer raise
        # ValueError (empty vocabulary), so skip it instead of aborting.
        if chunk.empty:
            continue

        # Tokenization and TF-IDF transformation
        tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
        tfidf_matrix = tfidf_vectorizer.fit_transform(chunk['text'])

        # Model training using Latent Dirichlet Allocation
        lda_model = LatentDirichletAllocation(n_components=num_topics, max_iter=200, learning_method='batch', random_state=317)
        lda_model.fit(tfidf_matrix)

        # Get the top words for each topic: argsort is ascending, so the
        # reversed tail slice yields the highest-weighted words first.
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_word_indices = lda_model.components_.argsort(axis=1)[:, :-(num_top_words + 1):-1]
        top_words = [[feature_names[index] for index in indices] for indices in top_word_indices]

        # Print and plot the topics
        for i, words in enumerate(top_words):
            print(f"Topic {i}:")
            print(words)
            plot_word_cloud(words)
