In [1]:
!pip install numpy pandas matplotlib plotly gensim scikit-learn nltk



In [3]:
# Import the necessary libraries for our project
import os  # For working with file paths
import plotly.express as px  # For generating visualizations
import plotly.io as pio  # For displaying plots inline in the notebook
import gensim  # For applying LDA and text processing
from gensim import corpora  # To create a dictionary for LDA
from gensim.models import LdaModel  # LDA model to apply topic modeling
import pandas as pd  # For handling data in a DataFrame format
import nltk  # For natural language processing
from nltk.corpus import stopwords  # To remove common words from the text
from nltk.stem import WordNetLemmatizer  # For lemmatizing words (reducing them to their base form)


In [18]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Downloading required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Define the preprocessing function
def preprocess_text(text):
    # Check if text is a valid string
    if isinstance(text, str):
        stop_words = set(stopwords.words('english'))  # Get English stopwords
        lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer
        tokens = text.split()  # Split text into words
        # Remove stopwords and lemmatize
        tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.lower() not in stop_words]
        return " ".join(tokens)
    else:
        # If text is not valid, return an empty string
        return ""


# Let's load the first dataset
# Make sure 'letterboxd-reviews.csv' is in the same directory as the notebook
data = pd.read_csv('Datasets/MovieReviews/letterboxd-reviews.csv', encoding='ISO-8859-1')

# Loading the second dataset into a DataFrame
# Similarly, 'metacritic-reviews.csv' should also be in the same directory
data2 = pd.read_csv('Datasets/MovieReviews/metacritic-reviews.csv', encoding='ISO-8859-1', on_bad_lines='skip')

# Check the first few rows to ensure the data is loaded properly
print("First dataset loaded:")
print(data.head())
print("\nSecond dataset loaded:")
print(data2.head())

# Apply the preprocessing function to the 'Review' column in the first dataset
data['cleaned_text'] = data['Review'].apply(preprocess_text)

# Apply the preprocessing function to the 'summary' column in the second dataset
data2['cleaned_text'] = data2['summary'].apply(preprocess_text)

# Check if the 'cleaned_text' column exists now
print("\nCleaned text for the first dataset:")
print(data[['Review', 'cleaned_text']].head())

print("\nCleaned text for the second dataset:")
print(data2[['summary', 'cleaned_text']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\b0161166\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\b0161166\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


First dataset loaded:
                                 Movie name Release Year           Rating  \
0                           Aftersun (2022)         2022   â??â??â??â??Â½   
1                              Joker (2019)         2019  â??â??â??â??â??   
2       Puss in Boots: The Last Wish (2022)         2022            â??Â½   
3          The Banshees of Inisherin (2022)         2022  â??â??â??â??â??   
4  Everything Everywhere All at Once (2022)         2022         â??â??Â½   

     Reviewer name Review date  \
0           Tuomas   12-Jan-20   
1             Joao   20-Dec-22   
2         NicoPico   15-Sep-22   
3        Ella Kemp    8-Apr-22   
4  CosmonautMarkie   14-Aug-19   

                                              Review Comment count  \
0                  This review may contain spoilers.           130   
1  if youâ??ve never swam in the ocean then of co...          1.8K   
2                Puss in Boots: Into the Pussy-Verse          6  2   
3  I will NOT leave my donkey 

In [20]:
# Importing necessary libraries
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel

# Function to display topic frequency distribution
def display_topic_frequency(data, num_topics_list, dataset_name="Dataset"):
    """
    Display the frequency distribution of topics for the given dataset for each number of topics.
    
    Parameters:
        data (pandas.DataFrame): The dataset containing the cleaned text.
        num_topics_list (list): List of integers representing the number of topics (e.g., [5, 10, 20, 50]).
        dataset_name (str): The name of the dataset (e.g., "Dataset 1").
    """
    # Tokenize the cleaned text into words (tokens) for topic modeling
    texts = [text.split() for text in data['cleaned_text']]  # Split the text into tokens
    
    # Create a dictionary and corpus for the dataset
    dictionary = corpora.Dictionary(texts)  # Creating a Gensim dictionary
    corpus = [dictionary.doc2bow(text) for text in texts]  # Corpus for LDA model

    # Loop through each number of topics and apply LDA
    for num_topics in num_topics_list:
        lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)  # LDA Model
        
        print(f"\nTopic Frequency Distribution for {dataset_name} (N={num_topics}):")
        
        # Get topic frequencies for each topic
        topic_frequency = {i: 0 for i in range(num_topics)}
        for doc in corpus:
            topics = lda_model.get_document_topics(doc)  # Get topics for the document
            for topic_id, topic_prob in topics:
                topic_frequency[topic_id] += topic_prob  # Accumulate frequency of topics

        # Sort topics by frequency in descending order
        sorted_topic_frequency = sorted(topic_frequency.items(), key=lambda x: x[1], reverse=True)
        
        # Print sorted topics and their frequencies
        for topic_id, freq in sorted_topic_frequency:
            print(f"Topic {topic_id+1}: Frequency: {freq}")

# List of topics configurations to apply: 5, 10, 20, 50 topics
num_topics_list = [5, 10, 20, 50]

# Apply LDA and display topic frequency distribution for both datasets
display_topic_frequency(data, num_topics_list, dataset_name="Dataset_1")
display_topic_frequency(data2, num_topics_list, dataset_name="Dataset_2")



Topic Frequency Distribution for Dataset_1 (N=5):
Topic 5: Frequency: 1094.8106513181701
Topic 4: Frequency: 1074.2860631616786
Topic 2: Frequency: 965.3405799735337
Topic 3: Frequency: 737.3883082242683
Topic 1: Frequency: 716.9674706868827

Topic Frequency Distribution for Dataset_1 (N=10):
Topic 1: Frequency: 644.3870600331575
Topic 3: Frequency: 640.6374024255201
Topic 10: Frequency: 552.9169020568952
Topic 5: Frequency: 442.8778112512082
Topic 2: Frequency: 430.61615092214197
Topic 8: Frequency: 426.6375790098682
Topic 4: Frequency: 392.3534887060523
Topic 7: Frequency: 384.85901754908264
Topic 9: Frequency: 335.3840465247631
Topic 6: Frequency: 283.15369773563

Topic Frequency Distribution for Dataset_1 (N=20):
Topic 18: Frequency: 522.2518114391714
Topic 12: Frequency: 331.29696818534285
Topic 1: Frequency: 295.82968818116933
Topic 2: Frequency: 269.52548527438194
Topic 8: Frequency: 257.160044407472
Topic 11: Frequency: 240.52002521697432
Topic 9: Frequency: 223.2029868485406


In [21]:
# Importing the necessary libraries for data processing and plotting
import plotly.express as px
import plotly.io as pio
import os
from gensim import corpora
from gensim.models import LdaModel

# Function to save each plot as an HTML file
def save_figures_as_html(fig, num_topics, dataset_name, output_dir="./output/"):
    """
    Save each plot (donut chart) as an HTML file in the specified output directory.
    
    Parameters:
        fig (plotly.graph_objs._figure.Figure): The plotly figure to save.
        num_topics (int): The number of topics for the chart filename.
        dataset_name (str): Name of the dataset to include in the filename.
        output_dir (str): Directory where the HTML files should be saved.
    """
    # Ensure the output directory exists, if not, create it
    os.makedirs(output_dir, exist_ok=True)
    
    # Create a unique filename based on the dataset name and number of topics
    output_file = os.path.join(output_dir, f"{dataset_name}_topic_distribution_N{num_topics}.html")
    
    # Save the figure as HTML
    fig.write_html(output_file)
    print(f"Saved {output_file}")  # Output the location where the figure has been saved

# Function to apply LDA and plot donut charts for each dataset and number of topics
def apply_lda_and_plot_donut(data, num_topics_list, dataset_name="Dataset", output_dir="./output/"):
    """
    Apply LDA topic modeling to the given dataset, generate donut charts for different topic configurations, 
    and save them as HTML files.

    Parameters:
        data (pandas.DataFrame): The dataset containing the cleaned text.
        num_topics_list (list): List of integers representing the number of topics (e.g., [5, 10, 20, 50]).
        dataset_name (str): The name of the dataset (e.g., "Dataset 1").
        output_dir (str): Directory where the HTML files should be saved.
    """
    # Tokenize the cleaned text into words (tokens) for topic modeling
    texts = [text.split() for text in data['cleaned_text']]  # Splitting text into tokens
    
    # Create a dictionary and corpus for the dataset
    dictionary = corpora.Dictionary(texts)  # Creating a Gensim dictionary
    corpus = [dictionary.doc2bow(text) for text in texts]  # Corpus for LDA model

    # Loop through each number of topics and apply LDA
    for num_topics in num_topics_list:
        lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)  # LDA Model
        
        # Get the topic frequencies for each topic in the corpus
        topic_frequency = {i: 0 for i in range(num_topics)}
        for doc in corpus:
            topics = lda_model.get_document_topics(doc)  # Get topics for the document
            for topic_id, topic_prob in topics:
                topic_frequency[topic_id] += topic_prob  # Accumulate frequency of topics

        # Prepare data for plotting the donut chart
        topic_labels = [f"Topic {i+1}" for i in range(num_topics)]  # Topic labels
        topic_counts = [topic_frequency[i] for i in range(num_topics)]  # Topic frequencies

        # Create a donut chart using Plotly to visualize the topic distribution
        fig = px.pie(names=topic_labels, values=topic_counts, hole=0.3, 
                     title=f"Topic Distribution in {dataset_name} (N={num_topics})")

        # Save the donut chart as an HTML file using the helper function
        save_figures_as_html(fig, num_topics, dataset_name, output_dir)

        # Display the figure directly in the notebook for visualization
        pio.show(fig)

# List of topics configurations to apply: 5, 10, 20 topics
num_topics_list = [5, 10, 20]

# Apply LDA to both datasets and plot donut charts for each configuration
# Running for the first dataset
apply_lda_and_plot_donut(data, num_topics_list, dataset_name="Dataset_1")

# Running for the second dataset
apply_lda_and_plot_donut(data2, num_topics_list, dataset_name="Dataset_2")


Saved ./output/Dataset_1_topic_distribution_N5.html


Saved ./output/Dataset_1_topic_distribution_N10.html


Saved ./output/Dataset_1_topic_distribution_N20.html


Saved ./output/Dataset_2_topic_distribution_N5.html


Saved ./output/Dataset_2_topic_distribution_N10.html


Saved ./output/Dataset_2_topic_distribution_N20.html
