### Data Prep

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
PATH = os.getcwd()

# Everything found in the archive folder, then only the CSV files in name order.
filesArray = os.listdir(f'{PATH}/archive')
csvFiles = sorted(name for name in filesArray if name.endswith(".csv"))

#### combine to one dataframe

In [None]:
# Read every CSV in the archive folder; `data` holds one frame per file.
data = [pd.read_csv(f'{PATH}/archive/{csvName}') for csvName in csvFiles]

# Original column headers -> camelCase names used throughout the notebook.
columnNames = {'Date of Exp' : 'dateExp', 'Star Rating' : 'starRating', 'Reviews': 'reviews'}

# Stack the per-file frames on a fresh 0..n-1 index and apply the new names.
reviewsRaw = pd.concat(data, ignore_index=True).rename(columns=columnNames)

# Parse the date strings; format='mixed' infers the format per value.
reviewsRaw['dateExp'] = pd.to_datetime(reviewsRaw['dateExp'], format = 'mixed')
reviewsRaw.head()

### maybe include date filter here?

Treat one date as one document.

Topic modelling will then be done per day (or per the desired date range).

In [None]:
# Keep only the rows whose experience date falls in 2022 or later.
# .copy() makes `reviews` an independent frame, so the columns added to it
# further down do not write back into reviewsRaw.
reviews = reviewsRaw.loc[reviewsRaw['dateExp'].dt.year >= 2022].copy()

> By using the .loc accessor and the .copy() method, you explicitly indicate that you want to modify a specific subset of the DataFrame, avoiding the warning

## DataPreprocessing

#### recode starRating

In [None]:
def _recodeStarRating(rating):
    """Map a star rating to a label: >= 4 positive, == 3 neutral, anything else negative."""
    if rating >= 4:
        return 'positive'
    if rating == 3:
        return 'neutral'
    return 'negative'

reviews['starRatingRecode'] = reviews['starRating'].apply(_recodeStarRating)
reviews.head()
#print(reviews.shape)

#### remove punctuation & convert to lowercase

In [None]:
import re

def lowerCase_removePunc(dataframeName, columnName, sourceColumn='reviews'):
    """Write a cleaned copy of a text column into ``columnName``.

    The cleaned text has the characters ``, . ! ?`` removed and is converted
    to lowercase.

    Parameters
    ----------
    dataframeName : pandas.DataFrame
        Frame holding the text. It is modified in place (the new column is
        added to it) and also returned, as before.
    columnName : str
        Name of the column that receives the cleaned text.
    sourceColumn : str, default 'reviews'
        Column to read the raw text from. The default keeps the previous
        behaviour, which always read the 'reviews' column.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.

    Notes
    -----
    Missing values in ``sourceColumn`` become empty strings instead of raising
    a ``TypeError``, so later tokenisation steps receive a string for every row.
    """
    # Fill gaps first, then force str so the .str accessor works on every row.
    rawText = dataframeName[sourceColumn].fillna('').astype(str)
    dataframeName[columnName] = (
        rawText
        .str.replace(r'[,\.!?]', '', regex=True)
        .str.lower()
    )
    return dataframeName


reviews = lowerCase_removePunc(reviews, 'processedText')
reviews.head()

#### tokenization

In [None]:
content = (reviews['processedText'])

In [None]:
import nltk
from nltk.tokenize import word_tokenize

def tokenize(text):
    """Return the tokens that NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens

processedText = content.apply(tokenize)
processedText.head()

#### remove stopwords

In [None]:
# Import the corpus reader under an alias: the original code rebound the name
# `stopwords` from the module to the word list, so running this cell a second
# time failed (a list has no `.words`).
from nltk.corpus import stopwords as nltk_stopwords

# `stopwords` stays available as the list of English stop words.
stopwords = nltk_stopwords.words('english')
# Frozen set of the same words, used for fast membership tests.
_STOPWORD_SET = frozenset(stopwords)

def removeStopwords(tokenizedText):
    """Drop English stop words from one tokenised document.

    Parameters
    ----------
    tokenizedText : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens whose lowercase form is not an English stop word, in their
        original order and original casing.
    """
    return [token for token in tokenizedText if token.lower() not in _STOPWORD_SET]

processedText = processedText.apply(removeStopwords)
#processedText.head()

#### wordcloud

In [None]:
from wordcloud import WordCloud

longString = ' '.join(processedText.apply(lambda x: ' '.join(x)))

In [None]:
# Word cloud settings; random_state pins the layout between runs.
cloudOptions = dict(
    background_color='white',
    max_words=300,
    max_font_size=50,
    random_state=42,
)
wordcloud = WordCloud(**cloudOptions).generate(longString)

# Draw the cloud on its own axes with the axis frame hidden.
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Save the wordcloud image
wordcloud_image_path = 'wordcloud.png'  # Specify the path and filename
wordcloud.to_file(wordcloud_image_path)
print(f"Wordcloud saved as {wordcloud_image_path}")

#### lemmatized

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize(tokenizedText):
    """Return the tokens of one document, each passed through ``lemmatizer.lemmatize``."""
    return list(map(lemmatizer.lemmatize, tokenizedText))

#processedText = processedText.apply(lemmatize)
#processedText.head()

### Modelling using LDA-gensim

In [None]:
# STEP 1: Prepare the corpus
import gensim
from gensim.utils import simple_preprocess 
# simple_preprocess:
# tokenization, lowercasing,
# filtering removes tokens that are too short (less than 3 characters) or too long (more than 15 characters).

In [None]:
content = (reviews['reviews'])
corpus = []

def preprocess_corpus(data):
    """Apply gensim's ``simple_preprocess`` to every element of ``data``.

    ``data`` must offer ``.apply`` (the notebook passes a pandas Series of
    review strings); the result of that ``.apply`` call is returned.
    """
    return data.apply(simple_preprocess)

preprocessed_corpus = preprocess_corpus(content)

In [None]:
content.head()

In [None]:
print(preprocessed_corpus[:4])

In [None]:
# Step2: Creating dictionary
from nltk import bigrams
from gensim import corpora, models

def create_bigrams(corpus):
    """Turn each tokenised document into its list of adjacent-word bigrams.

    Every bigram is returned as one string, the two tokens joined by a single
    space. The output has one inner list per document in ``corpus``.
    """
    return [
        [' '.join(pair) for pair in bigrams(doc)]
        for doc in corpus
    ]

def create_dict_tfidf(corpus):
    """Build a gensim dictionary and a TF-IDF weighted corpus.

    ``corpus`` is iterated twice: once to build the dictionary and once to
    build the bag-of-words vectors.

    Returns a ``(dictionary, tfidf_corpus)`` pair, where ``tfidf_corpus`` is
    the bag-of-words corpus passed through the fitted TF-IDF model.
    """
    vocabulary = corpora.Dictionary(corpus)

    # One bag-of-words vector per document.
    bagOfWords = list(map(vocabulary.doc2bow, corpus))

    # Fit TF-IDF on the bag-of-words vectors, then re-weight those same vectors.
    weighting = models.TfidfModel(bagOfWords)
    return vocabulary, weighting[bagOfWords]

# dictionary, tfidf_corpus = create_dict_tfidf(preprocessed_corpus)

In [None]:
# Tokenise the raw reviews, then drop stop words and lemmatise each document.
preprocessed_corpus = (
    preprocess_corpus(content)
    .apply(removeStopwords)
    .apply(lemmatize)
)

# Documents expressed as lists of "word1 word2" bigram strings.
corpus_bigrams = create_bigrams(preprocessed_corpus)

# Dictionary and TF-IDF corpus built from the bigram documents.
dictionary, tfidf_corpus = create_dict_tfidf(corpus_bigrams)

> By incorporating the TF-IDF transformation, you can assign higher weights to terms that are important in a particular document while downweighting terms that are common across multiple documents.

In [None]:
print(dictionary)

In [None]:
print(corpus_bigrams[0])

In [None]:
#for doc in tfidf_corpus:
#    print(doc)

In [None]:
# Step 3: Build LDA model

def train_lda_model(corpus, num_topics, id2word=None, passes=20, random_state=42):
    """Train a gensim LDA model on ``corpus``.

    Parameters
    ----------
    corpus : iterable
        Document vectors to train on (the notebook passes the TF-IDF corpus).
    num_topics : int
        Number of topics to fit.
    id2word : gensim dictionary, optional
        Mapping from token ids to tokens. When omitted, the notebook-level
        ``dictionary`` is used, which matches the previous behaviour.
    passes : int, default 20
        Number of passes over the corpus during training.
    random_state : int, default 42
        Seed handed to ``LdaModel`` so repeated runs give the same topics.
        Previously no seed was set and results varied between runs.

    Returns
    -------
    gensim.models.LdaModel
        The trained model.
    """
    if id2word is None:
        # Fall back to the dictionary created earlier in the notebook.
        id2word = dictionary

    return models.LdaModel(corpus=corpus,
                           num_topics=num_topics,
                           id2word=id2word,
                           passes=passes,
                           random_state=random_state)

In [None]:
%%time
# Number of topics to fit; also reused by later cells when listing topics.
num_topics = 5

# Train the LDA model on the TF-IDF weighted bigram corpus
lda_model_gensim = train_lda_model(tfidf_corpus, num_topics)

# Print the topics and their corresponding keywords.
# topic_id is shifted by one so the printed numbering starts at 1.
for topic_id, topic_words in lda_model_gensim.print_topics():
    print(f"Topic #{topic_id+1}: {topic_words}")

In [None]:
# Step 4: Analyze topics

# Get the top 20 keywords for each of the num_topics topics
topics = lda_model_gensim.show_topics(num_topics=num_topics, num_words=20)

# Assign documents to topics: one get_document_topics() result per document,
# in the same order as tfidf_corpus
document_topics = [lda_model_gensim.get_document_topics(doc) for doc in tfidf_corpus]

In [None]:
# Step 5: Visualize

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import webbrowser
from pathlib import Path

# Prepare the visualisation data once. The original cell ran the same
# prepare() call twice, under two names for the same module.
vis_data = gensimvis.prepare(lda_model_gensim, tfidf_corpus, dictionary)

# Save and open the same file. Previously the HTML was written to
# 'lda_visualization2.html' but the browser was pointed at
# 'lda_visualization.html'.
lda_vis_path = Path('lda_visualization2.html').resolve()
pyLDAvis.save_html(vis_data, str(lda_vis_path))

# Open via an absolute file:// URI so it does not depend on how the browser
# resolves a bare relative filename.
webbrowser.open(lda_vis_path.as_uri(), new=2)

## Evaluate Model

In [None]:
from gensim.models import CoherenceModel

# `texts` must be the tokenised documents the dictionary was built from.
# The original passed `corpus`, which is the empty list created earlier in
# the notebook, so the score was not computed on the review data.
coherence_model = CoherenceModel(model=lda_model_gensim,
                                 texts=corpus_bigrams,
                                 dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print("Coherence Score:", coherence_score)

In [None]:
from PIL import Image

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter

# Each entry of corpus_bigrams is a "word1 word2" string, so it has to be
# split back into its two words. Indexing the string with [0] and [1], as the
# original did, yields its first two characters instead.
# The pair is sorted because the graph is undirected: "a b" and "b a" are the
# same edge and should share one count.
bigram_counts = Counter(
    tuple(sorted(bigram.split(' ', 1)))
    for doc in corpus_bigrams
    for bigram in doc
)

# Create a graph with one edge per word pair, weighted by how often it occurs
G = nx.Graph()
for (first_word, second_word), count in bigram_counts.items():
    G.add_edge(first_word, second_word, weight=count)

# Plot the network graph
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G)  # Layout algorithm for graph visualization
nx.draw_networkx(G, pos, with_labels=True, node_color='lightblue', node_size=2000, font_size=12, edge_color='gray')
labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)
plt.title('Bigram Network Graph')
plt.show()


In [None]:
'_'.join(corpus_bigrams[0][0].split(' '))

In [None]:
# corpus_bigrams is a list of documents, each a list of "word1 word2" strings.
# The join therefore has to run per bigram inside each document; calling
# .split on the per-document list raised AttributeError.
b = [
    ['_'.join(bigram.split(' ')) for bigram in doc_bigrams]
    for doc_bigrams in corpus_bigrams
]

In [None]:
long_string = ' '.join(['_'.join(bigram.split(' ')) for doc_bigrams in corpus_bigrams for bigram in doc_bigrams])
long_string

In [None]:
joined_bigrams = ['_'.join(bigram.split(' ')) for doc_bigrams in corpus_bigrams for bigram in doc_bigrams]
joined_bigrams

In [None]:
corpus_bigrams

In [None]:
# Top 20 (term, weight) pairs per topic, unformatted so the terms can be unpacked
topic_terms = lda_model_gensim.show_topics(num_topics=num_topics, num_words=20, formatted=False)

# One summary string per topic: its terms joined by spaces, weights dropped
topic_summaries = [
    " ".join(word for word, _weight in weighted_terms)
    for _topic_id, weighted_terms in topic_terms
]

In [None]:
topic_summaries