Import all the necessary libraries

In [None]:
import re
from pathlib import Path

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim_models
import requests
import streamlit as st
from bs4 import BeautifulSoup
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from transformers import pipeline

# Streamlit re-executes this script on every interaction, so only call the
# downloader when the stopword corpus is not already available locally.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

Function to scrape review data from websites

In [None]:
def _text_or_na(node):
    """Return the stripped text of a parsed HTML node, or 'N/A' if it is missing."""
    return node.get_text(strip=True) if node else 'N/A'


# Function to scrape reviews from a single Amazon URL
def scrape_reviews(url):
    """Fetch *url* and extract every review found on the page.

    Args:
        url: Address of the page to download.

    Returns:
        A list of dicts with the keys 'Rating', 'Review Text', 'User' and
        'Source URL'. A field that is absent from a review is reported as
        'N/A'. The list is empty when the request fails or the page has no
        ``data-hook="review"`` containers.

    Errors are reported through ``st.error`` instead of being raised, so one
    bad URL does not abort a batch of URLs.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # Collected rows; later turned into a DataFrame by the caller.
    reviews_data = []
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Without this, an HTTP error page (403/404/503) would be parsed as
        # a page with zero reviews and the failure would go unreported.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each review lives in a <div data-hook="review"> container.
        for review in soup.find_all('div', {'data-hook': 'review'}):
            reviews_data.append({
                'Rating': _text_or_na(
                    review.find('i', {'data-hook': 'review-star-rating'})),
                'Review Text': _text_or_na(
                    review.find('span', {'data-hook': 'review-body'})),
                'User': _text_or_na(
                    review.find('span', class_='a-profile-name')),
                'Source URL': url
            })
    # Broad on purpose: this is the UI boundary and any failure (network,
    # bad URL, parsing) should be shown to the user rather than crash the app.
    except Exception as e:
        st.error(f"Failed to scrape {url}: {e}")

    return reviews_data

APP LOOKS

In [None]:

# Streamlit UI: collect product URLs, scrape them on demand, show the results.
st.title("WebVoice: Know what your customers are saying quickly")
st.markdown("Enter one or more URLs below (one per line):")

input_text = st.text_area("Amazon Product URLs", height=150)

if st.button("Scrape Reviews"):
    # One URL per non-blank line; surrounding whitespace is ignored.
    urls = [line.strip() for line in input_text.splitlines() if line.strip()]
    all_reviews = []

    with st.spinner("Scraping reviews..."):
        for url in urls:
            all_reviews.extend(scrape_reviews(url))

    if all_reviews:
        df = pd.DataFrame(all_reviews)
        st.success(f"Scraped {len(all_reviews)} reviews from {len(urls)} links.")
        st.dataframe(df)
        csv = df.to_csv(index=False)
        st.download_button("Download as CSV", csv, "amazon_reviews.csv", "text/csv")
    else:
        st.warning("No reviews found or scraping failed.")
        # The loop variable `url` is unbound when no URLs were entered, so
        # only offer the link when at least one URL was attempted.
        if urls:
            # NOTE(review): links to the last attempted URL, as the original
            # draft did with `url` — confirm this is the intended target.
            st.link_button("Learn More", urls[-1])


In case there are no reviews or scraping fails, offer a "Learn More" help section

In [None]:
# Help document shown on demand when scraping yields nothing.
errorscrape_learnmore = Path("LEARNMORESCRAPE.md")

# `st.warning(...)` always renders and cannot act as a condition, so the help
# text is tied to the button click alone.
if st.button("Learn More"):
    try:
        # Read the content of the help file
        readme_content = errorscrape_learnmore.read_text(encoding="utf-8")
    except OSError as e:
        # A missing or unreadable file should not crash the whole app.
        st.error(f"Could not load {errorscrape_learnmore}: {e}")
    else:
        # Display content
        st.markdown(readme_content)

In [None]:
# Offer a "Learn More" link after a scrape attempt that produced no reviews.
# `all_reviews` and `url` are only bound while the "Scrape Reviews" branch
# runs, so they are looked up defensively to avoid a NameError on other reruns.
_scraped_reviews = globals().get("all_reviews")
_last_scraped_url = globals().get("url")
if _scraped_reviews is not None and not _scraped_reviews and _last_scraped_url:
    st.warning("No reviews found or scraping failed.")
    # NOTE(review): the draft passed `url` (the last scraped URL) as the link
    # target — confirm that is the page "Learn More" should open.
    st.link_button("Learn More", _last_scraped_url)

In [None]:
# Sidebar section headings. Markdown only treats `#` as a heading marker when
# it is followed by a space; "#Topic Model" would render as literal text.
st.sidebar.markdown("# Topic Model")
st.sidebar.markdown("# Sentiment Analysis")
st.sidebar.markdown("# Both")

In [None]:
# Free-text box for extra stopwords supplied by the user.
custom_stopwords = st.text_area(
    "insert custom stopwords:",
    height=100,
)

In [None]:
# Streamlit App UI: LDA topic modelling over an uploaded reviews CSV.


def _first_present(columns, candidates):
    """Return the first name in *candidates* that occurs in *columns*, else None."""
    return next((name for name in candidates if name in columns), None)


def _run_topic_model(reviews, extra_stopwords_text):
    """Preprocess the reviews, train an LDA model and render the results.

    Args:
        reviews: DataFrame holding a review text column ('review_text' or
            'Review Text') and a rating column ('review_star' or 'Rating').
        extra_stopwords_text: User-entered stopwords, separated by commas
            and/or whitespace.

    Returns:
        None. Output is written to the Streamlit page; the function returns
        early with a message when the input cannot be modelled.
    """
    # Accept the original column names as well as the ones written by the
    # scraper's "Download as CSV" export.
    text_col = _first_present(reviews.columns, ("review_text", "Review Text"))
    rating_col = _first_present(reviews.columns, ("review_star", "Rating"))
    if text_col is None or rating_col is None:
        st.error(
            "The CSV must contain a review text column ('review_text' or "
            "'Review Text') and a rating column ('review_star' or 'Rating')."
        )
        return

    # Preprocessing

    st.subheader("1. Preprocessing Text")

    # Split the free-text box into whole words; testing `word in <str>` would
    # be a substring match and silently drop unrelated tokens.
    extra_stopwords = {
        word
        for word in re.split(r"[,\s]+", (extra_stopwords_text or "").lower())
        if word
    }
    stop_words = set(stopwords.words('english')) | extra_stopwords
    stemmer = SnowballStemmer("english")

    def preprocess(text):
        """Lowercase, keep letters only, drop stopwords and stem the rest."""
        text = str(text).lower()
        text = re.sub(r"[^a-z\s]", "", text)
        return [stemmer.stem(word) for word in text.split() if word not in stop_words]

    reviews['tokens'] = reviews[text_col].fillna("").apply(preprocess)

    dictionary = corpora.Dictionary(reviews['tokens'])
    if len(dictionary) == 0:
        # LDA cannot be trained on an empty vocabulary.
        st.warning("No usable words were left after preprocessing.")
        return
    corpus = [dictionary.doc2bow(text) for text in reviews['tokens']]

    # LDA Topic Modeling

    st.subheader("2. Training LDA Model")
    num_topics = st.slider("Select number of topics", min_value=5, max_value=60, value=10, step=5)

    lda_model = gensim.models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=1,
        passes=10,
        alpha='auto',
        per_word_topics=True
    )

    # Display Topics

    st.subheader("3. Extracted Topics")
    for idx, topic in lda_model.print_topics(-1):
        st.write(f"**Topic {idx}:** {topic}")

    # Topic Visualization

    st.subheader("4. Interactive Topic Visualization")
    with st.spinner("Generating visualization..."):
        vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
        pyLDAvis.save_html(vis, 'lda_vis.html')
        # Context manager so the file handle is closed after reading.
        with open('lda_vis.html', 'r', encoding='utf-8') as html_file:
            st.components.v1.html(html_file.read(), height=800)

    # Topic and Rating Correlation

    st.subheader("5. Topic Prevalence by Rating")

    # Fill a dense (documents x topics) matrix by topic id, so a document for
    # which a topic is omitted from the result still yields a full-width row.
    topic_matrix = np.zeros((len(corpus), num_topics))
    for doc_idx, bow in enumerate(corpus):
        for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0):
            topic_matrix[doc_idx, topic_id] = prob

    topic_dist_df = pd.DataFrame(topic_matrix, columns=[f"Topic {i}" for i in range(num_topics)])
    # Positional assignment: rows of topic_matrix follow the order of `reviews`.
    topic_dist_df['Rating'] = reviews[rating_col].to_numpy()

    mean_topic_by_rating = topic_dist_df.groupby("Rating").mean()
    st.line_chart(mean_topic_by_rating)


# The reviews are read from an uploaded CSV: `pd.read_csv` needs a path or
# file-like object and cannot consume the in-memory list of scraped rows.
uploaded_file = st.file_uploader("Upload reviews CSV", type="csv")

if uploaded_file:
    reviews = pd.read_csv(uploaded_file)
    _run_topic_model(reviews, custom_stopwords)


In [None]:
# A second text area with the same label and options as an earlier one needs
# its own key; otherwise Streamlit rejects it as a duplicate element.
custom_stopwords = st.text_area(
    "insert custom stopwords:",
    height=100,
    key="sentiment_custom_stopwords",
)

In [None]:
@st.cache_resource
def _load_sentiment_pipeline():
    """Build the sentiment-analysis pipeline; cached so reruns reuse it."""
    return pipeline("sentiment-analysis")


# Streamlit re-executes the script on every interaction; without caching the
# model would be constructed again each time.
sentiment_pipeline = _load_sentiment_pipeline()

In [None]:
# Custom sentiment analyzer built on the transformer pipeline
def transformer_sentiment_analyzer(review):
    """Classify *review* as "negative" or "positive".

    Calls the module-level ``sentiment_pipeline`` with truncation enabled and
    inspects the label of its first result: "NEGATIVE" maps to "negative",
    every other label maps to "positive".
    """
    top_prediction = sentiment_pipeline(review, truncation=True)[0]
    return "negative" if top_prediction['label'] == "NEGATIVE" else "positive"

In [None]:

# This will download the model the first time it runs
@st.cache_resource
def load_model():
    """Return the sentiment-analysis pipeline, built once and then cached."""
    return pipeline("sentiment-analysis")


sentiment_analyzer = load_model()

# The scraped review list only exists during the rerun that scraped it and is
# a list, not text, so the text to classify is taken from a dedicated input.
review_input = st.text_area("Text to analyze", key="sentiment_input_text")

# Button to trigger sentiment analysis
if st.button("Analyze Sentiment"):
    if review_input.strip():
        # Run sentiment analysis on the input; truncate over-long text
        # instead of failing on the model's input limit.
        result = sentiment_analyzer(review_input, truncation=True)[0]

        # Display the result
        st.success(f"**Sentiment:** {result['label']} \n\n**Confidence:** {result['score']:.2%}")
    else:
        st.warning("Please enter some text before clicking the button.")

