In [None]:
"""
Heidi Jiang
charity_ride.ipynb
05/13/2025

This program analyzes three forum threads about charity bike rides by calculating
each thread's average sentiment polarity score and extracting its main topic with LDA.
"""

'\nHeidi Jiang\n05/13/2025\n\nThis program analyzes three forums regarding charity bike rides by calculating\nthe polarity scores and applying LDA analysis.\n'

In [1]:
# imports
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hyjiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# constants

# HTTP headers attached to every scraping request.
# NOTE(review): presumably a browser-like User-Agent is needed because the
# forums reject urllib's default one -- confirm.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Forum threads to analyze.
URL_50_PLUS = "https://www.bikeforums.net/fifty-plus-50/1193362-charity-rides.html"
URL_MIN_FUND = (
    "https://www.bikeforums.net/charity-events/"
    "1176088-charity-rides-minimum-fundraising-requirements.html"
)
URL_IMPACT = (
    "https://www.cyclingforums.com/threads/"
    "have-you-participated-in-charity-rides-before-if-yes-could-you-share-"
    "your-experience-and-its-impact-on-yourself-as-well-as-others.488013/"
)

# Regex for the `id` attribute of post-body <div>s; this is the same pattern
# the bikeforums.net scraping cells match on.
ID_PATTERN = r'^post_message_'


# NLTK's English stop words plus words that are frequent in these threads but
# carry no topical signal. Entries are lower-case with punctuation removed
# ("dont", "youre", ...) so they match the tokens produced by clean_text.
STOPWORDS = set(stopwords.words("english"))
CUSTOM_SW = [
    "ride", "rides", "riders", "quoteoriginally", "posted", "like",
    "one", "dont", "youre", "sure", "back", "time", "event", "events", "want",
    "lets", "ms", "every", "got", "get", "well", "truly", "ive", "theyre",
    "also", "im", "done", "impact", "experience", "charity",
]
ALL_SW = STOPWORDS | set(CUSTOM_SW)

In [3]:
# web scraping for first URL
req_50_plus = Request(URL_50_PLUS, headers=HEADERS)
# Use the response as a context manager so the HTTP connection is closed as
# soon as the page has been read and parsed (it was previously left open).
with urlopen(req_50_plus) as html_50_plus:
    bs_50_plus = BeautifulSoup(html_50_plus.read(), "html.parser")
# Collect every <div> whose id matches ID_PATTERN ("post_message_" prefix).
texts_50_plus = bs_50_plus.find_all("div", id=re.compile(ID_PATTERN))

In [4]:
# web scraping for second URL
req_min_fund = Request(URL_MIN_FUND, headers=HEADERS)
# Use the response as a context manager so the HTTP connection is closed as
# soon as the page has been read and parsed (it was previously left open).
with urlopen(req_min_fund) as html_min_fund:
    bs_min_fund = BeautifulSoup(html_min_fund.read(), "html.parser")
# Collect every <div> whose id matches ID_PATTERN ("post_message_" prefix).
texts_min_fund = bs_min_fund.find_all("div", id=re.compile(ID_PATTERN))

In [5]:
# web scraping for third URL
req_impact = Request(URL_IMPACT, headers=HEADERS)
# Use the response as a context manager so the HTTP connection is closed as
# soon as the page has been read and parsed (it was previously left open).
with urlopen(req_impact) as html_impact:
    bs_impact = BeautifulSoup(html_impact.read(), "html.parser")
# This site has no "post_message_" ids; select <div class="bbWrapper"> instead.
texts_impact = bs_impact.find_all("div", {"class": "bbWrapper"})

In [6]:
def polarity_score(texts):
    """
    Given a list of BeautifulSoup objects, calculate the polarity score

    Each element's text (via ``get_text()``) is scored with TextBlob and the
    scores are averaged with equal weight per element.

    Parameters:
        texts: iterable of objects exposing ``get_text()`` (e.g. the tags
            returned by ``BeautifulSoup.find_all``).

    Returns:
        float: mean of the per-element polarity scores, or 0.0 when ``texts``
        is empty (previously this case raised ZeroDivisionError).
    """
    sentiments = [TextBlob(text.get_text()).sentiment.polarity for text in texts]
    if not sentiments:
        # Nothing was scraped (e.g. the selector matched no posts); report a
        # neutral score rather than dividing by zero.
        return 0.0
    return sum(sentiments) / len(sentiments)

In [7]:
def clean_text(texts, stop_words=None):
    """
    Given a list of BeautifulSoup objects, clean the text

    For each element the text is extracted, lower-cased, stripped of every
    character that is not an ASCII letter or whitespace, split on whitespace,
    and filtered against the stop-word collection.

    Parameters:
        texts: iterable of objects exposing ``get_text(strip=True)`` (e.g. the
            tags returned by ``BeautifulSoup.find_all``).
        stop_words: optional iterable of lower-case tokens to drop. Defaults
            to the notebook-level ``ALL_SW`` set when omitted.

    Returns:
        list[list[str]]: one token list per input element, in input order.
    """
    if stop_words is None:
        stop_words = ALL_SW
    # Set membership keeps the per-token filter O(1) even if a list is passed.
    stop_words = frozenset(stop_words)

    cleaned = []
    for t in texts:
        # NOTE: get_text(strip=True) joins adjacent text nodes with no
        # separator, so words from neighbouring tags can fuse (hence tokens
        # such as "quoteoriginally" in CUSTOM_SW). Left unchanged so the
        # existing stop-word list keeps matching.
        text = t.get_text(strip=True).lower()
        # Punctuation is deleted rather than replaced by a space, so
        # contractions collapse into one token ("don't" -> "dont").
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        cleaned.append([token for token in text.split() if token not in stop_words])
    return cleaned

In [8]:
def lda_model(texts, n_topics=1, n_top_words=5, random_state=0):
    """
    Given a list of BeautifulSoup objects, apply LDA analysis

    The texts are cleaned with ``clean_text``, converted to a document-term
    count matrix, and fitted with Latent Dirichlet Allocation.

    Parameters:
        texts: list of BeautifulSoup objects, one document per element.
        n_topics: number of LDA topics to fit.
        n_top_words: number of highest-weighted words to return per topic.
        random_state: seed passed to LatentDirichletAllocation so repeated
            runs give the same topics; pass None for an unseeded fit.

    Returns:
        list[list[str]]: for each topic, its top words ordered from highest
        to lowest weight.

    Raises:
        ValueError: propagated from CountVectorizer when no terms survive
            cleaning and the document-frequency pruning.
    """
    docs = [' '.join(tokens) for tokens in clean_text(texts)]
    # Document-frequency pruning is controlled by max_df / min_df.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2)
    dtm = vectorizer.fit_transform(docs)

    # Named `lda` (not `lda_model`) so the local does not shadow this function.
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    random_state=random_state)
    lda.fit(dtm)

    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for weights in lda.components_:
        # argsort is ascending; the reversed slice takes the n_top_words
        # largest weights, biggest first.
        top_words_idx = weights.argsort()[:-n_top_words - 1:-1]
        topics.append([feature_names[idx] for idx in top_words_idx])
    return topics

In [9]:
# results
# Thread descriptions and their scraped posts, kept in matching order.
descr = [
    "Bikers over 50",
    "Charity rides with minimum fundraising requirements",
    "Experience and impact of charity rides",
]
agg_texts = [texts_50_plus, texts_min_fund, texts_impact]

for label, texts in zip(descr, agg_texts):
    print(label)
    pol = polarity_score(texts)
    # lda_model is called with its default single topic; take that topic's words.
    main_topic_words = lda_model(texts)[0]
    topic = " ".join(main_topic_words)
    print(f"Polarity score: {pol:.2f}")
    print(f"Main topic: {topic}")
    print()

Bikers over 50
Polarity score: 0.12
Main topic: food support century club course

Charity rides with minimum fundraising requirements
Polarity score: 0.16
Main topic: minimum fee costs money people

Experience and impact of charity rides
Polarity score: 0.19
Main topic: community personal social change fitness

