## 1. Install dependencies

In [18]:
#!pip install praw nltk gensim pyLDAvis

## 2. Imports & NLTK setup

In [19]:
import praw
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

from gensim import corpora, models

# Optional visualization
import pyLDAvis.gensim_models

# Download NLTK data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## 3. Reddit API authentication

In [20]:
CLIENT_ID     = "DHJ6i6-NjssQ5b5SPIqJvw"
CLIENT_SECRET = "NkUebd7YS1ZzQLhzVq_9dyyYHOyDEQ"
USER_AGENT    = "script:aston.scraper:v1.0 (by u/Sad-State-7591)"

reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT
)

## 4. Fetch comments and upvote scores

In [21]:
def fetch_comments_for_post(url, reddit_instance):
    submission = reddit_instance.submission(url=url)
    submission.comments.replace_more(limit=None)
    comments = submission.comments.list()
    
    records = []
    for c in comments:
        # skip deleted or empty bodies
        if c.body and c.body.strip().lower() != '[deleted]':
            records.append({
                'comment_id': c.id,
                'body':       c.body,
                'upvotes':    c.score
            })
    return pd.DataFrame(records)

# Apply to your specific thread
post_url = "https://www.reddit.com/r/formula1/comments/17gd7t3/is_anyone_a_true_fan_of_aston_martin/"
df = fetch_comments_for_post(post_url, reddit)
df.head(10)

Unnamed: 0,comment_id,body,upvotes
0,k6fsjxc,As a general rule ([see full rules](https://ww...,1
1,k6ftymh,In the way that it's basically currently Jorda...,427
2,k6ft9fz,Maybe some hardcore Jordan fans that never left?,348
3,k6fxo40,You can count me as one. Seb being a part of t...,111
4,k6ft2ak,"Somewhere, someone, maybe, perhaps...\n\nBut n...",170
5,k6fvpge,"""Aston"", no, not particularly. But as others s...",58
6,k6g2a5w,Iam a fan of the team but not because of the A...,27
7,k6fwveo,"Probably there are a Aston Martin fans, the th...",74
8,k6hjd9y,"I’m not ashamed to admit it, when I was a kid,...",19
9,k6h27mx,Luv me British racin' green\n\nLuv me James Bo...,37


## 5. Sentiment scoring with VADER

In [22]:
sent_analyzer = SentimentIntensityAnalyzer()

def attach_sentiment(df, text_col='body'):
    scores = df[text_col].apply(sent_analyzer.polarity_scores)
    return pd.concat([df, pd.DataFrame(list(scores))], axis=1)

df = attach_sentiment(df)
df[['body','upvotes','compound','neg','neu','pos']].head(10)

Unnamed: 0,body,upvotes,compound,neg,neu,pos
0,As a general rule ([see full rules](https://ww...,1,0.9272,0.034,0.82,0.146
1,In the way that it's basically currently Jorda...,427,0.4019,0.0,0.899,0.101
2,Maybe some hardcore Jordan fans that never left?,348,0.0,0.0,1.0,0.0
3,You can count me as one. Seb being a part of t...,111,0.9127,0.054,0.732,0.215
4,"Somewhere, someone, maybe, perhaps...\n\nBut n...",170,0.0,0.0,1.0,0.0
5,"""Aston"", no, not particularly. But as others s...",58,0.9558,0.026,0.674,0.299
6,Iam a fan of the team but not because of the A...,27,0.926,0.0,0.816,0.184
7,"Probably there are a Aston Martin fans, the th...",74,0.8647,0.029,0.824,0.148
8,"I’m not ashamed to admit it, when I was a kid,...",19,0.9967,0.035,0.71,0.255
9,Luv me British racin' green\n\nLuv me James Bo...,37,0.0,0.0,1.0,0.0


## 6. Text preprocessing for LDA

In [23]:
# Build stopword set
stop_words = set(stopwords.words('english'))
stop_words |= {"’","…","—","“","”","–"}  # add punctuation tokens

def tokenize_and_clean(text):
    # skip sent_tokenize, just do raw tokenization
    tokens = word_tokenize(text.lower(), preserve_line=True)
    return [
        t for t in tokens
        if t.isalpha() and t not in stop_words and len(t) > 2
    ]

df['tokens'] = df['body'].map(tokenize_and_clean)

In [24]:
# Inspect token lists
df[['body','tokens']].head(5)

Unnamed: 0,body,tokens
0,As a general rule ([see full rules](https://ww...,"[general, rule, see, full, rules, https, stand..."
1,In the way that it's basically currently Jorda...,"[way, basically, currently, jordan, theres, so..."
2,Maybe some hardcore Jordan fans that never left?,"[maybe, hardcore, jordan, fans, never, left]"
3,You can count me as one. Seb being a part of t...,"[count, seb, part, team, helped, always, fond,..."
4,"Somewhere, someone, maybe, perhaps...\n\nBut n...","[somewhere, someone, maybe, perhaps, without, ..."


## 7. Build corpus & dictionary

In [25]:
dictionary = corpora.Dictionary(df['tokens'])
# Filter out words that appear in <5 documents or in >50% of docs
dictionary.filter_extremes(no_below=5, no_above=0.5)

corpus = [dictionary.doc2bow(toks) for toks in df['tokens']]

## 8. Train LDA model

In [26]:
NUM_TOPICS = 4

lda = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=10,
    random_state=42
)

for topic_id, terms in lda.show_topics(formatted=False):
    top_terms = ", ".join([w for w,_ in terms])
    print(f"Topic {topic_id}: {top_terms}")


Topic 0: team, like, cars, fan, stroll, alonso, fans, martin, aston, love
Topic 1: fan, team, would, alonso, racing, one, aston, force, point, india
Topic 2: team, car, aston, think, company, green, would, martin, lawrence, long
Topic 3: team, aston, like, fan, martin, teams, really, fans, drivers, stroll


## 9. pyLDAvis visualization

In [27]:
vis_data = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)

## 10. Split into positive vs. negative comments

In [28]:
# Define your thresholds (based on VADER’s compound score)
pos_threshold =  0.05
neg_threshold = -0.05

# Filter DataFrame
df_pos = df[df['compound'] >= pos_threshold].copy()
df_neg = df[df['compound'] <= neg_threshold].copy()

print(f"Positive comments: {len(df_pos)}")
print(f"Negative comments: {len(df_neg)}")

Positive comments: 198
Negative comments: 48


## 11. Build separate dictionaries & corpora

In [29]:
def build_dict_corpus(token_lists, no_below=3, no_above=0.5):
    d = corpora.Dictionary(token_lists)
    d.filter_extremes(no_below=no_below, no_above=no_above)
    c = [d.doc2bow(toks) for toks in token_lists]
    return d, c

# You can adjust filtering parameters if one subset is much smaller
dict_pos, corpus_pos = build_dict_corpus(df_pos['tokens'], no_below=3, no_above=0.5)
dict_neg, corpus_neg = build_dict_corpus(df_neg['tokens'], no_below=3, no_above=0.5)

## 12. Train two LDA models

In [32]:
NUM_TOPICS = 3  # or choose separately for each

lda_pos = models.LdaModel(
    corpus=corpus_pos,
    id2word=dict_pos,
    num_topics=NUM_TOPICS,
    passes=10,
    random_state=42
)

lda_neg = models.LdaModel(
    corpus=corpus_neg,
    id2word=dict_neg,
    num_topics=NUM_TOPICS,
    passes=10,
    random_state=42
)

## 13. Inspect topics for positive vs. negative

In [33]:
print("=== Positive Comments Topics ===")
for tid, terms in lda_pos.show_topics(formatted=False):
    print(f"Topic {tid}: {', '.join(w for w,_ in terms)}")

print("\n=== Negative Comments Topics ===")
for tid, terms in lda_neg.show_topics(formatted=False):
    print(f"Topic {tid}: {', '.join(w for w,_ in terms)}")

=== Positive Comments Topics ===
Topic 0: like, team, fans, aston, martin, would, really, stroll, think, car
Topic 1: would, team, think, also, alonso, could, quite, good, season, see
Topic 2: team, fan, aston, martin, racing, like, one, alonso, india, force

=== Negative Comments Topics ===
Topic 0: team, fan, alonso, aston, would, martin, care, really, stroll, get
Topic 1: team, india, force, people, lost, since, midland, racing, fan, may
Topic 2: like, stroll, teams, driver, pay, much, one, team, hard, though
