# Reddit Sentiment Anlysis: 
## London low-traffic Neighborhood zones


1. Data "Aquisition"
---

Reddit r/London scraping script
- accesses json endpoint 
- looks for given keywords 
- gets given number of posts + 5 comments each

In [None]:
import requests
import pandas as pd 
import time
from datetime import datetime
from urllib.parse import quote 

URL = "https://www.reddit.com/r/london"
keywords = ["low traffic neighbourhood", "low-traffic neighbourhood", "LTN", "low-traffic zones"]
Headers = {'User-Agent': 'LTN_Sentiment_Analysis (University of Amsterdam)'}

results = []
seen_posts = set()

def search_keyword(keyword):
    url = f"{URL}/search.json?q={quote(keyword)}&restrict_sr=1&sort=relevance&limit=100&t=all"
    return requests.get(url, headers=Headers).json()['data']['children']

def get_comments(post_id):
    url = f"{URL}/comments/{post_id}.json"
    data = requests.get(url, headers=Headers).json()
    return data[1]['data']['children'] if len(data) >= 2 else []

def process_comments(comment_data, post_id, post_title, max_comments=5):
    comments = []
    for item in comment_data:
        if len(comments) >= max_comments:
            break
        if item.get('kind') == 't1':
            comment = item['data']
            body = comment.get('body', '')
            if body and body not in ['[removed]', '[deleted]']:
                comments.append({
                    'type': 'comment',
                    'title': '',
                    'created_utc': datetime.fromtimestamp(comment.get('created_utc', 0)).strftime('%Y-%m-%d %H:%M:%S'),
                    'body': body,
                    'num_comments': '',
                    'url': f"{URL}{comment.get('permalink', '')}",
                    'keyword_matched': '',
                    'parent_post_title': post_title,
                })
            if len(comments) < max_comments:
                replies = comment.get('replies', {})
                if isinstance(replies, dict):
                    comments.extend(process_comments(replies['data']['children'], post_id, post_title, max_comments - len(comments)))
    return comments



Nice Progress Bar by chat gpt :)

In [1]:
# Reusable progress bar function
def print_progress_bar(current, total, bar_length=40, label="Progress"):
    percent = current / total
    filled = int(bar_length * percent)
    bar = '█' * filled + '░' * (bar_length - filled)
    print(f"\r  {label}: [{bar}] {current}/{total}", end='', flush=True)


Main Function
- currently keeping small dataset (20 posts + 5 comments each per keyword), might extend later 

In [None]:
POSTS_PER_KEYWORD = 20  # Single control point for limit

def main():
    print(f"Starting json scraping script --> Collecting {POSTS_PER_KEYWORD} posts for every keyword with 5 comments each\n")

    for idx, keyword in enumerate(keywords, 1):
        print(f"Keyword [{idx}/{len(keywords)}]: '{keyword}'")
        posts = search_keyword(keyword)
        keyword_count = 0
        
        for post_item in posts:
            if keyword_count >= POSTS_PER_KEYWORD:
                break
            if post_item.get('kind') != 't3':
                continue
            post = post_item['data']
            post_id = post.get('id', '')
            if post_id in seen_posts or not post_id:
                continue
            seen_posts.add(post_id)
            keyword_count += 1
            
            results.append({
                'type': 'post',
                'title': post.get('title', ''),
                'created_utc': datetime.fromtimestamp(post.get('created_utc', 0)).strftime('%Y-%m-%d %H:%M:%S'),
                'body': post.get('selftext', ''),
                'num_comments': post.get('num_comments', 0),
                'url': f"{URL}{post.get('permalink', '')}",
                'keyword_matched': keyword,
                'parent_post_title': '',
            })
            
            if post.get('num_comments', 0) > 0:
                time.sleep(1)
                comments = process_comments(get_comments(post_id), post_id, post.get('title', ''))
                results.extend(comments)
            
            print_progress_bar(keyword_count, POSTS_PER_KEYWORD, label="Progress")
            time.sleep(1)
        
        print("\n")
    
    df = pd.DataFrame(results)
    df.to_csv("ltn_london_reddit_scraped.csv", index=False, encoding='utf-8')
    print(f"✓ Saved {len(df[df['type'] == 'post'])} posts and {len(df[df['type'] == 'comment'])} comments")


if __name__ == "__main__":
    main()


2. Cleaning Data
---

Use AI  to check if each post is geniunely about LTN 

- script only saves context relevant post and their comments 
- small local AI model (llama3.2:3b --> best for my laptop)

In [None]:
import pandas as pd
import ollama
import time

INPUT_FILE = 'ltn_london_reddit_scraped.csv'
OUTPUT_FILE = 'filtered_ltn_reddit.csv'
MODEL_NAME = 'llama3.2:3b'  # Optimized for 8GB RAM (3x better than 1b)

df = pd.read_csv(INPUT_FILE)
print(f"Loaded {len(df)} entries. Filtering with AI...\n")


def filter_ltn():
    filtered = []
    for idx, row in df.iterrows():
        title = str(row.get('title', '')) if pd.notna(row.get('title')) else ""
        body = str(row.get('body', '')) if pd.notna(row.get('body')) else ""
        
        prompt = f"""Is this post about Low Traffic Neighbourhoods (LTN) in London UK?

Title: {title}
Text: {body[:500]}

Answer ONLY 'yes' or 'no'."""
        
        response = ollama.generate(model=MODEL_NAME, prompt=prompt, options={'temperature': 0.2, 'num_predict': 5})
        
        if 'yes' in response['response'].strip().lower():
            filtered.append(row)
        
        print_progress_bar(idx + 1, len(df), label="Filtering")
        time.sleep(0.3)
    
    return pd.DataFrame(filtered)


filtered_df = filter_ltn()
filtered_df.to_csv(OUTPUT_FILE, index=False)
print(f"\n\n✓ Saved {len(filtered_df)} entries to {OUTPUT_FILE}")


3. Sentiment Analysis 
---


Basic sentiment Analysis with Vader

In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
df = pd.read_csv('filtered_ltn_reddit.csv')
print(f"Analyzing {len(df)} entries...\n")

sentiments = []
for idx, row in df.iterrows():
    text = str(row.get('title', '')) if row['type'] == 'post' else str(row.get('body', ''))
    if not text or text == 'nan':
        text = str(row.get('body', ''))
    
    score = analyzer.polarity_scores(text)['compound']
    
    if score >= 0.05:
        sentiments.append('Positive')
    elif score <= -0.05:
        sentiments.append('Negative')
    else:
        sentiments.append('Neutral')

counts = pd.Series(sentiments).value_counts()
total = len(sentiments)
print(f"\nSentiment Distribution:\n  Positive: {counts.get('Positive', 0)/total*100:.1f}%\n  Neutral:  {counts.get('Neutral', 0)/total*100:.1f}%\n  Negative: {counts.get('Negative', 0)/total*100:.1f}%")


Analyzing 212 entries...


Sentiment Distribution:
  Positive: 38.7%
  Neutral:  17.0%
  Negative: 44.3%
