In [None]:
pip install transformers torch pandas praw altair

In [3]:
import datetime
import os
import re

import altair as alt
import numpy as np
import pandas as pd
import praw  # Python Reddit API Wrapper
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Reddit API credentials (create an app at https://www.reddit.com/prefs/apps).
# Prefer the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET environment variables so real
# secrets are never saved into the notebook; the 'ENTER HERE' placeholders remain as
# a fallback for filling the values in by hand.
client_id = os.environ.get('REDDIT_CLIENT_ID', 'ENTER HERE')
client_secret = os.environ.get('REDDIT_CLIENT_SECRET', 'ENTER HERE')
user_agent = 'ucsb-sentiment v1.0 (by u/UCSBHealthWellness)'

# Fail here with a clear message rather than with an authentication error on the first request
if 'ENTER HERE' in (client_id, client_secret):
    raise ValueError(
        "Reddit API credentials are not set: export REDDIT_CLIENT_ID and "
        "REDDIT_CLIENT_SECRET, or replace the 'ENTER HERE' placeholders."
    )

reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

In [5]:
# https://praw.readthedocs.io/en/stable/getting_started/quick_start.html
# define function to pull posts
def get_subreddit_data(num_posts, subreddit_name='UCSantaBarbara', sort='hot', client=None):
    """Fetch up to ``num_posts`` submissions from one listing of a subreddit.

    Parameters
    ----------
    num_posts : int
        Maximum number of submissions to request (passed to the listing as ``limit``).
    subreddit_name : str, default 'UCSantaBarbara'
        Name of the subreddit to read from.
    sort : str, default 'hot'
        Which listing to pull: 'hot', 'new', 'rising', 'top' or 'controversial'.
    client : object, optional
        Object exposing ``subreddit(name)``. Defaults to the notebook-level ``reddit``
        client; pass a different client or a stub to run without the global.

    Returns
    -------
    list of dict
        One dict per submission with the keys 'title', 'content', 'score', 'id',
        'url' and 'created'.

    Raises
    ------
    ValueError
        If ``sort`` is not one of the supported listing names.
    """
    supported_listings = ('hot', 'new', 'rising', 'top', 'controversial')
    if sort not in supported_listings:
        raise ValueError(
            "sort must be one of {}, got {!r}".format(supported_listings, sort)
        )

    if client is None:
        client = reddit

    subreddit = client.subreddit(subreddit_name)
    # Look the listing method up by name (e.g. subreddit.hot, subreddit.new)
    listing = getattr(subreddit, sort)

    # Build and return a list with one record per post
    return [
        {
            'title': post.title,
            'content': post.selftext,
            'score': post.score,
            'id': post.id,
            'url': post.url,
            # created_utc is an epoch timestamp; fromtimestamp() without a tz
            # renders it in the local timezone of the machine running the notebook
            'created': datetime.datetime.fromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
        }
        for post in listing(limit=num_posts)
    ]

In [6]:
# call API to gather data
# call_posts is the maximum number of posts requested from the listing
call_posts = 200
subreddit_data = get_subreddit_data(call_posts)

In [7]:
# define text cleaning function

def preprocess_text(text):
    """Normalize raw post text to lowercase ASCII letters separated by single spaces.

    Steps, in order: strip URLs, drop every character that is not an ASCII letter
    or whitespace, lowercase, and collapse runs of whitespace.

    Parameters
    ----------
    text : str or None
        Raw title or body text. ``None`` and ``''`` are treated as empty.

    Returns
    -------
    str
        The cleaned text (``''`` when there is nothing left).
    """
    if not text:
        return ''

    # Remove URLs. The pattern is anchored to a word boundary and to a real URL
    # prefix ("http://", "https://" or "www.") so ordinary words that merely
    # contain "www" or "http" (e.g. "awwww") are not deleted.
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Collapse all extra whitespace
    return re.sub(r'\s+', ' ', text).strip()

In [8]:
# Clean the title and body of every fetched post in place
for post in subreddit_data:
    for field in ('title', 'content'):
        post[field] = preprocess_text(post[field])

In [9]:
# Loading snippet comes from the "Use in Transformers" button on the Hugging Face model page
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [10]:
# Tokenizer smoke test: encode one short sentence, padded/truncated to 20 tokens
tokens = tokenizer.encode_plus(
    "This is a test sentence",
    text_pair=None,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=20,
    return_token_type_ids=False,
    return_tensors='pt',
)

In [11]:
# Decode the ids back to text to confirm the special tokens and padding were added
tokenizer.decode(tokens['input_ids'][0])

2023-05-17 15:07:12.710192: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'<s>This is a test sentence</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [12]:
def predict_sentiment(text):
    """Classify ``text`` as 'negative', 'neutral' or 'positive'.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str
        Text to classify; it is truncated to at most 512 tokens.

    Returns
    -------
    str
        The label of the highest-scoring class.
    """
    # A single sequence needs no padding; padding each input to 512 tokens only
    # made the forward pass process pad tokens.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        logits = model(**inputs).logits

    # softmax is monotonic, so the argmax of the logits is the predicted class
    prediction = torch.argmax(logits, dim=1)

    # Convert prediction to sentiment label
    sentiment_map = {
        0: 'negative',
        1: 'neutral',
        2: 'positive'
    }

    # Convert the tensor to a Python integer
    return sentiment_map[prediction.item()]

In [13]:
# Score each post on its title and body joined by a single space
for post in subreddit_data:
    combined_text = ' '.join((post['title'], post['content']))
    post['sentiment'] = predict_sentiment(combined_text)

# Build one DataFrame row per post from the list of dicts
subreddit_df = pd.DataFrame(subreddit_data)

In [14]:
# Display the posts together with their predicted sentiment
subreddit_df

Unnamed: 0,title,content,score,id,url,created,sentiment
0,link updated spring groupme and discord list,access the spreadsheet here you can only acces...,11,12cpq9b,https://www.reddit.com/r/UCSantaBarbara/commen...,2023-04-05 09:25:45,neutral
1,ucsb transfer admissions megathread,class of field your questions here congrats pr...,17,12z0myr,https://www.reddit.com/r/UCSantaBarbara/commen...,2023-04-25 17:07:31,neutral
2,ucsb iv music scene at risk practice spaces fo...,,14,13kecyx,https://i.redd.it/72conrim1i0b1.jpg,2023-05-17 14:16:14,neutral
3,honors program application,does anyone who got into the honors program th...,7,13kch17,https://www.reddit.com/r/UCSantaBarbara/commen...,2023-05-17 13:03:54,negative
4,need help deciding ucsb data science vs umd cs,hi was wondering which school would be the bet...,4,13kaamu,https://www.reddit.com/r/UCSantaBarbara/commen...,2023-05-17 11:41:57,neutral
...,...,...,...,...,...,...,...
195,am i making a mistake by not searching for roo...,im an incoming freshman and i submitted my hou...,13,13dc5sj,https://www.reddit.com/r/UCSantaBarbara/commen...,2023-05-09 18:12:56,neutral
196,i need housing asap but i dont know where to l...,i am a third year ee transfer student m and lo...,9,13ddkx8,https://www.reddit.com/r/UCSantaBarbara/commen...,2023-05-09 19:17:45,negative
197,are the manzanita double rooms larger than the...,title,5,13dewrk,https://www.reddit.com/r/UCSantaBarbara/commen...,2023-05-09 20:23:12,neutral
198,pokemon go,yo is there a group of people here in iv who p...,23,13d2cf4,https://www.reddit.com/r/UCSantaBarbara/commen...,2023-05-09 11:50:14,neutral


In [15]:
## bar chart of sentiments ##
# Name the columns explicitly: the labels produced by value_counts().reset_index()
# differ between pandas 1.x ('index'/'sentiment') and 2.x ('sentiment'/'proportion'),
# and the encodings below rely on 'sentiment' and 'proportion'.
sentiment_counts = (
    subreddit_df['sentiment']
    .value_counts(normalize=True)
    .rename_axis('sentiment')
    .reset_index(name='proportion')
)

sentiment_chart = alt.Chart(sentiment_counts).mark_bar(color='#003660').encode(
    alt.X("sentiment:N", title="Sentiment"),
    alt.Y("proportion:Q", axis=alt.Axis(format='%', title='Percentage of Posts')),
    tooltip=[alt.Tooltip('sentiment:N', title='Sentiment'), alt.Tooltip('proportion:Q', format='.1%', title='Percentage')]
)

# Percentage labels drawn just above each bar
sentiment_text = sentiment_chart.mark_text(
    align='center',
    baseline='middle',
    dy=-10  # moves upwards
).encode(
    text=alt.Text('proportion:Q', format='.1%')
)

sentiment_chart = (sentiment_chart + sentiment_text).properties(
    title="Distribution of Sentiments",
    width=500,
    height=200
)

## histogram of post scores ##
score_histogram = alt.Chart(subreddit_df).mark_bar(color='#003660').encode(
    alt.X("score:Q", bin=alt.Bin(maxbins=30), title="Post Score"),
    alt.Y("count():Q", title="Number of Posts"),
    tooltip=["count()"]
).properties(
    title="Histogram of Post Scores",
    width=500,
    height=200
)

## scatter plot of post scores vs sentiment ##
scatter_plot = alt.Chart(subreddit_df).mark_circle(color='#003660').encode(
    alt.X("score:Q", title="Post Score"),
    alt.Y("sentiment:N", title="Sentiment"),
    tooltip=["id", "title", "score", "sentiment"]
).properties(
    title="Post Score // Sentiment",
    width=1000,
    height=200
)

# combine plots: bar chart and histogram side by side, scatter plot underneath
combined_plots = (sentiment_chart | score_histogram) & scatter_plot

# add title to the combined plots
combined_plots = combined_plots.properties(
    title={
        # Report the number of posts actually plotted; the listing can return
        # fewer posts than were requested.
        "text": ["Hot {} Posts from r/UCSantaBarbara".format(len(subreddit_df))],
        # The checkpoint loaded in this notebook is a RoBERTa model
        "subtitle": ["Sentiments determined using fine-tuned RoBERTa model"],
        "color": "black",
        "fontSize": 20,
        "subtitleColor": "gray",
        "subtitleFontSize": 16,
        "align": "center"
    }
).configure_title(orient='top', anchor='middle')

# display
combined_plots

In [16]:
# Show the normalized sentiment proportions that feed the bar chart
sentiment_counts

Unnamed: 0,sentiment,proportion
0,neutral,0.615
1,negative,0.235
2,positive,0.15
