In [1]:
import pandas as pd

In [2]:
# specify here path to your data
data = pd.read_csv('../data/reddit_posts_3_years.csv')

data.head()
data.shape
data.columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1718 entries, 0 to 1717
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             1691 non-null   object 
 1   title            1718 non-null   object 
 2   selftext         1270 non-null   object 
 3   author           1718 non-null   object 
 4   created_utc      1718 non-null   float64
 5   score            1718 non-null   int64  
 6   num_comments     1718 non-null   int64  
 7   subreddit        1718 non-null   object 
 8   link_flair_text  1717 non-null   object 
 9   url              1613 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 134.3+ KB


In [3]:
# let's see what possible reddit tags do we have for a post
unique_flair_texts = data['link_flair_text'].unique()
print(unique_flair_texts)

['Question' 'Sports' 'Discussion' 'Rant' 'Social/Club' 'Survey/Study/Poll'
 'Meme/Shitpost' 'Announcement' 'Photo' 'Other' 'YOUR MOD SPEAKS' 'News'
 nan 'Job Listing' 'MEGATHREAD' 'Meta' '(Misreported) News' 'MegaThread'
 'Social' 'Survey' 'Meme/shitpost']


In [4]:
# and how many words on average do we have in a posts
non_empty_selftext = data['selftext'].dropna()
average_word_count = non_empty_selftext.apply(lambda x: len(x.split())).mean()
print(f"Average number of words in non-empty 'selftext' columns: {average_word_count}")


Average number of words in non-empty 'selftext' columns: 71.52992125984252


In [5]:
# lets inspect average number of chars in post to calculate later the cost for using 4o-mini model
average_char_count = non_empty_selftext.apply(len).mean()
print(f"Average number of characters in non-empty 'selftext' columns: {average_char_count}")


Average number of characters in non-empty 'selftext' columns: 428.88188976377955


In [98]:
# total chars across all posts
total_char_count = non_empty_selftext.apply(len).sum()
print(f"Total number of characters in 'selftext' column: {total_char_count}")


Total number of characters in 'selftext' column: 544680


In [7]:
# let's inspect posts that don't contain any text at all
empty_selftext_posts = data[data['selftext'].isna()]
top_10_empty_selftext_posts = empty_selftext_posts["url"].head(10)
print(top_10_empty_selftext_posts)

3                  https://i.redd.it/epw2y828kvzd1.jpeg
9                  https://i.redd.it/vqesp9lvpnzd1.jpeg
12    https://www.reddit.com/r/gatech/comments/1glh4...
26                  https://i.redd.it/12zirh5at4zd1.png
31                 https://i.redd.it/a6yq8b5nl3zd1.jpeg
32               https://www.reddit.com/gallery/1gk7yds
36                 https://i.redd.it/58hqahveiwyd1.jpeg
37                  https://i.redd.it/pxvn6lcbmzyd1.png
42                  https://i.redd.it/uld58v2k8ztd1.png
49     https://youtu.be/0-2EXrw09Uw?si=mTXWlDeT46nMk6N_
Name: url, dtype: object


In [100]:
# let's see how many input chars we'll have for a combined text (title | text | reddit tag)
non_empty_selftext_df = data[data['selftext'].notna()]

total_combined_text_char_count = non_empty_selftext_df.apply(
    lambda row: len(f"Title: {row['title']} | Selftext: {row['selftext']} | Flair: {row['link_flair_text']}"), axis=1
).sum()

print(f"Total number of characters in combined text: {total_combined_text_char_count}")


Total number of characters in combined text: 659033


In [10]:
from openai import OpenAI 
import os

In [18]:
# load your .env file and don't forget to put your OpenAI API key there
%load_ext dotenv
%dotenv

In [20]:
# specify the model you want to be using 
MODEL="gpt-4o-mini"
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<your OpenAI API key if not set as an env var>"))

In [34]:
neutral_question = non_empty_selftext_df['combined_text'].iloc[2]
completion = client.chat.completions.create(
  model=MODEL,
  messages=[
    {"role": "system", "content": "You are a sentiment analysis classifier for a subreddit of Georgia Tech. Most of the most have sentiment so you rarely pick a neutral option, mostly it's either positive or negative. Sometimes, it's neutral mostly when it's a question. Answer only with a single digit: 1 for positive, -1 for negative, 0 for neutral"},
    {"role": "user", "content": neutral_question}  
  ]
)
print(neutral_question)
print("Assistant: " + completion.choices[0].message.content)

In [41]:
positive_post = non_empty_selftext_df['combined_text'].iloc[3]
completion = client.chat.completions.create(
  model=MODEL,
  messages=[
    {"role": "system", "content": "You are a sentiment analysis classifier for a subreddit of Georgia Tech. Most of the most have sentiment so you rarely pick a neutral option, mostly it's either positive or negative. Sometimes, it's neutral mostly when it's a question. Answer only with a single digit: 1 for positive, -1 for negative, 0 for neutral"}, 
    {"role": "user", "content": positive_post}  
  ]
)
print(positive_post)
print("Assistant: " + completion.choices[0].message.content)

In [43]:
# let's create a function to process every row in DF and apply it for the whole dataset
def get_sentiment(text):
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a sentiment analysis classifier for a subreddit of Georgia Tech. Most of the most have sentiment so you rarely pick a neutral option, mostly it's either positive or negative. Sometimes, it's neutral mostly when it's a question. Answer only with a single digit: 1 for positive, -1 for negative, 0 for neutral"}, # <-- This is the system message that provides context to the model
            {"role": "user", "content": text}
        ]
    )
    return int(completion.choices[0].message.content)


non_empty_selftext_df['4o-mini-sentiment'] = non_empty_selftext_df['combined_text'].apply(get_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_empty_selftext_df['4o-mini-sentiment'] = non_empty_selftext_df['combined_text'].apply(get_sentiment)


In [48]:
# it took 12 minutes and around $0.04 for all requests, let's see sentiment stats
sentiment_counts = non_empty_selftext_df['4o-mini-sentiment'].value_counts()
print(sentiment_counts)


4o-mini-sentiment
 0    952
 1    187
-1    131
Name: count, dtype: int64


In [54]:
# let's try to minimize cost now by including only the middle part of the text 
def extract_middle_third(text):
    length = len(text)
    start = length // 3
    end = 2 * length // 3
    return text[start:end]

non_empty_selftext_df['combined_text_middle'] = non_empty_selftext_df.apply(
    lambda row: f"Title: {row['title']} | Selftext: {extract_middle_third(row['selftext'])} | Flair: {row['link_flair_text']}", axis=1
)

print(non_empty_selftext_df['combined_text_middle'].iloc[4])

Title: How do so many people graduate one year early from Georgia Tech?  | Selftext: people from my class are graduating one year early, i thought  | Flair: Discussion


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_empty_selftext_df['combined_text_middle'] = non_empty_selftext_df.apply(


In [60]:
middle_neutral_question = non_empty_selftext_df['combined_text_middle'].iloc[2]
completion = client.chat.completions.create(
  model=MODEL,
  messages=[
    {"role": "system", "content": "You are a sentiment analysis classifier for a subreddit of Georgia Tech. Most of the most have sentiment so you rarely pick a neutral option, mostly it's either positive or negative. Sometimes, it's neutral mostly when it's a question. Answer only with a single digit: 1 for positive, -1 for negative, 0 for neutral"}, # <-- This is the system message that provides context to the model
    {"role": "user", "content": middle_neutral_question}  # <-- This is the user message for which the model will generate a response
  ]
)
print(middle_neutral_question)
print("Assistant: " + completion.choices[0].message.content)

Assistant: 0


In [65]:
negative_post = non_empty_selftext_df['combined_text_middle'].iloc[6]
completion = client.chat.completions.create(
  model=MODEL,
  messages=[
    {"role": "system", "content": "You are a sentiment analysis classifier for a subreddit of Georgia Tech. Most of the most have sentiment so you rarely pick a neutral option, mostly it's either positive or negative. Sometimes, it's neutral mostly when it's a question. Answer only with a single digit: 1 for positive, -1 for negative, 0 for neutral"}, # <-- This is the system message that provides context to the model
    {"role": "user", "content": negative_post}  # <-- This is the user message for which the model will generate a response
  ]
)
print(negative_post)
print("Assistant: " + completion.choices[0].message.content)

Assistant: -1


In [87]:
positive_post = non_empty_selftext_df['combined_text_middle'].iloc[11]
completion = client.chat.completions.create(
  model=MODEL,
  messages=[
    {"role": "system", "content": "You are a sentiment analysis classifier for a subreddit of Georgia Tech. Most of the most have sentiment so you rarely pick a neutral option, mostly it's either positive or negative. Sometimes, it's neutral mostly when it's a question. Answer only with a single digit: 1 for positive, -1 for negative, 0 for neutral"},
    {"role": "user", "content": positive_post}  
  ]
)
print(positive_post)
print("Assistant: " + completion.choices[0].message.content)

Assistant: 0


In [88]:
# let's run this evaluation on the middle part of the post for all entries
non_empty_selftext_df['4o-mini-sentiment-middle'] = non_empty_selftext_df['combined_text_middle'].apply(get_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_empty_selftext_df['4o-mini-sentiment-middle'] = non_empty_selftext_df['combined_text_middle'].apply(get_sentiment)


In [89]:
# let's compare how different the result is with the full-text analysis
same_values_count = (non_empty_selftext_df['4o-mini-sentiment'] == non_empty_selftext_df['4o-mini-sentiment-middle']).sum()
different_values_count = (non_empty_selftext_df['4o-mini-sentiment'] != non_empty_selftext_df['4o-mini-sentiment-middle']).sum()

print(f"Number of same values: {same_values_count}")
print(f"Number of different values: {different_values_count}")


Number of same values: 1180
Number of different values: 90


In [90]:
# if needed, we can save the different posts for further analysis
different_sentiment_df = non_empty_selftext_df[non_empty_selftext_df['4o-mini-sentiment'] != non_empty_selftext_df['4o-mini-sentiment-middle']]
# different_sentiment_df.to_csv('different_sentiment_posts.csv', index=False)


In [95]:
# let's examine manually the post where 4o-mini had different sentiments for the full text and middle of the text
random_20_posts = different_sentiment_df.sample(n=10, random_state=1)
for index, row in random_20_posts.iterrows():
    print(f"Title: {row['title']}")
    print(f"4o-mini-sentiment: {row['4o-mini-sentiment']}")
    print(f"4o-mini-sentiment-middle: {row['4o-mini-sentiment-middle']}")
    print(f"Link: {row['url']}\n")


Title: Here is a list of important people and things that did not exist the last time Georgia Tech Football entered a game with a winning record on December 26, 2018
4o-mini-sentiment: 1
4o-mini-sentiment-middle: 0
Link: https://www.reddit.com/r/gatech/comments/y9akuf/here_is_a_list_of_important_people_and_things/

Title: Why do free t shirt people not stock enough mediums?
4o-mini-sentiment: -1
4o-mini-sentiment-middle: 0
Link: https://www.reddit.com/r/gatech/comments/u8stwx/why_do_free_t_shirt_people_not_stock_enough/

Title: Fearful and Anxious for CS 1331 Summer 23 with Landry,advice and help would be appreciated
4o-mini-sentiment: -1
4o-mini-sentiment-middle: 0
Link: https://www.reddit.com/r/gatech/comments/13hua77/fearful_and_anxious_for_cs_1331_summer_23_with/

Title: I‘m trying to get in contact with undergrad admission, not ever getting through
4o-mini-sentiment: 0
4o-mini-sentiment-middle: -1
Link: https://www.reddit.com/r/gatech/comments/yx685i/im_trying_to_get_in_contact_wi

In [102]:
# save sentiment results to a new (or the old) file
non_empty_selftext_df = non_empty_selftext_df.drop_duplicates(subset='name')
non_empty_selftext_df.to_csv('../data/reddit_posts_3_years.csv', index=False)
