In [127]:
import pandas as pd

In [128]:
# rename file if its different for you
data = pd.read_csv('../data/reddits_posts_3_years.csv')

data.head()
data.shape
data.columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1236 entries, 0 to 1235
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      1235 non-null   object 
 1   title                     1236 non-null   object 
 2   selftext                  1236 non-null   object 
 3   author                    1236 non-null   object 
 4   created_utc               1236 non-null   float64
 5   score                     1236 non-null   int64  
 6   num_comments              1236 non-null   int64  
 7   subreddit                 1236 non-null   object 
 8   link_flair_text           1235 non-null   object 
 9   url                       1131 non-null   object 
 10  combined_text             1236 non-null   object 
 11  4o-mini-sentiment         1236 non-null   int64  
 12  combined_text_middle      1236 non-null   object 
 13  4o-mini-sentiment-middle  1236 non-null   int64  
dtypes: float

In [129]:
# let's leave only the columns we want to train on
processed_data = data[['combined_text', 'link_flair_text', '4o-mini-sentiment']]

In [130]:
import re
import string
import nltk
import joblib
from nltk.corpus import stopwords
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [131]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/koselev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [132]:
# tokenize and process posts to remove redundant words
def process_reddit_post(post):
    post = re.sub(r'https?:\/\/.*[\r\n]*', '', post)
    post = re.sub(r'#', '', post)
    post = post.translate(str.maketrans('', '', string.punctuation))
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    post_tokens = tokenizer.tokenize(post)
    post_clean = [word for word in post_tokens if word not in stop_words and word not in string.punctuation]
    stemmer = PorterStemmer()
    post_stemmed = [stemmer.stem(word) for word in post_clean]
    return post_stemmed  

In [133]:
processed_data['processed_text'] = processed_data['combined_text'].apply(process_reddit_post)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['processed_text'] = processed_data['combined_text'].apply(process_reddit_post)


In [134]:
processed_data['processed_text'] = processed_data['processed_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

vectorizer = TfidfVectorizer(max_features=5000)

processed_data = processed_data.dropna(subset=['combined_text'])

X = vectorizer.fit_transform(processed_data['combined_text'])
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['processed_text'] = processed_data['processed_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)


['../models/tfidf_vectorizer.pkl']

In [135]:
# specify target column
y = data['4o-mini-sentiment']

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [137]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [138]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8266129032258065
              precision    recall  f1-score   support

          -1       1.00      0.05      0.09        21
           0       0.82      0.99      0.90       189
           1       0.94      0.42      0.58        38

    accuracy                           0.83       248
   macro avg       0.92      0.49      0.52       248
weighted avg       0.85      0.83      0.78       248



In [139]:
cherry_picked_post = processed_data['combined_text'].iloc[2]

# Cherry pick one post from the data frame and see how the trained model performs
cherry_picked_post = processed_data['combined_text'].iloc[2]  
cherry_picked_post_vectorized = vectorizer.transform([cherry_picked_post]) 

predicted_sentiment = model.predict(cherry_picked_post_vectorized)
print(f"Cherry picked post: {cherry_picked_post}")
print(f"Predicted sentiment: {predicted_sentiment[0]}")


Cherry picked post: Title: Stadium seating question for today | Selftext: Morning! We haven't been to a game since having kids but are finally going back today with our oldest. Are portable stadium seats allowed, advisable, helpful? Fwiw, this is for a 9 yo - one one hand I need him to learn to enjoy the experience even with uncomfortable seats, but on the other I need him to remember his first Tech game experience as amazing. Thanks if you have any advice! | Flair: Question
Predicted sentiment: 0


In [140]:
import joblib

model_filename = '../models/logistic_regression_model.pkl'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")


Model saved to ../models/logistic_regression_model.pkl


In [142]:
# print all negative posts
for i in range(500):
    post = processed_data['combined_text'].iloc[i]
    post_vectorized = vectorizer.transform([post])
    predicted_sentiment = model.predict(post_vectorized)
    if predicted_sentiment[0] == -1:
        print(f"Negative post: {post}")


Negative post: Title: Why does Skiles smell like vomit | Selftext: It’s been smelling like that for the past month or two. | Flair: Rant
Negative post: Title: GT needs to drop their contracts with BEST/GardaWorld security.  | Selftext: BEST has been an overall plague on so many GT events over the past few years. With the recent discussion on athletics troubles and career fair issues, I just wanted to publish my thoughts on things I’ve seen happen.  Every year we play at home for the uGA football game, the student section gets flooded and BEST “rent a cops” have always been belligerent to students, blocking them from their seats.  Last year during the UNC football game, students stormed the field. The entire stadium atmosphere changed 1-2 minutes before the end of the game, and it was obvious it was coming. BEST security body checked and tackled the first few field stormers, despite 1000s more behind them. They injured at least 2 doing this, instead of dropping back to protect players a