### Libraries

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
# from dateutil.tz import gettz

# for text processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from textblob import TextBlob

from pathlib import Path
# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Set the seaborn theme for the plots in this notebook: 'ticks' axes style
# with fonts scaled up by 1.5x.
sns.set_theme(style='ticks', font_scale=1.5)

# Project directories, all resolved relative to the project root so the
# notebook runs on any machine/OS without editing user-specific paths.
# Root is the parent of the current working directory, i.e. this assumes the
# notebook is run from a direct subfolder of the project root — adjust Root
# if your layout differs.
Root = Path('.').absolute().parent
SCRIPTS = Root / 'scripts'
# Previously DATA was Root joined with an absolute 'C:\Users\...' path: on
# Windows the absolute part silently discards Root, and elsewhere it produces
# a meaningless relative path. Keep it relative to Root, like SCRIPTS.
DATA = Root / 'data'

In [3]:
# Load the scraped posts CSV from the data directory and preview it.
posts_csv = DATA / 'wallstreetbetsnew_posts.csv'
df = pd.read_csv(posts_csv)
n_rows_cols = df.shape
print(f"Data shape: {n_rows_cols}")
df.head()

Data shape: (4954, 10)


Unnamed: 0,post_id,title,text,post_type,author_name,author_id,score,num_comments,created_utc,url
0,11dk0dd,The Ultimate Free Course for Options Trading,# Here’s a free resource for options trading I...,text,AlphaGiveth,bjb5f1tl,271,71,1677525000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...
1,1lujzds,NVIDIA Blackwell Ultra chip commercialized,It is learned that NVIDIA (NVDA) and CoreWeave...,text,Hawdet,lokj08bz,3,0,1751966000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...
2,1luock6,Timeline to Liftoff-Key Dates Every $WKSP Watc...,Late July: Mid-pilot check-in with constructio...,text,DenisEchoField,1rwjrgb4sc,1,0,1751981000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...
3,1ltvyqb,Tariff-Proof Growth Engine Goes Viral — Worksp...,Latest numbers crushed expectations. Unit outp...,text,3421431boom,766ues2,17,0,1751899000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...
4,1ltzbl9,Worksport’s SOLIS Pilot with Fortune 500 Giant...,Worksport (WKSP) just locked in a major valida...,text,DenisEchoField,1rwjrgb4sc,8,0,1751907000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...


In [4]:
# Count missing values in each column (isna is the alias of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

post_id           0
title             0
text              0
post_type         0
author_name       0
author_id       492
score             0
num_comments      0
created_utc       0
url               0
dtype: int64


`author_id` is the only column with missing values (492 posts). These posts most likely come from accounts that have since been deleted or suspended.

In [5]:
# Convert the epoch-seconds 'created_utc' column to datetimes, then report
# the earliest and latest post timestamps in the dataset.
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
earliest = df['created_utc'].min()
latest = df['created_utc'].max()
print(earliest, latest)

2021-01-26 14:12:31 2025-07-09 09:08:41


In [None]:
# Number of rows per post_id, largest first: any count above 1 means the
# same post appears more than once in the data. Show the top ten.
(
    df.groupby('post_id')
      .size()
      .sort_values(ascending=False)
      .head(10)
)

post_id
1ipv4u5    5
1lknrj7    5
1iprgwk    5
1kp6t87    5
1ky39lx    5
1jj2wvf    5
1jvz454    5
1jwjqiu    5
1i6adlx    5
1ih6t19    5
dtype: int64

: 

In [9]:
# Show (up to) the first 10 rows recorded for post_id '1il6h9l'.
is_target_post = df['post_id'].eq('1il6h9l')
df.loc[is_target_post].head(10)

Unnamed: 0,post_id,title,text,post_type,author_name,author_id,score,num_comments,created_utc,url
502,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
1161,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
1836,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,1,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
3457,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
3967,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
4504,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,1,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
5010,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,1,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
6159,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
