### Libraries

In [10]:
import numpy as np
import pandas as pd
from datetime import datetime
# from dateutil.tz import gettz

# for text processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from textblob import TextBlob

from pathlib import Path
# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: seaborn's 'ticks' style with font sizes scaled by 1.5.
sns.set_theme(style='ticks', font_scale=1.5)

# Project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is run from a direct subfolder of the
# project root (e.g. <root>/notebooks) -- confirm against the repository layout.
Root = Path('.').absolute().parent
SCRIPTS = Root / 'scripts'
# Resolve the data folder relative to the project root instead of a
# machine-specific absolute path: joining Root with an absolute Windows path
# discards Root on Windows and produces a bogus path on other systems.
DATA = Root / 'data'

In [12]:
# Load the scraped r/Wallstreetbetsnew posts and preview the first rows.
posts_csv = DATA / 'wallstreetbetsnew_posts_2.csv'
df = pd.read_csv(posts_csv)
print(f"Data shape: {df.shape}")
df.head()

Data shape: (6297, 11)


Unnamed: 0,post_id,title,text,post_type,author_name,author_id,score,num_comments,created_utc,url,created_datetime
0,1lu3q54,Thoughts on PLL stock?,Merger news impact on PLL? DO YOU HAVE ANY INS...,text,Still-Great89,1hpnylgt70,0,6,1751917000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...,2025-07-07 19:37:00
1,1ltzbl9,Worksport’s SOLIS Pilot with Fortune 500 Giant...,Worksport (WKSP) just locked in a major valida...,text,DenisEchoField,1rwjrgb4sc,10,0,1751907000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...,2025-07-07 16:50:55
2,1ltwsp7,SOLIS Countdown + Fortune-500 Pilot-Worksport ...,"Clean-tech skeptics, meet hard proof: a top-15...",text,DenisEchoField,1rwjrgb4sc,2,0,1751901000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...,2025-07-07 15:14:42
3,1ltvyqb,Tariff-Proof Growth Engine Goes Viral — Worksp...,Latest numbers crushed expectations. Unit outp...,text,3421431boom,766ues2,16,0,1751899000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...,2025-07-07 14:42:25
4,1ltjdp7,BMNR & RGC Explode! 🔥 Massive Moves You Can’t ...,\n🚀 $BMNR and $RGC are blasting off – and this...,text,Mino3621,rqvik0bj,2,1,1751857000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...,2025-07-07 02:58:19


In [4]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

post_id           0
title             0
text              0
post_type         0
author_name       0
author_id       387
score             0
num_comments      0
created_utc       0
url               0
dtype: int64


387 posts have no `author_id`, which suggests that those authors' accounts have since been deleted or suspended.

In [5]:
# Convert the epoch-second timestamps to datetimes, then report the
# earliest and latest post in the dataset.
created_at = pd.to_datetime(df['created_utc'], unit='s')
df['created_utc'] = created_at
print(created_at.min(), created_at.max())

2021-01-26 14:12:31 2025-07-08 13:18:25


In [8]:
# Number of rows per post_id, largest first; a count above 1 means the same
# post appears in the dataset more than once.
rows_per_post = df.groupby('post_id').size()
rows_per_post.sort_values(ascending=False).head(10)

post_id
1il6h9l    8
1ioj7u9    8
1imcyro    8
1ltcd94    8
1lkwdkr    8
1ifyuy9    8
1iy1jnk    8
1j3givt    8
1jbu2ne    8
1jk6j1f    8
dtype: int64

In [9]:
# Show every row recorded for post_id 1il6h9l to inspect the repeated entries.
is_target_post = df['post_id'].eq('1il6h9l')
df.loc[is_target_post].head(10)

Unnamed: 0,post_id,title,text,post_type,author_name,author_id,score,num_comments,created_utc,url
502,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
1161,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
1836,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,1,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
3457,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
3967,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
4504,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,1,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
5010,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,1,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
6159,1il6h9l,¡! new discord !¡,started a discord community where we discuss t...,text,In4thelongrun_,a0so7n65,0,36,2025-02-09 04:19:29,https://www.reddit.com/r/Wallstreetbetsnew/com...
