### Libraries

In [16]:
from dotenv import load_dotenv
import os
import neptune
import praw
from datetime import datetime, timedelta, timezone
import json
import time
import pandas as pd
from tqdm import tqdm
import regex as re

# for text processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from textblob import TextBlob

from pathlib import Path
Root = Path('.').absolute().parent
SCRIPTS = Root / r'scripts'
# SCRIPTS = Root / r'C:\Users\Admin\Projects\ML Projects\ManipDetect\research\scripts'
DATA = Root/ r'C:\Users\krishnadas\Projects\ML Projects\ManipDetect\data'

In [2]:
df = pd.read_csv(DATA / 'submissions_reddit.csv.zip', compression='zip')
df.head()

Unnamed: 0,id,author,created,retrieved,edited,pinned,archived,locked,removed,deleted,...,link_flair_text,upvote_ratio,score,gilded,total_awards_received,num_comments,num_crossposts,selftext,thumbnail,shortlink
0,ko124i,[deleted],2021-01-01 00:02:06,2021-02-02 21:52:13,1970-01-01 00:00:00,0,0,0,1,1,...,Gain,1.0,34,0,1,14,0,[deleted],default,https://redd.it/ko124i
1,ko12uq,[deleted],2021-01-01 00:03:20,2021-02-02 21:52:13,1970-01-01 00:00:00,0,0,0,1,1,...,Gain,1.0,2,0,0,0,0,[deleted],default,https://redd.it/ko12uq
2,ko13df,[deleted],2021-01-01 00:04:11,2021-02-02 21:52:13,1970-01-01 00:00:00,0,0,0,1,1,...,Meme,0.88,13,0,0,7,0,[deleted],default,https://redd.it/ko13df
3,ko17yf,shirotimatim,2021-01-01 00:11:51,2021-02-02 21:52:13,1970-01-01 00:00:00,0,0,0,0,0,...,News,0.92,183,0,0,26,0,,https://b.thumbs.redditmedia.com/LmWdNB0W1qHT0...,https://redd.it/ko17yf
4,ko1a4i,WSBVoteBot,2021-01-01 00:15:38,2021-02-02 21:52:13,1970-01-01 00:00:00,0,0,0,0,0,...,,0.5,0,0,0,19,0,Every time a new submission is posted to walls...,self,https://redd.it/ko1a4i


In [3]:
df['author'].value_counts().head(10)
# df['created_utc'] = pd.to_datetime(df['created'])

author
[deleted]               140686
Onboarding92               785
OPINION_IS_UNPOPULAR       630
RealPennyMuncher           204
Janto_2021                 196
DrioMarqui                 185
dailystockalert            170
Starlight-786              166
VisualMod                  161
pdwp90                     157
Name: count, dtype: int64

In [4]:
# Convert created column to datetime if it's not already
df_wsb = df.copy()
df_wsb['created'] = pd.to_datetime(df_wsb['created'])

In [5]:
"""
Initialize with known manipulation periods for labeling

manipulation_periods: list of tuples [(start_date, end_date), ...]
Example: [('2021-01-13', '2021-01-15'), ('2021-01-25', '2021-01-28')]
"""
manipulation_periods = [
    ('2021-01-13', '2021-01-15'),  # Initial coordination buildup
    ('2021-01-25', '2021-01-28'),  # Peak manipulation period
    ('2021-02-01', '2021-02-05')   # Sustained artificial activity
]

# Convert to datetime objects
manipulation_periods = [
    (pd.to_datetime(start), pd.to_datetime(end)) 
    for start, end in manipulation_periods
]

In [6]:
# select authors from manipulation periods
def select_authors_in_manipulation_periods(df, manipulation_periods):
    authors = set()
    for start, end in manipulation_periods:
        mask = (df['created'] >= start) & (df['created'] <= end)
        authors.update(df.loc[mask, 'author'].unique())
    return authors

selected_authors = select_authors_in_manipulation_periods(df_wsb, manipulation_periods)

In [7]:
# check if the selected authors contain deleted
deleted_authors = [authors for authors in selected_authors if authors == '[deleted]']
deleted_authors_count = len(deleted_authors)
print(f"Number of deleted authors in selected authors: {deleted_authors_count}")

Number of deleted authors in selected authors: 1


In [8]:
df_wsb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775326 entries, 0 to 775325
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   id                     775326 non-null  object        
 1   author                 775326 non-null  object        
 2   created                775326 non-null  datetime64[ns]
 3   retrieved              775326 non-null  object        
 4   edited                 775326 non-null  object        
 5   pinned                 775326 non-null  int64         
 6   archived               775326 non-null  int64         
 7   locked                 775326 non-null  int64         
 8   removed                775326 non-null  int64         
 9   deleted                775326 non-null  int64         
 10  is_self                775326 non-null  int64         
 11  is_video               775326 non-null  int64         
 12  is_original_content    775326 non-null  int6

In [9]:
df_wsb.columns

Index(['id', 'author', 'created', 'retrieved', 'edited', 'pinned', 'archived',
       'locked', 'removed', 'deleted', 'is_self', 'is_video',
       'is_original_content', 'title', 'link_flair_text', 'upvote_ratio',
       'score', 'gilded', 'total_awards_received', 'num_comments',
       'num_crossposts', 'selftext', 'thumbnail', 'shortlink'],
      dtype='object')

In [10]:
# remove unnecessary columns
columns_to_remove = ['retrieved', 'edited', 'gilded', 'edited', 'pinned', 'archived','locked', 'total_awards_received',
                    'num_crossposts', 'thumbnail', 'link_flair_text']
df_wsb.drop(columns=columns_to_remove, inplace=True, errors='ignore')
print(df_wsb.columns)
df_wsb.head()

Index(['id', 'author', 'created', 'removed', 'deleted', 'is_self', 'is_video',
       'is_original_content', 'title', 'upvote_ratio', 'score', 'num_comments',
       'selftext', 'shortlink'],
      dtype='object')


Unnamed: 0,id,author,created,removed,deleted,is_self,is_video,is_original_content,title,upvote_ratio,score,num_comments,selftext,shortlink
0,ko124i,[deleted],2021-01-01 00:02:06,1,1,1,0,0,"3k - 170k since March (Also, buy LIT!!)",1.0,34,14,[deleted],https://redd.it/ko124i
1,ko12uq,[deleted],2021-01-01 00:03:20,1,1,0,0,0,Got out of PLTR calls after learning about IV ...,1.0,2,0,[deleted],https://redd.it/ko12uq
2,ko13df,[deleted],2021-01-01 00:04:11,1,1,0,0,0,Hell of a headline,0.88,13,7,[deleted],https://redd.it/ko13df
3,ko17yf,shirotimatim,2021-01-01 00:11:51,0,0,0,0,0,"Top popular stocks on WSB too! Also, why they ...",0.92,183,26,,https://redd.it/ko17yf
4,ko1a4i,WSBVoteBot,2021-01-01 00:15:38,0,0,1,0,0,WSBVoteBot Log for Jan 01 2021,0.5,0,19,Every time a new submission is posted to walls...,https://redd.it/ko1a4i


In [None]:
col_names_to_change = ['id', 'created', 'selftext', 'num_comments', 'shortlink']
new_col_names = ['post id', 'timestamp', 'body', 'url']
df_wsb.rename(columns=dict(zip(col_names_to_change, new_col_names)), inplace=True)

In [12]:
df_wsb['deleted'].value_counts(), df_wsb['removed'].value_counts(), df_wsb['is_original_content'].value_counts()

(deleted
 0    685664
 1     89662
 Name: count, dtype: int64,
 removed
 1    698970
 0     76356
 Name: count, dtype: int64,
 is_original_content
 0    760265
 1     15061
 Name: count, dtype: int64)

In [13]:
def _clean_text(text):
    """Clean and normalize text data"""
    if pd.isna(text):
        return ""
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove special characters but keep spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text.lower()

In [14]:
"""Load Reddit data and perform initial cleaning"""

# Convert timestamp to datetime
df_wsb['timestamp'] = pd.to_datetime(df_wsb['timestamp'])

# Filter for GameStop-related posts
gme_keywords = ['GME', 'GameStop', 'Gamestop', 'GAMESTOP']
df_wsb['is_gme'] = df_wsb['title'].str.contains('|'.join(gme_keywords), case=False, na=False) | \
                df_wsb['body'].str.contains('|'.join(gme_keywords), case=False, na=False)

# Keep only GME-related posts
df_wsb = df_wsb[df_wsb['is_gme']].copy()

# Clean text data
df_wsb['title_clean'] = df_wsb['title'].apply(_clean_text)
df_wsb['body_clean'] = df_wsb['body'].fillna('').apply(_clean_text)
df_wsb['combined_text'] = df_wsb['title_clean'] + ' ' + df_wsb['body_clean']

df_wsb.sort_values('timestamp').reset_index(drop=True)
df_wsb.head()


Unnamed: 0,post id,author,timestamp,removed,deleted,is_self,is_video,is_original_content,title,upvote_ratio,score,num_comments,body,url,is_gme,title_clean,body_clean,combined_text
5,ko1bnp,dluther93,2021-01-01 00:18:03,1,0,1,0,0,What would make GME shorts win?,1.0,1,0,[removed],https://redd.it/ko1bnp,True,what would make gme shorts win,removed,what would make gme shorts win removed
12,ko1kck,[deleted],2021-01-01 00:32:49,1,1,0,0,0,Not sure how reliable a random comment in Cohe...,0.93,80,33,[deleted],https://redd.it/ko1kck,True,not sure how reliable a random comment in cohe...,deleted,not sure how reliable a random comment in cohe...
21,ko1ttx,WSBProfitProphet,2021-01-01 00:49:32,1,0,1,0,0,🚀🚀🚀🚀How have we been so fucking blind? GME is ...,1.0,1,0,[removed],https://redd.it/ko1ttx,True,how have we been so fucking blind gme is liter...,removed,how have we been so fucking blind gme is liter...
25,ko1xxb,WSBProfitProphet,2021-01-01 00:56:35,0,0,1,0,0,GME is the Rockets 🚀🚀🚀🚀,0.82,57,10,"Gamestop colors: Red, White and Black\n\nHoust...",https://redd.it/ko1xxb,True,gme is the rockets,gamestop colors red white and black houston ro...,gme is the rockets gamestop colors red white a...
26,ko1zs3,WhiskeySierra1984,2021-01-01 00:59:55,1,0,0,0,0,Looked out the window of my PLTR/GME rocketshi...,0.9,75,8,,https://redd.it/ko1zs3,True,looked out the window of my pltr gme rocketshi...,,looked out the window of my pltr gme rocketshi...


In [15]:
df_wsb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 107499 entries, 5 to 775258
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   post id              107499 non-null  object        
 1   author               107499 non-null  object        
 2   timestamp            107499 non-null  datetime64[ns]
 3   removed              107499 non-null  int64         
 4   deleted              107499 non-null  int64         
 5   is_self              107499 non-null  int64         
 6   is_video             107499 non-null  int64         
 7   is_original_content  107499 non-null  int64         
 8   title                107499 non-null  object        
 9   upvote_ratio         107499 non-null  float64       
 10  score                107499 non-null  int64         
 11  num_comments         107499 non-null  int64         
 12  body                 75958 non-null   object        
 13  url                

In [18]:
# Extract post features
"""Extract features from individual posts"""

# Sentiment analysis
df_wsb['sentiment'] = df_wsb['combined_text'].apply(
    lambda x: TextBlob(x).sentiment.polarity if x else 0
)
df_wsb['sentiment_magnitude'] = df_wsb['combined_text'].apply(
    lambda x: abs(TextBlob(x).sentiment.polarity) if x else 0
)
df_wsb['subjectivity'] = df_wsb['combined_text'].apply(
    lambda x: TextBlob(x).sentiment.subjectivity
)

# Text length features
df_wsb['title_length'] = df_wsb['title_clean'].str.len()
df_wsb['body_length'] = df_wsb['body_clean'].str.len()
df_wsb['total_length'] = df_wsb['title_length'] + df_wsb['body_length']

# Engagement features
df_wsb['engagement_score'] = df_wsb['score'] + df_wsb['comms_num']
df_wsb['score_per_comment'] = df_wsb['score'] / (df_wsb['comms_num'] + 1)  # Avoid division by zero

# Author features (anonymized but trackable patterns)
df_wsb['author_post_count'] = df_wsb.groupby('author_id')['author_id'].transform('count')
df_wsb.head()


KeyError: 'comms_num'

In [None]:
# Calculate text similarity within time windows
window_hours=1
# Group posts by hour windows
df['hour_window'] = df['timestamp'].dt.floor(f'{window_hours}H')