In [19]:
import numpy as np
import pandas as pd
from datetime import datetime
from dateutil.tz import gettz

# for text processing
import re
from textblob import TextBlob

from pathlib import Path
# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='ticks', font_scale=1.5)

Root = Path('.').absolute().parent
# DATA = Root / r'C:\Users\Admin\Projects\ML Projects\ManipDetect\data'
DATA = Root/ r'C:\Users\krishnadas\Projects\ML Projects\ManipDetect\data'

In [7]:
filepath = DATA / 'reddit_wsb.csv'
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53187 entries, 0 to 53186
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      53187 non-null  object 
 1   score      53187 non-null  int64  
 2   id         53187 non-null  object 
 3   url        53187 non-null  object 
 4   comms_num  53187 non-null  int64  
 5   created    53187 non-null  float64
 6   body       24738 non-null  object 
 7   timestamp  53187 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 3.2+ MB


In [10]:
df.value_counts()

title                                                                                                                                                                                            score  id      url                                                                                                         comms_num  created       body                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [8]:
"""
Initialize with known manipulation periods for labeling

manipulation_periods: list of tuples [(start_date, end_date), ...]
Example: [('2021-01-13', '2021-01-15'), ('2021-01-25', '2021-01-28')]
"""
manipulation_periods = [
    ('2021-01-13', '2021-01-15'),  # Initial coordination buildup
    ('2021-01-25', '2021-01-28'),  # Peak manipulation period
    ('2021-02-01', '2021-02-05')   # Sustained artificial activity
]

# Convert to datetime objects
manipulation_periods = [
    (pd.to_datetime(start), pd.to_datetime(end)) 
    for start, end in manipulation_periods
]

In [14]:
def _clean_text(text):
    """Clean and normalize text data"""
    if pd.isna(text):
        return ""
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove special characters but keep spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text.lower()

In [15]:
"""Load Reddit data and perform initial cleaning"""

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Filter for GameStop-related posts
gme_keywords = ['GME', 'GameStop', 'Gamestop', 'GAMESTOP']
df['is_gme'] = df['title'].str.contains('|'.join(gme_keywords), case=False, na=False) | \
                df['body'].str.contains('|'.join(gme_keywords), case=False, na=False)

# Keep only GME-related posts
df = df[df['is_gme']].copy()

# Clean text data
df['title_clean'] = df['title'].apply(_clean_text)
df['body_clean'] = df['body'].fillna('').apply(_clean_text)
df['combined_text'] = df['title_clean'] + ' ' + df['body_clean']

df.sort_values('timestamp').reset_index(drop=True)
df.head()


Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,is_gme,title_clean,body_clean,combined_text
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10,True,math professor scott steiner says the numbers ...,,math professor scott steiner says the numbers ...
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,True,exit the system,the ceo of nasdaq pushed to halt trading to gi...,exit the system the ceo of nasdaq pushed to ha...
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57,True,new sec filing for gme can someone less retard...,,new sec filing for gme can someone less retard...
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56,True,not to distract from gme just thought our amc ...,,not to distract from gme just thought our amc ...
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,True,short stock doesn t have an expiration date,hedgefund whales are spreading disinfo saying ...,short stock doesn t have an expiration date he...


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15018 entries, 1 to 53183
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   title          15018 non-null  object        
 1   score          15018 non-null  int64         
 2   id             15018 non-null  object        
 3   url            15018 non-null  object        
 4   comms_num      15018 non-null  int64         
 5   created        15018 non-null  float64       
 6   body           10116 non-null  object        
 7   timestamp      15018 non-null  datetime64[ns]
 8   is_gme         15018 non-null  bool          
 9   title_clean    15018 non-null  object        
 10  body_clean     15018 non-null  object        
 11  combined_text  15018 non-null  object        
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2), object(7)
memory usage: 1.4+ MB


In [None]:
# Extract post features
"""Extract features from individual posts"""

# Sentiment analysis
df['sentiment'] = df['combined_text'].apply(
    lambda x: TextBlob(x).sentiment.polarity if x else 0
)
df['sentiment_magnitude'] = df['combined_text'].apply(
    lambda x: abs(TextBlob(x).sentiment.polarity) if x else 0
)

# Text length features
df['title_length'] = df['title_clean'].str.len()
df['body_length'] = df['body_clean'].str.len()
df['total_length'] = df['title_length'] + df['body_length']

# Engagement features
df['engagement_score'] = df['score'] + df['comms_num']
df['score_per_comment'] = df['score'] / (df['comms_num'] + 1)  # Avoid division by zero

# Author features (anonymized but trackable patterns)
df['author_post_count'] = df.groupby('author')['author'].transform('count')
