### Libraries

In [5]:
import numpy as np
import pandas as pd
from datetime import datetime
from dateutil.tz import gettz

# for text processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from textblob import TextBlob

from pathlib import Path
# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='ticks', font_scale=1.5)

Root = Path('.').absolute().parent
DATA = Root / r'C:\Users\Admin\Projects\ML Projects\ManipDetect\data'
# DATA = Root/ r'C:\Users\krishnadas\Projects\ML Projects\ManipDetect\data'

In [6]:
filepath = DATA / 'reddit_wsb_with_authors.csv'
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,title,score,post_id,url,comms_num,created,body,timestamp,author_name,author_id
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41,[deleted],[deleted]
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10,jaxxtothemaxx,onvag
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,stonerbobo,f3p9m
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57,Sleavitt10,9u7y1
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56,di3_b0ld,13cexg


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53187 entries, 0 to 53186
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        53187 non-null  object 
 1   score        53187 non-null  int64  
 2   post_id      53187 non-null  object 
 3   url          53187 non-null  object 
 4   comms_num    53187 non-null  int64  
 5   created      53187 non-null  float64
 6   body         24738 non-null  object 
 7   timestamp    53187 non-null  object 
 8   author_name  50623 non-null  object 
 9   author_id    50623 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 4.1+ MB


In [10]:
df['author_name'].value_counts()

author_name
[deleted]               19087
OPINION_IS_UNPOPULAR      373
pdwp90                     81
AutoModerator              61
dhiral1994                 58
                        ...  
brokebroker90               1
Lleoki                      1
Nitoryuhunter               1
warriormitch                1
vF101                       1
Name: count, Length: 22887, dtype: int64

In [11]:
"""
Initialize with known manipulation periods for labeling

manipulation_periods: list of tuples [(start_date, end_date), ...]
Example: [('2021-01-13', '2021-01-15'), ('2021-01-25', '2021-01-28')]
"""
manipulation_periods = [
    ('2021-01-13', '2021-01-15'),  # Initial coordination buildup
    ('2021-01-25', '2021-01-28'),  # Peak manipulation period
    ('2021-02-01', '2021-02-05')   # Sustained artificial activity
]

# Convert to datetime objects
manipulation_periods = [
    (pd.to_datetime(start), pd.to_datetime(end)) 
    for start, end in manipulation_periods
]

In [12]:
def _clean_text(text):
    """Clean and normalize text data"""
    if pd.isna(text):
        return ""
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove special characters but keep spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text.lower()

In [13]:
"""Load Reddit data and perform initial cleaning"""

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Filter for GameStop-related posts
gme_keywords = ['GME', 'GameStop', 'Gamestop', 'GAMESTOP']
df['is_gme'] = df['title'].str.contains('|'.join(gme_keywords), case=False, na=False) | \
                df['body'].str.contains('|'.join(gme_keywords), case=False, na=False)

# Keep only GME-related posts
df = df[df['is_gme']].copy()

# Clean text data
df['title_clean'] = df['title'].apply(_clean_text)
df['body_clean'] = df['body'].fillna('').apply(_clean_text)
df['combined_text'] = df['title_clean'] + ' ' + df['body_clean']

df.sort_values('timestamp').reset_index(drop=True)
df.head()


Unnamed: 0,title,score,post_id,url,comms_num,created,body,timestamp,author_name,author_id,is_gme,title_clean,body_clean,combined_text
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10,jaxxtothemaxx,onvag,True,math professor scott steiner says the numbers ...,,math professor scott steiner says the numbers ...
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,stonerbobo,f3p9m,True,exit the system,the ceo of nasdaq pushed to halt trading to gi...,exit the system the ceo of nasdaq pushed to ha...
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57,Sleavitt10,9u7y1,True,new sec filing for gme can someone less retard...,,new sec filing for gme can someone less retard...
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56,di3_b0ld,13cexg,True,not to distract from gme just thought our amc ...,,not to distract from gme just thought our amc ...
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,Aiken_Drumn,6g1d5,True,short stock doesn t have an expiration date,hedgefund whales are spreading disinfo saying ...,short stock doesn t have an expiration date he...


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15018 entries, 1 to 53183
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   title          15018 non-null  object        
 1   score          15018 non-null  int64         
 2   post_id        15018 non-null  object        
 3   url            15018 non-null  object        
 4   comms_num      15018 non-null  int64         
 5   created        15018 non-null  float64       
 6   body           10116 non-null  object        
 7   timestamp      15018 non-null  datetime64[ns]
 8   author_name    14295 non-null  object        
 9   author_id      14295 non-null  object        
 10  is_gme         15018 non-null  bool          
 11  title_clean    15018 non-null  object        
 12  body_clean     15018 non-null  object        
 13  combined_text  15018 non-null  object        
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2), object(9)
memory usage

In [17]:
TextBlob(df.loc[39, 'combined_text']).sentiment.subjectivity

0.4965277777777778

In [18]:
# Extract post features
"""Extract features from individual posts"""

# Sentiment analysis
df['sentiment'] = df['combined_text'].apply(
    lambda x: TextBlob(x).sentiment.polarity if x else 0
)
df['sentiment_magnitude'] = df['combined_text'].apply(
    lambda x: abs(TextBlob(x).sentiment.polarity) if x else 0
)
df['subjectivity'] = df['combined_text'].apply(
    lambda x: TextBlob(x).sentiment.subjectivity
)

# Text length features
df['title_length'] = df['title_clean'].str.len()
df['body_length'] = df['body_clean'].str.len()
df['total_length'] = df['title_length'] + df['body_length']

# Engagement features
df['engagement_score'] = df['score'] + df['comms_num']
df['score_per_comment'] = df['score'] / (df['comms_num'] + 1)  # Avoid division by zero

# Author features (anonymized but trackable patterns)
df['author_post_count'] = df.groupby('author_id')['author_id'].transform('count')
df.head()


Unnamed: 0,title,score,post_id,url,comms_num,created,body,timestamp,author_name,author_id,...,combined_text,sentiment,sentiment_magnitude,subjectivity,title_length,body_length,total_length,engagement_score,score_per_comment,author_post_count
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10,jaxxtothemaxx,onvag,...,math professor scott steiner says the numbers ...,0.0,0.0,0.0,80,0,80,133,4.583333,1.0
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,stonerbobo,f3p9m,...,exit the system the ceo of nasdaq pushed to ha...,0.016187,0.016187,0.348753,15,1156,1171,47,0.0,1.0
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57,Sleavitt10,9u7y1,...,new sec filing for gme can someone less retard...,-0.232576,0.232576,0.480303,73,0,73,103,0.386667,3.0
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56,di3_b0ld,13cexg,...,not to distract from gme just thought our amc ...,0.25,0.25,0.25,78,0,78,227,0.452229,2.0
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,Aiken_Drumn,6g1d5,...,short stock doesn t have an expiration date he...,-0.02,0.02,0.3625,43,648,691,370,5.87037,2.0


In [19]:
# Calculate text similarity within time windows
window_hours=1
# Group posts by hour windows
df['hour_window'] = df['timestamp'].dt.floor(f'{window_hours}H')

  df['hour_window'] = df['timestamp'].dt.floor(f'{window_hours}H')


In [20]:
df.groupby('hour_window').agg({'sentiment': 'mean',
                                'sentiment_magnitude': 'mean',
                                'subjectivity': 'mean',
                                'title_length': 'mean',
                                'body_length': 'mean'}).reset_index()

Unnamed: 0,hour_window,sentiment,sentiment_magnitude,subjectivity,title_length,body_length
0,2021-01-28 09:00:00,0.077098,0.141449,0.434277,69.100000,399.500000
1,2021-01-28 10:00:00,0.089336,0.142669,0.569091,60.000000,739.200000
2,2021-01-28 11:00:00,-0.039918,0.115682,0.476641,53.875000,762.250000
3,2021-01-28 12:00:00,0.058265,0.075210,0.503884,82.800000,1334.200000
4,2021-01-28 13:00:00,0.148107,0.154764,0.502638,77.571429,1547.285714
...,...,...,...,...,...,...
1825,2021-08-13 18:00:00,0.019949,0.019949,0.482323,56.000000,321.000000
1826,2021-08-13 21:00:00,0.079651,0.079651,0.412879,23.000000,9149.000000
1827,2021-08-13 23:00:00,0.095299,0.095299,0.416016,80.000000,16532.000000
1828,2021-08-14 20:00:00,0.101338,0.101338,0.344874,70.000000,1914.000000


In [21]:
# Calculate text similarity within time windows
window_hours=1
# Group posts by hour windows
df['hour_window'] = df['timestamp'].dt.floor(f'{window_hours}H')

similarity_scores = []

for window, group in df.groupby('hour_window'):
    if len(group) < 2:
        # If only one post in window, similarity is 0
        similarity_scores.extend([0] * len(group))
        continue
        
    # Calculate TF-IDF similarity
    texts = group['combined_text'].fillna('').tolist()
    
    try:
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(texts)
        similarity_matrix = cosine_similarity(tfidf_matrix)
        
        # Average similarity for each post to all others in the window
        avg_similarities = []
        for i in range(len(similarity_matrix)):
            # Exclude self-similarity (diagonal = 1)
            others = np.concatenate([similarity_matrix[i][:i], similarity_matrix[i][i+1:]])
            avg_similarity = np.mean(others) if len(others) > 0 else 0
            avg_similarities.append(avg_similarity)
            
        similarity_scores.extend(avg_similarities)
        
    except:
        # If TF-IDF fails (e.g., empty texts), set similarity to 0
        similarity_scores.extend([0] * len(group))

df['text_similarity'] = similarity_scores
df.head()


  df['hour_window'] = df['timestamp'].dt.floor(f'{window_hours}H')


Unnamed: 0,title,score,post_id,url,comms_num,created,body,timestamp,author_name,author_id,...,sentiment_magnitude,subjectivity,title_length,body_length,total_length,engagement_score,score_per_comment,author_post_count,hour_window,text_similarity
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10,jaxxtothemaxx,onvag,...,0.0,0.0,80,0,80,133,4.583333,1.0,2021-01-28 21:00:00,0.093633
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,stonerbobo,f3p9m,...,0.016187,0.348753,15,1156,1171,47,0.0,1.0,2021-01-28 21:00:00,0.094657
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57,Sleavitt10,9u7y1,...,0.232576,0.480303,73,0,73,103,0.386667,3.0,2021-01-28 21:00:00,0.072739
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56,di3_b0ld,13cexg,...,0.25,0.25,78,0,78,227,0.452229,2.0,2021-01-28 21:00:00,0.093194
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,Aiken_Drumn,6g1d5,...,0.02,0.3625,43,648,691,370,5.87037,2.0,2021-01-28 21:00:00,0.129248


In [24]:
df['author_id'].nunique()

7270

In [28]:
def _aggregate_to_hourly(df):
        """Aggregate posts to hourly features"""
        df['hour'] = df['timestamp'].dt.floor('H')
        
        hourly_agg = df.groupby('hour').agg({
            'sentiment': ['mean', 'std', 'min', 'max'],
            'sentiment_magnitude': 'mean',
            'text_similarity': 'mean',
            'score': ['sum', 'mean', 'std'],
            'comms_num': ['sum', 'mean'],
            'engagement_score': ['sum', 'mean'],
            'author_id': 'nunique',  # Unique authors per hour
            'author_post_count': 'mean',  # Average posts per author
            'total_length': 'mean',
            'timestamp': 'count'  # Post volume
        }).round(4)
        
        # Flatten column names
        hourly_agg.columns = [f'{col[0]}_{col[1]}' if col[1] else col[0] 
                             for col in hourly_agg.columns]
        
        # Rename count column
        hourly_agg = hourly_agg.rename(columns={'timestamp_count': 'post_volume'})
        
        # Add coordination signals
        hourly_agg['author_diversity'] = hourly_agg['author_id_nunique'] / (hourly_agg['post_volume'] + 1)
        hourly_agg['repeat_poster_ratio'] = 1 - hourly_agg['author_diversity']
        
        return hourly_agg.reset_index()
    
def _aggregate_to_daily(hourly_df):
    """Aggregate hourly features to daily"""
    hourly_df['date'] = hourly_df['hour'].dt.date
    
    # Select key features for daily aggregation
    daily_agg = hourly_df.groupby('date').agg({
        'sentiment_mean': ['mean', 'std'],
        'sentiment_std': 'mean',
        'text_similarity_mean': 'mean',
        'post_volume': ['sum', 'mean', 'std', 'max'],
        'score_sum': 'sum',
        'engagement_score_sum': 'sum',
        'author_diversity': 'mean',
        'repeat_poster_ratio': 'mean',
    }).round(4)
    
    # Flatten column names
    daily_agg.columns = [f'daily_{col[0]}_{col[1]}' 
                        for col in daily_agg.columns]
    
    return daily_agg.reset_index()
    
def _aggregate_to_weekly(daily_df):
    """Aggregate daily features to weekly"""
    daily_df['week'] = pd.to_datetime(daily_df['date']).dt.to_period('W')
    
    weekly_agg = daily_df.groupby('week').agg({
        'daily_sentiment_mean_mean': 'mean',
        'daily_post_volume_sum': ['sum', 'mean'],
        'daily_score_sum_sum': 'sum',
        'daily_author_diversity_mean': 'mean',
    }).round(4)
    
    # Flatten column names
    weekly_agg.columns = [f'weekly_{col[0]}_{col[1]}' 
                            for col in weekly_agg.columns]
    
    return weekly_agg.reset_index()

In [29]:
"""Create features at different time scales"""

# Level 1: Hourly aggregation
hourly_features = _aggregate_to_hourly(df)

# Level 2: Daily aggregation
daily_features = _aggregate_to_daily(hourly_features)

# Level 3: Weekly aggregation (for longer patterns)
weekly_features = _aggregate_to_weekly(daily_features)

  df['hour'] = df['timestamp'].dt.floor('H')


In [35]:
def label_manipulation_periods(daily_df):
        """Label time periods as manipulation (1) or normal (0)"""
        daily_df['date'] = pd.to_datetime(daily_df['date'])
        daily_df['is_manipulation'] = 0
        
        for start_date, end_date in manipulation_periods:
            mask = (daily_df['date'] >= start_date) & (daily_df['date'] <= end_date)
            daily_df.loc[mask, 'is_manipulation'] = 1
            
        return daily_df
    
def prepare_lstm_sequences(daily_df, sequence_length=7):
    """Prepare sequences for LSTM training"""
    
    # Sort by date
    daily_df = daily_df.sort_values('date').reset_index(drop=True)
    
    # Select feature columns (exclude date and target)
    feature_cols = [col for col in daily_df.columns 
                    if col not in ['date', 'is_manipulation']]
    
    # Create sequences
    X, y = [], []
    
    for i in range(sequence_length, len(daily_df)):
        # Use previous 7 days to predict current day
        X.append(daily_df[feature_cols].iloc[i-sequence_length:i].values)
        y.append(daily_df['is_manipulation'].iloc[i])
    
    return np.array(X), np.array(y), feature_cols

def get_preprocessing_summary(df):
        """Print summary of preprocessing results"""
        print("=== Preprocessing Summary ===")
        print(f"Total GME posts: {len(df)}")
        print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
        print(f"Unique authors: {df['author_id'].nunique()}")
        print(f"Average sentiment: {df['sentiment'].mean():.3f}")
        print(f"Average text similarity: {df['text_similarity'].mean():.3f}")
        
        # Manipulation period stats
        manipulation_posts = 0
        for start, end in manipulation_periods:
            mask = (df['timestamp'] >= start) & (df['timestamp'] <= end)
            manipulation_posts += mask.sum()
            print(f"Posts during {start.date()} to {end.date()}: {mask.sum()}")
        
        print(f"Total manipulation period posts: {manipulation_posts}")
        print(f"Normal period posts: {len(df) - manipulation_posts}")


In [37]:
# Label manipulation periods
# daily_labeled = label_manipulation_periods(daily_features)
weekly_labeled = label_manipulation_periods(weekly_features)


# Prepare LSTM sequences
X, y, feature_names = prepare_lstm_sequences(weekly_labeled)

# Print summary
get_preprocessing_summary(df)

KeyError: 'date'