In [47]:
# CELL 2: Setup and Dependencies
#
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import warnings
import json

warnings.filterwarnings('ignore', category=FutureWarning)
print("Dependencies loaded successfully.")


Dependencies loaded successfully.


In [None]:
# CELL 3: Load and Combine Datasets
#
print("Loading datasets...")

# Identify paths for brokers and social media platforms from the injected dictionary
# broker_paths = [path for owner, path in client_paths.items() if owner.startswith('Broker')]
# social_paths = [path for owner, path in client_paths.items() if owner.startswith('Social')]

broker_paths = [clientA_path, clientB_path]
social_paths = [ClientC_path]

# Load and concatenate all broker datasets into a single DataFrame
broker_dfs = [pd.read_csv(path, parse_dates=['timestamp']) for path in broker_paths]
trading_df = pd.concat(broker_dfs, ignore_index=True)
print(f"-> Loaded and combined {len(broker_dfs)} broker datasets. Total trades: {len(trading_df)}")

# Load the social media dataset
social_df = pd.read_csv(social_paths[0], parse_dates=['timestamp'])
print(f"-> Loaded {len(social_df)} social media records.")
alerts = []  # Initialize alerts list

Loading datasets...
-> Loaded and combined 2 broker datasets. Total trades: 5000
-> Loaded 2500 social media records.


In [50]:
print("\nPhase 1: Detecting trading anomalies...")
# This model identifies trades that are outliers compared to the rest of the data.
# We assume about 1% of trades might be anomalous for this simulation.
anomaly_model = IsolationForest(contamination=0.005, random_state=42)
features = ['trade_volume', 'order_book_depth']

# The model's 'fit_predict' returns -1 for anomalies and 1 for normal trades.
predictions = anomaly_model.fit_predict(trading_df[features])
trading_df['is_anomaly'] = (predictions == -1)
print(f"-> Found {trading_df['is_anomaly'].sum()} anomalous trading events across all brokers.")
alerts.append("Trading Anomaly Detected:")
for row in trading_df[trading_df['is_anomaly']].itertuples():
    alert = f"Suspicious Trade: Trader {row.anonymized_trader_id} anomalously traded {row.trade_volume} shares of {row.stock_symbol} on {row.timestamp.date()}."
    alerts.append(alert)
    print(alert)


Phase 1: Detecting trading anomalies...
-> Found 25 anomalous trading events across all brokers.
Suspicious Trade: Trader user-k5m6 anomalously traded 3673600 shares of GME on 2021-01-22.
Suspicious Trade: Trader user-c8e1 anomalously traded 1926780 shares of GME on 2021-01-24.
Suspicious Trade: Trader cluster-9f4e anomalously traded 4106088 shares of GME on 2021-01-26.
Suspicious Trade: Trader user-c8e1 anomalously traded 2272995 shares of GME on 2021-01-27.
Suspicious Trade: Trader user-f6a9 anomalously traded 2276064 shares of GME on 2021-01-27.
Suspicious Trade: Trader user-g2h7 anomalously traded 2187360 shares of GME on 2021-01-27.
Suspicious Trade: Trader cluster-3b8a anomalously traded 1975824 shares of GME on 2021-02-10.
Suspicious Trade: Trader cluster-1c5d anomalously traded 3228520 shares of GME on 2021-02-12.
Suspicious Trade: Trader cluster-1c5d anomalously traded 1740078 shares of GME on 2021-02-13.
Suspicious Trade: Trader cluster-9f4e anomalously traded 3271743 shares

In [None]:
# CELL 5: Phase 2 - Social Media Hype Detection
#
print("\nPhase 2: Detecting social media hype...")
# We'll define "hype" as a burst of activity significantly higher than the average for a given stock.
# A Z-score approach (measuring standard deviations from the mean) is effective here.
social_df['mean_activity'] = social_df.groupby('stock_symbol')['activity_level'].transform('mean')
social_df['std_activity'] = social_df.groupby('stock_symbol')['activity_level'].transform('std')

# A hype event is when activity is more than 3 standard deviations above the average.
social_df['is_hype'] = social_df['activity_level'] > (social_df['mean_activity'] + 3 * social_df['std_activity'])
print(f"-> Found {social_df['is_hype'].sum()} social media hype events.")
alerts.append(f"Social Media Hype Detected:")
for row in social_df[social_df['is_hype']].itertuples():
    alert = f"Hype Spike: Stock {row.stock_symbol} saw activity {row.activity_level} on {row.timestamp.date()}."
    alerts.append(alert)
    print(alert)


Phase 2: Detecting social media hype...
-> Found 18 social media hype events.
Hype Spike: Stock GME saw activity 144016 on 2021-01-20.
Hype Spike: Stock GME saw activity 99576 on 2021-01-21.
Hype Spike: Stock GME saw activity 105066 on 2021-01-22.
Hype Spike: Stock GME saw activity 105906 on 2021-01-23.
Hype Spike: Stock GME saw activity 144696 on 2021-01-25.
Hype Spike: Stock GME saw activity 138240 on 2021-01-30.
Hype Spike: Stock GME saw activity 102130 on 2021-01-31.
Hype Spike: Stock GME saw activity 193491 on 2021-02-02.
Hype Spike: Stock GME saw activity 124704 on 2021-02-04.
Hype Spike: Stock GME saw activity 138816 on 2021-02-04.
Hype Spike: Stock GME saw activity 45321 on 2021-02-04.
Hype Spike: Stock GME saw activity 86778 on 2021-02-04.
Hype Spike: Stock GME saw activity 89782 on 2021-02-09.
Hype Spike: Stock GME saw activity 96060 on 2021-02-10.
Hype Spike: Stock GME saw activity 96888 on 2021-02-11.
Hype Spike: Stock GME saw activity 85962 on 2021-02-12.
Hype Spike: Stoc

In [None]:
# CELL 6: Phase 3 - Correlation Engine
#
print("\nPhase 3: Correlating social hype with trading anomalies...")
# This is the core of the model. We check if a trading anomaly occurred
# shortly after a social media hype event for the same stock.

# We sort both datasets by time to prepare for the time-sensitive merge.
trading_df.sort_values('timestamp', inplace=True)
social_df.sort_values('timestamp', inplace=True)

# pd.merge_asof is perfect for this. For each trade, it finds the most recent
# social media signal within the last 15 minutes.
merged_df = pd.merge_asof(
    trading_df,
    social_df,
    on='timestamp',
    by='stock_symbol',
    direction='forward', # Looks for the social signal *before* the trade.
    tolerance=pd.Timedelta('120min')
)

# A final alert is generated only if an anomalous trade is linked to a recent hype event.
alerts_df = merged_df[(merged_df['is_anomaly'] == True) & (merged_df['is_hype'] == True)].copy()
print(f"-> Generated {len(alerts_df)} high-confidence pump-and-dump alerts.")
for _, alert in alerts_df.iterrows():
    alerts.append(f"Alert: Anomalous trade of {alert['trade_volume']} shares "
        f"for stock {alert['stock_symbol']} by Broker {alert['anonymized_trader_id']} "
        f"within 2 hours of hype spike (activity={alert['activity_level']}).")
    print(
        f"Alert: Anomalous trade of {alert['trade_volume']} shares "
        f"for stock {alert['stock_symbol']} by Broker {alert['anonymized_trader_id']} "
        f"within 2 hours of hype spike (activity={alert['activity_level']})."
    )


Phase 3: Correlating social hype with trading anomalies...
-> Generated 1 high-confidence pump-and-dump alerts.
Alert: Anomalous trade of 3171136 shares for stock GME by Broker user-d9f2 within 2 hours of hype spike (activity=144016.0).


In [54]:
# CELL 7: Save the Result
#
print("\nSaving results...")
# The executor expects a file named 'result.*' to upload.
alerts_json = {"alerts": alerts}
with open("result.json", "w", encoding="utf-8") as f:
    json.dump(alerts_json, f, indent=4)


# if not alerts_df.empty:
#     output_columns = [
#         'timestamp',
#         'stock_symbol',
#         'trade_volume',
#         'activity_level',
#         'mean_activity'
#     ]
#     # Ensure all selected columns exist before saving
#     final_alerts = alerts_df[[col for col in output_columns if col in alerts_df.columns]]
#     final_alerts.to_csv('result.csv', index=False)
#     print("-> Alerts saved to result.csv")
# else:
#     # Create an empty result file if no alerts were found.
#     pd.DataFrame(columns=['timestamp', 'stock_symbol', 'message']).to_csv('result.csv', index=False)
#     print("-> No alerts generated. An empty result.csv has been created.")


Saving results...
