In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
from data_loader import load_subreddit_data

# Pass a subreddit or list of subreddits to load
# Will query if parquet does not exist or reload=True
data_df = load_subreddit_data(["politics", "news"], reload=False)

Loading cached data for politics...
Loading cached data for news...
Loaded 722587 rows from 2 subreddit(s).


In [3]:
from feature_engineering import create_features

# Creates dataframe with features for each user
# You can replace this with your own engineered features
features_df, feature_cols = create_features(data_df)
print(feature_cols)

['n_comments', 'avg_ups', 'avg_downs', 'avg_score', 'avg_body_len', 'median_time_diff', 'lexical_diversity', 'active_hours', 'activity_entropy']


In [4]:
from network_creation import build_edge_df, build_pyg_graph

# Create edge dataframe
# Keeps only edges above a threshold to keep it small
edge_df = build_edge_df(data_df, id_col='author', post_col='link_id', min_weight=5)

# Attach features and return a pytorch geometric object.
# If feature_cols not included it will default to all columns that aren't the ID
graph_data = build_pyg_graph(edge_df, features_df, id_col='author', feature_cols=feature_cols)


Graph created: 8,638 nodes, 150,062 edges


In [10]:
from dgi import DGI

# Embed graph with DGI
# 'hidden' is the size of hidden layer. Larger means more complex model
# 'learning_rate' is for SGD. Too high and the optimization is unstable
dgi = DGI(graph_data, hidden=500, learning_rate=0.001, epochs=200)
embedding_df = dgi.embed()

Epoch 0, Loss: 1.3791
Epoch 20, Loss: 1.1232
Epoch 40, Loss: 0.9564
Epoch 60, Loss: 0.8901
Epoch 80, Loss: 0.8919
Epoch 100, Loss: 1.0146
Epoch 120, Loss: 0.8459
Epoch 140, Loss: 0.8162
Epoch 160, Loss: 0.8664
Epoch 180, Loss: 0.8340


In [11]:
embedding_df.head()

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_491,emb_492,emb_493,emb_494,emb_495,emb_496,emb_497,emb_498,emb_499,author
0,0.0,0.004446,0.049313,0.064438,0.139999,0.004674,0.0,0.156005,0.09671,0.219387,...,0.004962,0.0,0.066775,0.129797,0.0,0.0,0.0,0.0,0.127721,-BMP-
1,0.087507,0.02042,0.031792,0.0,0.128022,0.0,0.051254,0.0,0.010186,0.202123,...,0.0,0.010949,0.016812,0.075446,0.039197,0.0,0.0,0.0,0.240311,-JDubs-
2,0.0,0.0,0.084039,0.079063,0.0,0.020178,0.0,0.0,0.0,0.0,...,0.0,0.09948,0.038535,0.0,0.017403,0.102751,0.105451,0.031497,0.0,-Mockingbird
3,0.0,0.035956,0.0,0.041142,0.0,0.0,0.0,0.124218,0.0,0.020266,...,0.0,0.0,0.0,0.042743,0.0,0.0,0.0,0.0,0.0,-Mountain-King-
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288578,0.0,0.0,...,0.019833,0.035301,0.0,0.0,0.171505,0.255843,0.143512,0.0,0.0,-ParticleMan-


In [15]:
from sklearn.ensemble import IsolationForest

X_dgi = embedding_df.drop('author', axis=1).values

iso_dgi = IsolationForest(contamination=0.2, random_state=0)
iso_dgi.fit(X_dgi)
scores = -iso_dgi.decision_function(X_dgi)

result = embedding_df[["author"]].copy()
result['anomaly_score'] = scores
result = result.sort_values('anomaly_score', ascending=False)

print("Top 10 anomalies (DGI):")
print(result.head(10))

Top 10 anomalies (DGI):
                 author  anomaly_score
3059  RedBeardedWhiskey       0.328230
3536    StationaryNomad       0.322019
921       Dear_Occupant       0.316775
5807   guy_incognito784       0.314246
622          Caligineus       0.284794
177   Affordable_Z_Jobs       0.283852
197            Aleucard       0.279748
1476       Go_Eagles_Go       0.279011
4293          [deleted]       0.274273
608      CFRProflcopter       0.273817


In [16]:
X_features = features_df.drop('author', axis=1).values

iso_features = IsolationForest(contamination=0.2, random_state=0)
iso_features.fit(X_features)
scores = -iso_features.decision_function(X_features)

result_features = features_df[["author"]].copy()
result_features['anomaly_score'] = scores
result_features = result_features.sort_values('anomaly_score', ascending=False)

print("Top 10 anomalies (Features):")
print(result_features.head(10))

Top 10 anomalies (Features):
                   author  anomaly_score
23630   Homeschooled-perv       0.312845
54843      ThrowFARaway98       0.307911
67424        bobbyjoechan       0.306771
59755   YeastCoastForever       0.302362
16693             EddyJ87       0.296064
34866      Mister_Squishy       0.289952
99461        nolimits2222       0.288515
73135              damacu       0.287724
120319   westward_jabroni       0.287438
37272         Nevermore60       0.286793
