In [7]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import SGDOneClassSVM
from sklearn.decomposition import PCA
from collections import Counter
from sklearn.cluster import MiniBatchKMeans

In [8]:
comments = pd.read_csv("data/filtered_comments.csv")

users = pd.read_csv("data/filtered_users.csv")

following = pd.read_csv("data/filtered_following.csv")

likes = pd.read_csv("data/filtered_likes.csv")

entries = pd.read_csv("data/filtered_entries.csv")

In [9]:
follower_count = following['FollowedID'].value_counts()
follower_count.name = 'FollowerCount'

following_count = following['FollowerID'].value_counts()
following_count.name = 'FollowingCount'

posts_created = entries['PostedBy'].value_counts()
posts_created.name = 'PostsCreated'

likes_per_post = likes.groupby('PostID').size().rename('LikesCount')
posts_with_likes = entries.merge(likes_per_post, on='PostID', how='left')
posts_with_likes['LikesCount'] = posts_with_likes['LikesCount'].fillna(0)
avg_likes_received = posts_with_likes.groupby('PostedBy')['LikesCount'].mean()
avg_likes_received.name = 'AvgLikesReceived'

likes_given = likes['userID'].value_counts()
likes_given.name = 'LikesGiven'

comments_per_post = comments.groupby('PostID').size().rename('CommentsCount')
posts_with_comments = entries.merge(comments_per_post, on='PostID', how='left')
posts_with_comments['CommentsCount'] = posts_with_comments['CommentsCount'].fillna(0)
avg_comments_received = posts_with_comments.groupby('PostedBy')['CommentsCount'].mean()
avg_comments_received.name = 'AvgCommentsReceived'

comments_given = comments['PostedBy'].value_counts()
comments_given.name = 'CommentsGiven'

feature_list = [
    follower_count,
    following_count,
    posts_created,
    avg_likes_received,
    likes_given,
    avg_comments_received,
    comments_given
]
anomaly_user_df = pd.concat(feature_list, axis=1).fillna(0)
anomaly_user_df.index.name = 'UserID'

scaler = StandardScaler()

scaled_features = scaler.fit_transform(anomaly_user_df)

scaled_anomaly_df = pd.DataFrame(scaled_features,
                                 index=anomaly_user_df.index,
                                 columns=anomaly_user_df.columns)


print("Mean and Standard Deviation: ")
print(scaled_anomaly_df.describe().loc[['mean', 'std']])

Mean and Standard Deviation: 
      FollowerCount  FollowingCount  PostsCreated  AvgLikesReceived  \
mean  -3.774247e-17    1.817230e-17 -1.234784e-17     -1.852177e-17   
std    1.000001e+00    1.000001e+00  1.000001e+00      1.000001e+00   

        LikesGiven  AvgCommentsReceived  CommentsGiven  
mean  4.295536e-19         1.604637e-17  -1.016367e-17  
std   1.000001e+00         1.000001e+00   1.000001e+00  


In [10]:
import warnings
warnings.filterwarnings('ignore', message='Determinant has increased')

CONTAMINATION_RATE = 0.0002 

outlier_results = {}

model = EllipticEnvelope(contamination=CONTAMINATION_RATE,support_fraction=0.9, random_state=42)
predictions = model.fit_predict(scaled_anomaly_df)
outliers_indices = np.where(predictions == -1)[0]
outlier_results['MultivariateGaussian'] = scaled_anomaly_df.index[outliers_indices].tolist()

n_clusters = 20 
model = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
model.fit(scaled_anomaly_df)

distances_to_center = np.linalg.norm(scaled_anomaly_df - model.cluster_centers_[model.labels_], axis=1)

threshold = np.quantile(distances_to_center, 1 - CONTAMINATION_RATE)
outliers_indices = np.where(distances_to_center > threshold)[0]
outlier_results['MiniBatchKMeans'] = scaled_anomaly_df.index[outliers_indices].tolist()

k = 5
model = NearestNeighbors(n_neighbors=k)
model.fit(scaled_anomaly_df)
distances, _ = model.kneighbors(scaled_anomaly_df)
kth_distances = distances[:, k-1]
threshold = np.quantile(kth_distances, 1 - CONTAMINATION_RATE)
outliers_indices = np.where(kth_distances > threshold)[0]
outlier_results['kNN_Distance'] = scaled_anomaly_df.index[outliers_indices].tolist()

model = LocalOutlierFactor(n_neighbors=20, contamination=CONTAMINATION_RATE, novelty=False)
predictions = model.fit_predict(scaled_anomaly_df)
outliers_indices = np.where(predictions == -1)[0]
outlier_results['LOF'] = scaled_anomaly_df.index[outliers_indices].tolist()

model = IsolationForest(contamination=CONTAMINATION_RATE, random_state=42)
predictions = model.fit_predict(scaled_anomaly_df)
outliers_indices = np.where(predictions == -1)[0]
outlier_results['IsolationForest'] = scaled_anomaly_df.index[outliers_indices].tolist()

model = SGDOneClassSVM(nu=CONTAMINATION_RATE, random_state=42)
predictions = model.fit_predict(scaled_anomaly_df)
outliers_indices = np.where(predictions == -1)[0]
outlier_results['OneClassSVM'] = scaled_anomaly_df.index[outliers_indices].tolist()

n_components = 5
model = PCA(n_components=n_components)
transformed = model.fit_transform(scaled_anomaly_df)
reconstructed = model.inverse_transform(transformed)
reconstruction_error = np.linalg.norm(scaled_anomaly_df - reconstructed, axis=1)
threshold = np.quantile(reconstruction_error, 1 - CONTAMINATION_RATE)
outliers_indices = np.where(reconstruction_error > threshold)[0]
outlier_results['PCA_Reconstruction'] = scaled_anomaly_df.index[outliers_indices].tolist()

all_outliers = [user_id for method_outliers in outlier_results.values() for user_id in method_outliers]

outlier_counts = Counter(all_outliers)

summary_df = pd.DataFrame(outlier_counts.items(), columns=['UserID', 'Anomaly_Count'])
summary_df = summary_df.sort_values(by='Anomaly_Count', ascending=False).reset_index(drop=True)

print(f"\nTotal unique users flagged as outliers: {len(summary_df)}")
print("\nTop 20 most anomalous users:")
print(summary_df.head(20))

detailed_summary_df = scaled_anomaly_df.copy()
for method, outliers in outlier_results.items():
    detailed_summary_df[method] = detailed_summary_df.index.isin(outliers)

detailed_summary_df['Anomaly_Count'] = detailed_summary_df.iloc[:, -len(outlier_results):].sum(axis=1)
detailed_summary_df = detailed_summary_df.sort_values(by='Anomaly_Count', ascending=False)

print("\nDetailed breakdown for users flagged by 3 or more algorithms:")
print(detailed_summary_df[detailed_summary_df['Anomaly_Count'] >= 3])




Total unique users flagged as outliers: 374

Top 20 most anomalous users:
                UserID  Anomaly_Count
0           laurabotts              5
1         ortakpencere              5
2        pattonroberta              5
3               yesari              5
4               1geeky              5
5                kami1              5
6      sidepodcastlive              4
7                falob              4
8        wishiwerel33t              4
9            musiclion              4
10          jluvisions              4
11             salardo              4
12         dicemarshal              4
13          romanussum              4
14              golaqa              4
15         kelrapunzel              4
16             nazlikk              4
17        showhabercom              4
18        tahaakcakaya              4
19  lermontovlermontov              4

Detailed breakdown for users flagged by 3 or more algorithms:
                FollowerCount  FollowingCount  PostsCreated  Avg

In [11]:
import pandas as pd
from scipy.stats import percentileofscore

population_averages = anomaly_user_df.mean().to_dict()


all_user_ranks = anomaly_user_df.rank(ascending=False, method='min').astype(int)

detailed_stats_df = anomaly_user_df.merge(
    summary_df,
    left_index=True,
    right_on='UserID',
    how='right'
)

detailed_stats_df = detailed_stats_df.set_index('UserID').sort_values(by='Anomaly_Count', ascending=False)
top_20_anomalous_users = detailed_stats_df.head(20)

metrics_config = {
    'Follower Count': ('FollowerCount', '{:,}'),
    'Following Count': ('FollowingCount', '{:,}'),
    'Posts Created': ('PostsCreated', '{:,}'),
    'Likes Received per Post (on average)': ('AvgLikesReceived', '{:.2f}'),
    'Likes Given': ('LikesGiven', '{:,}'),
    'Comments Received per Post (on average)': ('AvgCommentsReceived', '{:.2f}'),
    'Comments Given': ('CommentsGiven', '{:,}')
}

for user_id, stats in top_20_anomalous_users.iterrows():
    print(f"Stats for UserID: {user_id}, Flagged {int(stats['Anomaly_Count'])} times")
    print("Type\tValue\tRank\tTop in %")

    for display_name, (col_name, fmt) in metrics_config.items():
        value = stats[col_name]
        percentile = percentileofscore(anomaly_user_df[col_name], value)
        top_in_percent = 100 - percentile
        rank = all_user_ranks.loc[user_id, col_name]
        print(f"{display_name}\t{fmt.format(value)}\t{rank}\t{top_in_percent:.2f}")
    # print the algorithms that flagged this user
    flagged_by = [method for method, outliers in outlier_results.items() if user_id in outliers]
    print(f"Flagged by: {', '.join(flagged_by)}")
    print("\n")

Stats for UserID: laurabotts, Flagged 5 times
Type	Value	Rank	Top in %
Follower Count	646.0	4556	0.93
Following Count	490.0	4324	0.89
Posts Created	1,397.0	789	0.16
Likes Received per Post (on average)	0.52	3736	0.77
Likes Given	3,948.0	2	0.00
Comments Received per Post (on average)	1.94	3615	0.74
Comments Given	3,210.0	79	0.02
Flagged by: MultivariateGaussian, MiniBatchKMeans, kNN_Distance, IsolationForest, PCA_Reconstruction


Stats for UserID: pattonroberta, Flagged 5 times
Type	Value	Rank	Top in %
Follower Count	1,453.0	1280	0.26
Following Count	147.0	19283	3.97
Posts Created	1,389.0	793	0.16
Likes Received per Post (on average)	0.66	2992	0.61
Likes Given	2,707.0	5	0.00
Comments Received per Post (on average)	3.28	2013	0.41
Comments Given	4,365.0	36	0.01
Flagged by: MultivariateGaussian, MiniBatchKMeans, kNN_Distance, IsolationForest, PCA_Reconstruction


Stats for UserID: yesari, Flagged 5 times
Type	Value	Rank	Top in %
Follower Count	1,159.0	1837	0.38
Following Count	248.0	10611	