In [None]:
# Import libraries

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import SGDOneClassSVM
from sklearn.decomposition import PCA
from collections import Counter
from sklearn.cluster import MiniBatchKMeans


ImportError: cannot import name 'SGDOneClassSVM' from 'sklearn.svm' (/opt/anaconda3/envs/DL/lib/python3.10/site-packages/sklearn/svm/__init__.py)

In [None]:
# Read the five filtered CSV exports in a single ordered pass.
# The position of each path in the tuple decides which DataFrame it is bound to:
# comments, users, following, likes, entries (posts).
comments, users, following, likes, entries = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/filtered_comments.csv",
        "data/filtered_users.csv",
        "data/filtered_following.csv",
        "data/filtered_likes.csv",
        "data/filtered_entries.csv",
    )
)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Build one row of activity features per user, then standardise them for the
# outlier detectors. Each feature below is a Series keyed by user id; the
# Series are aligned column-wise on that key further down.

# Followers: how often each id occurs in the 'FollowedID' column
follower_count = following['FollowedID'].value_counts().rename('FollowerCount')

# Following: how often each id occurs in the 'FollowerID' column
following_count = following['FollowerID'].value_counts().rename('FollowingCount')

# Posts created: how often each id occurs as 'PostedBy' in the entries table
posts_created = entries['PostedBy'].value_counts().rename('PostsCreated')

# Average likes received per post: count likes per post, attach the count to
# every entry (posts without any like get 0), then average per author.
likes_per_post = likes.groupby('PostID').size().rename('LikesCount')
posts_with_likes = entries.merge(likes_per_post, on='PostID', how='left')
posts_with_likes = posts_with_likes.assign(
    LikesCount=posts_with_likes['LikesCount'].fillna(0)
)
avg_likes_received = (
    posts_with_likes.groupby('PostedBy')['LikesCount'].mean().rename('AvgLikesReceived')
)

# Likes given: how often each id occurs in the likes table's 'userID' column
likes_given = likes['userID'].value_counts().rename('LikesGiven')

# Average comments received per post: same recipe as for likes above
comments_per_post = comments.groupby('PostID').size().rename('CommentsCount')
posts_with_comments = entries.merge(comments_per_post, on='PostID', how='left')
posts_with_comments = posts_with_comments.assign(
    CommentsCount=posts_with_comments['CommentsCount'].fillna(0)
)
avg_comments_received = (
    posts_with_comments.groupby('PostedBy')['CommentsCount'].mean().rename('AvgCommentsReceived')
)

# Comments given: how often each id occurs as 'PostedBy' in the comments table
comments_given = comments['PostedBy'].value_counts().rename('CommentsGiven')

# Column order of the final feature matrix follows the order of this list.
feature_list = [
    follower_count,
    following_count,
    posts_created,
    avg_likes_received,
    likes_given,
    avg_comments_received,
    comments_given,
]

# Outer-align all features on user id; a user missing from a feature gets 0 for it.
anomaly_user_df = pd.concat(feature_list, axis=1).fillna(0).rename_axis('UserID')

# Standardise every feature column, keeping the user index and column labels.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(anomaly_user_df)
scaled_anomaly_df = pd.DataFrame(
    data=scaled_features,
    columns=anomaly_user_df.columns,
    index=anomaly_user_df.index,
)

# Sanity check: print the per-column mean and std of the scaled matrix.
print("\n--- Verification: Check Mean and Standard Deviation of Scaled Data ---")
print(scaled_anomaly_df.describe().loc[['mean', 'std']])


--- Verification: Check Mean and Standard Deviation of Scaled Data ---
      FollowerCount  FollowingCount  PostsCreated  AvgLikesReceived  \
mean  -2.134405e-17    8.297205e-18  9.206086e-18      1.870536e-17   
std    1.000001e+00    1.000001e+00  1.000001e+00      1.000001e+00   

        LikesGiven  AvgCommentsReceived  CommentsGiven  
mean -6.860587e-18         7.353141e-17  -4.397812e-18  
std   1.000001e+00         1.000001e+00   1.000001e+00  


In [None]:
# Outlier detection using different algorithms:

# Expected *fraction* of users that are outliers (0.0001 == 0.01 %, not a percentage).
# It's a critical parameter to tune based on your domain knowledge.
CONTAMINATION_RATE = 0.0001

# Results of every algorithm: method name -> list of flagged UserIDs
outlier_results = {}


def _user_ids_at(row_positions):
    """Translate positional row numbers of ``scaled_anomaly_df`` into UserID labels."""
    return scaled_anomaly_df.index[row_positions].tolist()


def _flag_by_fit_predict(detector):
    """Fit ``detector`` on the scaled features and return the UserIDs it labels -1."""
    predictions = detector.fit_predict(scaled_anomaly_df)
    return _user_ids_at(np.where(predictions == -1)[0])


def _flag_highest_scores(anomaly_scores):
    """Return UserIDs whose score lies strictly above the (1 - CONTAMINATION_RATE) quantile.

    ``anomaly_scores`` holds one value per row of ``scaled_anomaly_df``;
    larger values mean "more anomalous".
    """
    anomaly_scores = np.asarray(anomaly_scores)
    threshold = np.quantile(anomaly_scores, 1 - CONTAMINATION_RATE)
    return _user_ids_at(np.where(anomaly_scores > threshold)[0])


# Plain ndarray view of the features for the element-wise distance computations below
scaled_values = scaled_anomaly_df.to_numpy()

## 1. Statistical-based Approach: Multivariate Gaussian
print("1. Running Multivariate Gaussian (Elliptic Envelope)...")
# This method assumes the regular data comes from a Gaussian distribution and finds
# points that have a low probability of belonging to it (high Mahalanobis distance).
elliptic_envelope = EllipticEnvelope(contamination=CONTAMINATION_RATE, support_fraction=0.9, random_state=42)
outlier_results['MultivariateGaussian'] = _flag_by_fit_predict(elliptic_envelope)

## 2. Clustering-based Approach: MiniBatchKMeans
print("2. Running Clustering-based (MiniBatchKMeans)...")
# MiniBatchKMeans is much more scalable than DBSCAN for large datasets.
# We identify outliers as points with the largest distance to their cluster centroid.
n_clusters = 20  # The number of clusters is a hyperparameter to tune
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
kmeans.fit(scaled_anomaly_df)

# Distance of each point to the centre of the cluster it was assigned to
distances_to_center = np.linalg.norm(scaled_values - kmeans.cluster_centers_[kmeans.labels_], axis=1)
outlier_results['MiniBatchKMeans'] = _flag_highest_scores(distances_to_center)

## 3. Distance-based Approach: Largest Distances
print("3. Running Distance-based (k-NN)...")
# This identifies outliers as points with the largest distance to their k-th nearest neighbor.
# We calculate this distance for every point and then flag the top fraction as outliers.
k = 5
knn = NearestNeighbors(n_neighbors=k)
knn.fit(scaled_anomaly_df)
# Call kneighbors() WITHOUT passing the training data again: in that mode a point is not
# reported as its own neighbour. Passing the training matrix put the zero self-distance
# in column 0, so column k-1 was really the (k-1)-th neighbour.
distances, _ = knn.kneighbors()
kth_distances = distances[:, k - 1]  # Distance to the k-th neighbor (self excluded)
outlier_results['kNN_Distance'] = _flag_highest_scores(kth_distances)

## 4. Density-based Approach: LOF
print("4. Running Density-based (LOF)...")
# Local Outlier Factor (LOF) measures the local density deviation of a data point
# with respect to its neighbors. Outliers are points in much sparser regions than their neighbors.
lof = LocalOutlierFactor(n_neighbors=20, contamination=CONTAMINATION_RATE, novelty=False)
outlier_results['LOF'] = _flag_by_fit_predict(lof)

## 5. Isolation-based Approach: iForest
print("5. Running Isolation-based (Isolation Forest)...")
# Isolation Forest isolates observations by randomly selecting a feature and then
# randomly selecting a split value. Anomalies are easier to isolate and thus have shorter path lengths.
isolation_forest = IsolationForest(contamination=CONTAMINATION_RATE, random_state=42)
outlier_results['IsolationForest'] = _flag_by_fit_predict(isolation_forest)

## 6. One-Class SVM
print("6. Running One-Class SVM...")
# OCSVM learns a decision boundary that encompasses the majority of the data.
# NOTE(review): the SGD solver does not calibrate its boundary to `nu` the way the other
# detectors calibrate to `contamination`; the recorded run flagged ~54k users overall at a
# 0.0001 rate, which points at this detector. We therefore rank users by the model's score
# and apply the same quantile cut-off as methods 2, 3 and 7 so every method flags a
# comparable share of users. Confirm this matches the intended analysis.
one_class_svm = SGDOneClassSVM(nu=CONTAMINATION_RATE, random_state=42)
one_class_svm.fit(scaled_anomaly_df)
# score_samples is lower for more abnormal points, so negate it to get "higher = worse"
svm_anomaly_scores = -one_class_svm.score_samples(scaled_anomaly_df)
outlier_results['OneClassSVM'] = _flag_highest_scores(svm_anomaly_scores)

## 7. Reconstruction Error Approach: PCA
print("7. Running Reconstruction Error (PCA)...")
# This approach uses PCA to project data into a lower-dimensional space and then
# projects it back. Outliers, which don't fit the main data patterns, will have a
# high reconstruction error.
n_components = 5  # Should be less than the number of features
pca = PCA(n_components=n_components)
reconstructed = pca.inverse_transform(pca.fit_transform(scaled_anomaly_df))
reconstruction_error = np.linalg.norm(scaled_values - reconstructed, axis=1)
outlier_results['PCA_Reconstruction'] = _flag_highest_scores(reconstruction_error)


# --- Aggregate and Summarize Results ---
print("\n" + "="*50)
print("--- Anomaly Detection Summary ---")

# Flatten the list of all outliers found
all_outliers = [user_id for method_outliers in outlier_results.values() for user_id in method_outliers]

# Count how many times each user was flagged as an outlier
outlier_counts = Counter(all_outliers)

# Create a summary DataFrame; the stable sort keeps tied users in first-flagged order,
# so the ranking is reproducible from run to run.
summary_df = pd.DataFrame(list(outlier_counts.items()), columns=['UserID', 'Anomaly_Count'])
summary_df = (
    summary_df
    .sort_values(by='Anomaly_Count', ascending=False, kind='stable')
    .reset_index(drop=True)
)

print(f"\nTotal unique users flagged as outliers: {len(summary_df)}")
print("\nTop 20 most anomalous users:")
print(summary_df.head(20))

# Detailed DataFrame showing which algorithm flagged which user (one boolean column per method)
detailed_summary_df = scaled_anomaly_df.copy()
method_names = list(outlier_results)
for method in method_names:
    detailed_summary_df[method] = detailed_summary_df.index.isin(outlier_results[method])

# Sum the method columns by name rather than by trailing position
detailed_summary_df['Anomaly_Count'] = detailed_summary_df[method_names].sum(axis=1)
detailed_summary_df = detailed_summary_df.sort_values(by='Anomaly_Count', ascending=False, kind='stable')

print("\nDetailed breakdown for users flagged by 3 or more algorithms:")
print(detailed_summary_df[detailed_summary_df['Anomaly_Count'] >= 3])

1. Running Multivariate Gaussian (Elliptic Envelope)...




2. Running Clustering-based (MiniBatchKMeans)...
3. Running Distance-based (k-NN)...
4. Running Density-based (LOF)...




5. Running Isolation-based (Isolation Forest)...
6. Running One-Class SVM...
7. Running Reconstruction Error (PCA)...

--- Anomaly Detection Summary ---

Total unique users flagged as outliers: 54093

Top 20 most anomalous users:
               UserID  Anomaly_Count
0            mashable              7
1           rmproject              7
2             findpdf              7
3   mathieuvanmulders              7
4         sasparaguay              7
5       tyconsoftware              7
6          fftimetest              7
7             reaktor              7
8             mackley              7
9             kararli              7
10        nicolaquinn              7
11           maxton38              7
12            salardo              7
13             idechi              7
14   blogprofitexpert              7
15        disconnesso              7
16     adrianspillman              7
17            caponsu              7
18            jvetrau              7
19          winenight         

In [None]:
import pandas as pd
from scipy.stats import percentileofscore

# This section assumes 'anomaly_user_df' (containing all user stats)
# and 'summary_df' (with anomaly counts) have already been created.

# --- 1. Pre-calculate Ranks and Averages for the entire population ---

# Mean of each metric across the entire population: column name -> mean
population_averages = anomaly_user_df.mean().to_dict()

# Pre-calculate the rank for every user across all metrics.
# We do this once for efficiency instead of re-calculating inside the loop.
# - ascending=False: Makes the highest value get Rank #1.
# - method='min': Ensures that if users are tied, they all receive the same top rank.
print("Pre-calculating ranks for all users...")
all_user_ranks = anomaly_user_df.rank(ascending=False, method='min').astype(int)
print("Ranks calculated.")


# --- 2. Identify the top 20 anomalous users ---

# Attach the raw (unscaled) stats to every user listed in the anomaly summary
detailed_stats_df = anomaly_user_df.merge(
    summary_df,
    left_index=True,
    right_on='UserID',
    how='right'
)
# Stable sort: users tied on Anomaly_Count keep the order they have in summary_df, so this
# top 20 lists the same users as summary_df.head(20). The default sort is not stable and
# could pick a different set of tied users.
detailed_stats_df = (
    detailed_stats_df
    .set_index('UserID')
    .sort_values(by='Anomaly_Count', ascending=False, kind='stable')
)
top_20_anomalous_users = detailed_stats_df.head(20)


# --- 3. Print the enhanced statistics for the top 20 users ---

# (column in anomaly_user_df, label to print, True if the metric is an average).
# Averages are printed with two decimals; every other metric is printed as a whole number.
METRIC_DISPLAY = [
    ('FollowerCount', 'Followers', False),
    ('FollowingCount', 'Following', False),
    ('PostsCreated', 'Posts Created', False),
    ('AvgLikesReceived', 'Avg Likes Received', True),
    ('LikesGiven', 'Likes Given', False),
    ('AvgCommentsReceived', 'Avg Comments Received', True),
    ('CommentsGiven', 'Comments Given', False),
]

print("\n" + "="*100)
print("--- Individual Statistics for Top 20 Anomalous Users (vs. Population Average, with Ranks) ---")
print("="*100 + "\n")

# The row variable is deliberately NOT called `stats`: that name would overwrite the
# `scipy.stats` module imported in the notebook's first cell.
for user_id, user_stats in top_20_anomalous_users.iterrows():
    print(f"👤 --- Stats for UserID: {user_id} --- 👤")
    # iterrows() upcasts the whole row to float, so cast counts back to int before printing
    print(f"🚩 Flagged as an Anomaly by {int(user_stats['Anomaly_Count'])} Algorithms")
    print("-" * 95)

    # For each metric: value, population mean, pre-calculated rank, and share of users above
    for column, label, is_average in METRIC_DISPLAY:
        value = user_stats[column]
        percentile = percentileofscore(anomaly_user_df[column], value)
        rank = all_user_ranks.loc[user_id, column]
        shown_value = f"{value:<9.2f}" if is_average else f"{int(value):<9,}"
        print(f"  - {label:<22}: {shown_value} (Avg: {population_averages[column]:<8.2f} | Rank: #{rank:<6} | Top {100 - percentile:.2f}%)")

    print("\n")

Pre-calculating ranks for all users...
Ranks calculated.

--- Individual Statistics for Top 20 Anomalous Users (vs. Population Average, with Ranks) ---

👤 --- Stats for UserID: mashable --- 👤
🚩 Flagged as an Anomaly by 7.0 Algorithms
-----------------------------------------------------------------------------------------------
  - Followers             : 42,487.0  (Avg: 38.12    | Rank: #1      | Top 0.00%)
  - Following             : 9,938.0   (Avg: 38.12    | Rank: #158    | Top 0.03%)
  - Posts Created         : 113.0     (Avg: 22.44    | Rank: #17469  | Top 3.62%)
  - Avg Likes Received    : 0.22      (Avg: 0.01     | Rank: #7575   | Top 1.56%)
  - Likes Given           : 1.0       (Avg: 1.11     | Rank: #10381  | Top 2.63%)
  - Avg Comments Received : 0.04      (Avg: 0.06     | Rank: #31310  | Top 6.46%)
  - Comments Given        : 2.0       (Avg: 5.88     | Rank: #24997  | Top 5.67%)


👤 --- Stats for UserID: ikaro --- 👤
🚩 Flagged as an Anomaly by 7.0 Algorithms
----------------