In [1]:
# Import libraries

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.decomposition import PCA
from collections import Counter
from sklearn.cluster import MiniBatchKMeans


In [None]:
# Column layouts of the header-less export files. A comment row carries the
# same fields as an entry row, prefixed with the comment's own identifier.
entry_columns = [
    "PostID", "PostedBy", "SourceName", "SourceURL", "GeoX", "GeoY",
    "Timestamp", "Text", "NumImg", "ImgURL", "NumVideos", "VideosURL",
]
comment_columns = ["EntryID"] + entry_columns

# Comments (tab-separated). No on_bad_lines override is passed for this file,
# so pandas' default handling of malformed rows applies here.
comments = pd.read_csv(
    "data/filtered_comments.csv",
    sep="\t",
    header=None,
    names=comment_columns,
)

# Users (pipe-separated); malformed rows are skipped.
users = pd.read_csv(
    "data/filtered_users.csv",
    sep="|",
    header=None,
    names=["ID", "Type", "Name", "ReservedField", "Description"],
    on_bad_lines='skip',
)

# Follow relations (tab-separated); malformed rows are skipped.
following = pd.read_csv(
    "data/filtered_following.csv",
    sep="\t",
    header=None,
    names=["FollowedID", "FollowerID", "Timestamp"],
    on_bad_lines='skip',
)

# Likes (tab-separated); malformed rows are skipped.
likes = pd.read_csv(
    "data/filtered_likes.csv",
    sep="\t",
    header=None,
    names=["userID", "PostID", "Timestamp"],
    on_bad_lines='skip',
)

# Entries / posts (tab-separated); malformed rows are skipped.
entries = pd.read_csv(
    "data/filtered_entries.csv",
    sep="\t",
    header=None,
    names=entry_columns,
    on_bad_lines='skip',
)

(3749891, 13)
                                             EntryID  \
0  e/ed12adf025b5491da54c4ff2c8c5377a/c/c2dbc8151...   
1  e/7f6fb13b5a99449bb9dcbb3f8693be73/c/7477fb677...   
2  e/624ca9226b6526ebdb69f9b46df482c7/c/32c6bf5bc...   
3  e/2fdf59e075094fe1847137af34eda0f7/c/eb22afe81...   
4  e/4d8de05f989d43a4b90bfbfc59751e1c/c/d472f7b81...   

                               PostID      PostedBy  SourceName  \
0  e/ed12adf025b5491da54c4ff2c8c5377a  koenigdublin         NaN   
1  e/7f6fb13b5a99449bb9dcbb3f8693be73  ilportalinux         NaN   
2  e/624ca9226b6526ebdb69f9b46df482c7    guardianuk         NaN   
3  e/2fdf59e075094fe1847137af34eda0f7          nahi  m.ctor.org   
4  e/4d8de05f989d43a4b90bfbfc59751e1c       miocaro         NaN   

                SourceURL GeoX GeoY            Timestamp  \
0                     NaN   \N   \N  2010-08-06 15:06:32   
1                     NaN   \N   \N  2010-08-06 15:06:32   
2                     NaN   \N   \N  2010-08-06 14:45:07   
3  htt

In [None]:
# Build one row of activity features per user, then standardise every column
# so the features are on a comparable scale for the outlier detectors.


def _mean_received_per_author(posts, per_post_counts, feature_name):
    """Average a per-post count over each author's posts.

    posts           : frame with 'PostID' and 'PostedBy' columns.
    per_post_counts : named Series indexed by 'PostID' (count per post).
    feature_name    : name given to the returned per-author Series.

    Returns (joined, averages): the posts frame with the count column
    attached (posts without a count get 0) and the mean count per author.
    """
    count_col = per_post_counts.name
    joined = posts.merge(per_post_counts, on='PostID', how='left')
    joined[count_col] = joined[count_col].fillna(0)
    averages = joined.groupby('PostedBy')[count_col].mean().rename(feature_name)
    return joined, averages


# --- Social graph: accounts following the user / accounts the user follows ---
follower_count = following['FollowedID'].value_counts().rename('FollowerCount')
following_count = following['FollowerID'].value_counts().rename('FollowingCount')

# --- Activity volume: posts authored, likes handed out, comments written ---
posts_created = entries['PostedBy'].value_counts().rename('PostsCreated')
likes_given = likes['userID'].value_counts().rename('LikesGiven')
comments_given = comments['PostedBy'].value_counts().rename('CommentsGiven')

# --- Engagement received: mean likes / comments per authored post ---
likes_per_post = likes.groupby('PostID').size().rename('LikesCount')
posts_with_likes, avg_likes_received = _mean_received_per_author(
    entries, likes_per_post, 'AvgLikesReceived'
)

comments_per_post = comments.groupby('PostID').size().rename('CommentsCount')
posts_with_comments, avg_comments_received = _mean_received_per_author(
    entries, comments_per_post, 'AvgCommentsReceived'
)

# Column order of the feature matrix follows the order of this list.
feature_list = [
    follower_count,
    following_count,
    posts_created,
    avg_likes_received,
    likes_given,
    avg_comments_received,
    comments_given,
]

# Align all features on the user identifier; a user missing from one source
# gets 0 for that feature.
anomaly_user_df = pd.concat(feature_list, axis=1).fillna(0)
anomaly_user_df.index.name = 'UserID'

# Standardise the features and wrap the result back into a labelled frame.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(anomaly_user_df)
scaled_anomaly_df = pd.DataFrame(
    scaled_features,
    index=anomaly_user_df.index,
    columns=anomaly_user_df.columns,
)

# Sanity check: print the per-column mean and standard deviation after scaling.
print("\n--- Verification: Check Mean and Standard Deviation of Scaled Data ---")
print(scaled_anomaly_df.describe().loc[['mean', 'std']])

In [None]:
# Outlier detection: run several detectors over the scaled user features and
# record, per detector, which users it flags.

# Share of users expected to be outliers. Every detector below is tuned to
# flag (roughly) this fraction, so adjust it from domain knowledge.
CONTAMINATION_RATE = 0.05

# Detector name -> list of flagged user IDs.
outlier_results = dict()

## 1. Statistical-based Approach: Multivariate Gaussian
print("1. Running Multivariate Gaussian (Elliptic Envelope)...")
# Fits a Gaussian-shaped envelope around the bulk of the data; points the
# model labels -1 lie outside that envelope.
envelope = EllipticEnvelope(
    contamination=CONTAMINATION_RATE,
    support_fraction=0.9,
    random_state=42,
)
envelope_labels = envelope.fit_predict(scaled_anomaly_df)
flagged_by_envelope = envelope_labels == -1
outlier_results['MultivariateGaussian'] = scaled_anomaly_df.index[flagged_by_envelope].tolist()



## 2. Clustering-based Approach: MiniBatchKMeans
print("2. Running Clustering-based (MiniBatchKMeans)...")
# Cluster the users, then treat the users lying farthest from their own
# cluster centroid as outliers.
n_clusters = 20  # hyperparameter: number of clusters to fit
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(scaled_anomaly_df)

# One centroid row per user: the centre of the cluster the user was assigned to.
assigned_centers = kmeans.cluster_centers_[kmeans.labels_]
distances_to_center = np.linalg.norm(scaled_anomaly_df - assigned_centers, axis=1)

# Users strictly beyond the (1 - CONTAMINATION_RATE) distance quantile are flagged.
distance_cutoff = np.quantile(distances_to_center, 1 - CONTAMINATION_RATE)
is_far_from_center = distances_to_center > distance_cutoff
outlier_results['MiniBatchKMeans'] = scaled_anomaly_df.index[is_far_from_center].tolist()

## 3. Distance-based Approach: Largest Distances
print("3. Running Distance-based (k-NN)...")
# Outliers are the points with the largest distance to their k-th nearest
# neighbour; the top CONTAMINATION_RATE share of those distances is flagged.
k = 5
model = NearestNeighbors(n_neighbors=k)
model.fit(scaled_anomaly_df)

# Query the fitted points WITHOUT passing them again: in that mode a point is
# not counted as its own neighbour. Passing the training frame explicitly
# makes column 0 the zero distance of each point to itself, so the last column
# would be the (k-1)-th neighbour instead of the k-th.
distances, _neighbor_ids = model.kneighbors()
kth_distances = distances[:, k - 1]  # distance to the k-th *other* point

threshold = np.quantile(kth_distances, 1 - CONTAMINATION_RATE)
outliers_indices = np.where(kth_distances > threshold)[0]
outlier_results['kNN_Distance'] = scaled_anomaly_df.index[outliers_indices].tolist()

## 4. Density-based Approach: LOF
print("4. Running Density-based (LOF)...")
# Local Outlier Factor compares the density around a point with the density
# around its neighbours; points labelled -1 sit in comparatively sparse regions.
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=CONTAMINATION_RATE,
    novelty=False,
)
lof_labels = lof.fit_predict(scaled_anomaly_df)
flagged_by_lof = lof_labels == -1
outlier_results['LOF'] = scaled_anomaly_df.index[flagged_by_lof].tolist()

## 5. Isolation-based Approach: iForest
print("5. Running Isolation-based (Isolation Forest)...")
# Isolation Forest splits the data on random features at random values;
# anomalies need fewer splits to be isolated and are labelled -1.
isolation_forest = IsolationForest(contamination=CONTAMINATION_RATE, random_state=42)
forest_labels = isolation_forest.fit_predict(scaled_anomaly_df)
flagged_by_forest = forest_labels == -1
outlier_results['IsolationForest'] = scaled_anomaly_df.index[flagged_by_forest].tolist()

## 6. One-Class SVM
print("6. Running One-Class SVM...")
# The one-class SVM learns a boundary around the majority of the data (RBF
# kernel, nu set to the contamination rate); points labelled -1 fall outside it.
ocsvm = OneClassSVM(nu=CONTAMINATION_RATE, kernel="rbf", gamma="auto")
predictions = ocsvm.fit_predict(scaled_anomaly_df)
flagged_by_ocsvm = predictions == -1
outlier_results['OneClassSVM'] = scaled_anomaly_df.index[flagged_by_ocsvm].tolist()

## 7. Reconstruction Error Approach: PCA
print("7. Running Reconstruction Error (PCA)...")
# Project the data onto a lower-dimensional PCA subspace and back again.
# Points that do not follow the dominant patterns are reconstructed poorly,
# so a large reconstruction error marks an outlier.
n_components = 5  # must be smaller than the number of features

# With as many components as features the reconstruction is (numerically)
# exact and the "errors" are pure floating-point noise, so fail loudly instead.
assert n_components < scaled_anomaly_df.shape[1], (
    "n_components must be smaller than the number of features"
)

# Seeded like the other detectors in this cell, so a solver that uses random
# numbers gives the same result on every run.
model = PCA(n_components=n_components, random_state=42)
transformed = model.fit_transform(scaled_anomaly_df)
reconstructed = model.inverse_transform(transformed)

# Per-user Euclidean distance between the original and reconstructed features.
reconstruction_error = np.linalg.norm(scaled_anomaly_df - reconstructed, axis=1)

# Flag users strictly beyond the (1 - CONTAMINATION_RATE) error quantile.
threshold = np.quantile(reconstruction_error, 1 - CONTAMINATION_RATE)
outliers_indices = np.where(reconstruction_error > threshold)[0]
outlier_results['PCA_Reconstruction'] = scaled_anomaly_df.index[outliers_indices].tolist()


# --- Aggregate and Summarize Results ---
print("\n" + "="*50)
print("--- Anomaly Detection Summary ---")

# Pool the flagged user IDs of every detector into one flat list, then count
# how many detectors flagged each user.
all_outliers = [user_id for method_outliers in outlier_results.values() for user_id in method_outliers]
outlier_counts = Counter(all_outliers)

# One row per flagged user, most frequently flagged first.
summary_df = (
    pd.DataFrame(outlier_counts.items(), columns=['UserID', 'Anomaly_Count'])
    .sort_values(by='Anomaly_Count', ascending=False)
    .reset_index(drop=True)
)

print(f"\nTotal unique users flagged as outliers: {len(summary_df)}")
print("\nTop 20 most anomalous users:")
print(summary_df.head(20))

# Detailed view: the scaled features of every user plus one boolean column
# per detector telling whether that detector flagged the user.
detailed_summary_df = scaled_anomaly_df.copy()
method_columns = list(outlier_results)
for method, outliers in outlier_results.items():
    detailed_summary_df[method] = detailed_summary_df.index.isin(outliers)

# Sum the detector flags by column NAME. A positional slice of the last
# len(outlier_results) columns degenerates to "all columns" when no detector
# has run (-0 == 0) and would then add up the scaled features instead.
detailed_summary_df['Anomaly_Count'] = detailed_summary_df[method_columns].sum(axis=1)
detailed_summary_df = detailed_summary_df.sort_values(by='Anomaly_Count', ascending=False)

print("\nDetailed breakdown for users flagged by 3 or more algorithms:")
print(detailed_summary_df[detailed_summary_df['Anomaly_Count'] >= 3])