# Fake Social Media Account Detection – Proof of Concept (POC)

*By Hemanth Vignesh and Srujitha Jasmine*

**Goal:** Build and evaluate multiple supervised learning models to detect **fake social media accounts** using a custom dataset (`fake_dataset.xlsx`).

This POC will:
1. Load and explore the dataset  
2. Perform basic data cleaning and preprocessing  
3. Train and compare different classification models  
4. Select a final model (Gradient Boosting) based on performance  
5. Summarize insights for deployment (used later in `train_model.py` and Streamlit app)


In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

import joblib

In [None]:
# Location of the dataset; change this if the file does not sit next to the notebook
DATA_PATH = "fake_dataset.xlsx"

# Read the spreadsheet into a DataFrame and preview its first rows
df = pd.read_excel(io=DATA_PATH)
df.head(n=5)

In [None]:
# Structural overview of the raw data: size, column names, dtypes and gaps.
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nMissing values per column:")
print(df.isnull().sum())

In [None]:
# Try to detect the target column (same as app/train script logic)
possible_targets = ["is_fake", "label", "fake", "target", "class", "isbot", "bot"]

# First choice: the first column (in file order) whose name is a known label name.
# str() guards against non-string headers (e.g. integer column names from Excel).
target_col = next(
    (c for c in df.columns if str(c).strip().lower() in possible_targets),
    None,
)

if target_col is None:
    # Fallback: the first column with exactly 2 distinct non-null values.
    # NOTE: this is a heuristic and may pick a binary feature instead of the
    # label, so check the printed name below before trusting the results.
    target_col = next(
        (c for c in df.columns if df[c].dropna().nunique() == 2),
        None,
    )

if target_col is None:
    # Fail with an actionable message instead of a KeyError on df[None].
    raise ValueError(
        "Could not detect a target column. Expected one of "
        f"{possible_targets} or a column with exactly two unique values; "
        f"found columns: {df.columns.tolist()}"
    )

print("Detected target column:", target_col)

# Visualise the class balance of the detected target
df[target_col].value_counts().plot(kind="bar")
plt.title(f"Class distribution for '{target_col}'")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

In [None]:
df_clean = df.copy()

# Rows without a label cannot be used for supervised learning; drop them
# rather than inventing labels through median/mode imputation.
rows_before = len(df_clean)
df_clean = df_clean.dropna(subset=[target_col])
if len(df_clean) != rows_before:
    print(f"Dropped {rows_before - len(df_clean)} rows with a missing target")

# Drop high-cardinality or non-useful text columns explicitly
cols_to_drop = set()

# 'username' is usually too unique → remove
if "username" in df_clean.columns:
    cols_to_drop.add("username")

# Drop any obvious IDs if present. Names are matched on whole
# underscore/hyphen/space-separated tokens so that columns such as "video_count"
# or "valid" are not removed just because they contain the letters "id".
id_tokens = {"id", "userid", "accountid", "uuid", "handle"}
for c in df_clean.columns:
    if c == target_col:
        # Never drop the label column, whatever it is called
        continue
    tokens = str(c).lower().replace("-", "_").replace(" ", "_").split("_")
    if id_tokens.intersection(tokens):
        cols_to_drop.add(c)

cols_to_drop = sorted(cols_to_drop, key=str)
print("Dropping columns:", cols_to_drop)

df_clean = df_clean.drop(columns=cols_to_drop, errors="ignore")

# Handle missing values (simple strategy, different from production pipeline).
# Only feature columns are imputed; the target is left untouched.
feature_cols = [c for c in df_clean.columns if c != target_col]
numeric_cols = df_clean[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df_clean[feature_cols].select_dtypes(exclude=[np.number]).columns.tolist()

print("\nNumeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

# Fill numeric NaNs with median
if numeric_cols:
    df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].median())

# Fill categorical NaNs with mode
for c in categorical_cols:
    if df_clean[c].isnull().any():
        modes = df_clean[c].mode()
        # mode() is empty for an all-NaN column; leave such a column as-is
        # instead of failing with an IndexError.
        if not modes.empty:
            df_clean[c] = df_clean[c].fillna(modes.iloc[0])

# Remaining missing values per column (expected to be zero)
df_clean.isnull().sum()

In [None]:
# Define X (features) and y (label)
X = df_clean.drop(columns=[target_col])
y = df_clean[target_col]

# Convert the target to 0/1 integers.
# Checking for "not numeric" (instead of dtype == object) also covers pandas
# string and categorical dtypes, which would otherwise skip the conversion.
if pd.api.types.is_bool_dtype(y):
    y = y.astype(int)
elif not pd.api.types.is_numeric_dtype(y):
    print("Converting string labels to 0/1...")
    positive_labels = {"fake", "1", "yes", "true"}
    # Anything not recognised as a positive label (including missing values)
    # becomes 0; surrounding whitespace and letter case are ignored.
    y = y.astype(str).str.strip().str.lower().isin(positive_labels).astype(int)

print("y value counts:")
print(y.value_counts())

# A single-class target usually means the label values were not recognised
# above; stop here with a clear message rather than failing later in training.
if y.nunique() < 2:
    raise ValueError(
        f"Target column '{target_col}' has fewer than two classes after "
        "conversion; check the label values in the dataset."
    )

# One-hot encode categorical columns (e.g., platform)
X_encoded = pd.get_dummies(X, drop_first=True)
print("Shape after encoding:", X_encoded.shape)

X_encoded.head()

In [None]:
# Fixed seed so the train/test partition is reproducible
RANDOM_STATE = 42

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE
)

# Shapes of the four resulting pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
scaler = StandardScaler()

# Fit the scaler on the training split only, then apply it to both splits.
# The scaled values are wrapped in new DataFrames (same columns and index)
# instead of being written into copies via .loc[:, :]: the copies keep the
# original int/bool column dtypes, and assigning float values into them is
# deprecated by pandas and can warn or fail depending on the version.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

X_train_scaled.head()

In [None]:
def evaluate_predictions(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one set of predictions.

    zero_division=0 makes precision/recall/F1 report 0 (instead of emitting a
    warning) when a model never predicts the positive class.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }


# Metrics per model name, filled in below and shown as a table at the end
results = {}

# 1. Logistic Regression (trained on the standardised features)
# n_jobs is intentionally not passed: it has no effect on this binary fit
# and newer scikit-learn releases deprecate the parameter.
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train_scaled, y_train)
y_pred_lr = log_reg.predict(X_test_scaled)
results["LogisticRegression"] = evaluate_predictions(y_test, y_pred_lr)

# 2. Random Forest (tree models use the unscaled features)
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results["RandomForest"] = evaluate_predictions(y_test, y_pred_rf)

# 3. XGBoost
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    eval_metric="logloss",
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
results["XGBoost"] = evaluate_predictions(y_test, y_pred_xgb)

# 4. Gradient Boosting (this is the one we finally use in deployment)
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=RANDOM_STATE,
)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
results["GradientBoosting"] = evaluate_predictions(y_test, y_pred_gb)

# One row per model, one column per metric
pd.DataFrame(results).T

In [None]:
# Confusion matrix of the Gradient Boosting predictions against the held-out labels
cm = confusion_matrix(y_test, y_pred_gb)
disp = ConfusionMatrixDisplay(cm)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix – Gradient Boosting")
plt.show()

# Per-class precision / recall / F1 as a text report
gb_report = classification_report(y_test, y_pred_gb)
print("Classification report – Gradient Boosting:", gb_report, sep="\n")

In [None]:
# Pair each encoded feature name with the importance the fitted
# Gradient Boosting model assigned to it
feature_names = X_encoded.columns
importances = gb.feature_importances_

fi_df = pd.DataFrame({"feature": feature_names, "importance": importances})
fi_df = fi_df.sort_values(by="importance", ascending=False)

# Top 15 features, most important first
fi_df.head(15)

## Summary of POC

- Dataset: `fake_dataset.xlsx` with features such as:
  - `platform`, `has_profile_pic`, `bio_length`, `followers`, `following`,
    `follower_following_ratio`, `account_age_days`, `posts`, `posts_per_day`,
    `caption_similarity_score`, `content_similarity_score`, `follow_unfollow_rate`,
    `spam_comments_rate`, `generic_comment_rate`, `suspicious_links_in_bio`,
    `verified`, `username_length`, `digits_count`, `digit_ratio`, `special_char_count`,
    `repeat_char_count`, etc.
- Target variable: **`is_fake`** (binary: real vs fake)

### Modelling Approach in POC

1. Performed basic cleaning:
   - Dropped high-cardinality `username` and ID-like columns
   - Filled missing numeric values with median
   - Filled missing categorical values with mode
2. One-hot encoded categorical fields (e.g., `platform`) using `pd.get_dummies`.
3. Compared four supervised classifiers:
   - Logistic Regression (with scaling)
   - Random Forest
   - XGBoost
   - Gradient Boosting
4. Evaluated using accuracy, precision, recall, F1-score, and confusion matrix.

**Final Choice:**  
Gradient Boosting gave a strong balance of performance and interpretability,  
so it was selected as the final model family for deployment.  

In the **deployment phase**, a more robust training script (`train_model.py`)  
is used, which:
- Builds a reusable preprocessing pipeline (imputation, scaling, encoding)
- Saves the final model as `outputs/best_model.joblib`
- Is consumed by the Streamlit app (`app_fake_checker.py`) for live predictions.