# 🤖 FraudLens Model Training Notebook
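This notebook loads scraped Facebook page data, engineers text and engagement features for each post, and fuses a TF-IDF + logistic-regression text score, an Isolation Forest anomaly score, a simulated trust score, and recommendation sentiment into a single FraudLens fraud score per post.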

In [None]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
from pathlib import Path
import re


In [None]:
def clean_text(text):
    """Normalise a raw post/about string for vectorisation.

    The text is lower-cased, then three kinds of noise are removed in order:
    the "… see more" truncation marker, anything starting with ``http`` up to
    the next whitespace, and every character that is not a lowercase letter,
    digit, or whitespace. Leading and trailing whitespace is stripped from
    the result.
    """
    cleaned = text.lower()
    # Order matters: URLs must be removed while their punctuation is intact.
    for pattern in (r'… see more', r'http\S+', r'[^a-z0-9\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

def preprocess(df):
    """Add cleaned-text and engagement feature columns to ``df``.

    Reads the columns ``About``, ``Recommendation``, ``Post Content``,
    ``Comments`` and ``Reactions`` and writes:

    * ``About Cleaned`` / ``Cleaned Content`` -- ``clean_text`` applied to the
      about text and the post text (missing values become "").
    * ``Recommendation Sentiment`` -- 1 if the recommendation text contains
      "recommend" (case-insensitive), else 0.
    * ``Post Length`` -- character count of the cleaned post text.
    * ``Num Comments`` -- length of the comments container, 0 when missing.
    * ``Total Reactions`` -- sum of the reaction counts, 0 when missing.
    * ``Angry Ratio`` / ``Sad Ratio`` / ``Haha Ratio`` / ``Love Ratio`` --
      share of that reaction among all reactions, 0 when there are none.

    The frame is modified in place and also returned.
    """
    def _reaction_total(reactions):
        # Posts scraped without reaction data arrive as NaN/None, not a dict.
        return sum(reactions.values()) if isinstance(reactions, dict) else 0

    def _reaction_ratio(kind):
        def ratio(reactions):
            total = _reaction_total(reactions)
            # total == 0 covers both "no reactions" and "not a dict".
            return reactions.get(kind, 0) / total if total > 0 else 0
        return ratio

    df['About Cleaned'] = df['About'].fillna("").apply(clean_text)
    # NOTE(review): this is a plain substring test, so negated phrasing such
    # as "doesn't recommend" also yields 1 -- confirm that is intended.
    df['Recommendation Sentiment'] = df['Recommendation'].fillna("").apply(
        lambda x: 1 if "recommend" in x.lower() else 0
    )
    df['Cleaned Content'] = df['Post Content'].fillna("").apply(clean_text)
    df['Post Length'] = df['Cleaned Content'].apply(len)
    # A post with no comments may be NaN/None, which has no len(); count it as 0.
    df['Num Comments'] = df['Comments'].apply(
        lambda c: len(c) if hasattr(c, '__len__') else 0
    )
    df['Total Reactions'] = df['Reactions'].apply(_reaction_total)
    for kind in ('Angry', 'Sad', 'Haha', 'Love'):
        df[f'{kind} Ratio'] = df['Reactions'].apply(_reaction_ratio(kind))
    return df


In [None]:
# Load every scraped page (one JSON file each) into a single posts frame.
data_dir = Path("data")  # Adjust path if needed
combined_df = []     # one DataFrame of posts per page file
reviews_corpus = []  # non-empty review texts pooled across all pages

# glob() yields files in a filesystem-dependent order. Sorting keeps the row
# order stable across machines, which matters because the seeded random trust
# scores assigned later are aligned to rows by position.
json_files = sorted(data_dir.glob("*.json"))
if not json_files:
    # Without this, pd.concat([]) fails with an unhelpful
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No .json files found in '{data_dir}'")

for file in json_files:
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    page_reviews = data.get("Reviews", [])
    posts = pd.DataFrame(data["Posts"])
    # Page-level fields are broadcast onto every post of that page.
    posts["About"] = data.get("About", "")
    posts["Recommendation"] = data.get("Recommendation", "")
    posts["Reviews"] = [page_reviews] * len(posts)
    combined_df.append(posts)
    reviews_corpus.extend([r["Review"] for r in page_reviews if r.get("Review")])

df = pd.concat(combined_df, ignore_index=True)
df = preprocess(df)
print("✅ Loaded and preprocessed:", len(df), "posts")
df.head()


In [None]:
# Vectorize text
# The text model is fit on dummy labels (reviews = 0, posts = 1), so it needs
# samples of both classes; fail early with a clear message otherwise.
if not reviews_corpus:
    raise ValueError("reviews_corpus is empty: the text model needs at least one review")

tfidf = TfidfVectorizer(max_features=100)
X_text = tfidf.fit_transform(reviews_corpus + df['Cleaned Content'].tolist())
y_dummy = [0] * len(reviews_corpus) + [1] * len(df)
clf_text = LogisticRegression().fit(X_text, y_dummy)
# Column 1 of predict_proba is the probability of the "post" class (label 1).
df['Text_Prob'] = clf_text.predict_proba(tfidf.transform(df['Cleaned Content']))[:, 1]

# Anomaly detection
features = ['Post Length', 'Num Comments', 'Total Reactions', 'Angry Ratio', 'Sad Ratio', 'Haha Ratio', 'Love Ratio', 'Recommendation Sentiment']
X_behavior = df[features].fillna(0)
anomaly_model = IsolationForest(contamination=0.25, random_state=42)
# decision_function is negative for outliers; negate so higher = more anomalous.
df['Anomaly_Score'] = -anomaly_model.fit(X_behavior).decision_function(X_behavior)

# The negated decision function is centred near 0 and is negative for inliers,
# while every other fused component lies in [0, 1]. Rescale it to [0, 1] so
# the 0.35 weight and the 0.5 threshold below apply on a common scale. The raw
# score is kept in 'Anomaly_Score' unchanged.
anomaly_min = df['Anomaly_Score'].min()
anomaly_span = df['Anomaly_Score'].max() - anomaly_min
if anomaly_span > 0:
    df['Anomaly_Score_Norm'] = (df['Anomaly_Score'] - anomaly_min) / anomaly_span
else:
    # All posts scored identically: no post is more anomalous than another.
    df['Anomaly_Score_Norm'] = 0.0

# Simulate blockchain trust
np.random.seed(42)
df['Trust_Score'] = np.random.uniform(0.5, 1.0, len(df))

# Final score fusion (weights sum to 1, so the result stays within [0, 1])
# NOTE(review): a positive recommendation currently *raises* the fraud score;
# confirm that direction is intended.
df['FraudLens_Score'] = (
    0.35 * df['Text_Prob'] +
    0.35 * df['Anomaly_Score_Norm'] +
    0.2 * (1 - df['Trust_Score']) +
    0.1 * df['Recommendation Sentiment']
)
df['Fraud_Prediction'] = (df['FraudLens_Score'] > 0.5).astype(int)
df[['Post Content', 'FraudLens_Score', 'Fraud_Prediction']].head()


In [None]:
# Write the full frame to CSV in the working directory; index=False keeps
# the pandas row index out of the file.
df.to_csv(path_or_buf="fraudlens_training_predictions.csv", index=False)
print("✅ Predictions saved to 'fraudlens_training_predictions.csv'")
