In [3]:
import json

# -----------------------------
# Load & Clean Data
# -----------------------------
def load_data(filename):
    """Load the dataset from a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON is exchanged as UTF-8; reading with the platform's locale encoding
    # would corrupt or reject non-ASCII names on some systems.
    with open(filename, "r", encoding="utf-8") as file:
        return json.load(file)

def clean_data(data):
    """Clean the dataset in place and return it.

    The steps run in this order:

    1. Drop users whose name is absent, not a string, or blank.
    2. Remove repeated ids from each remaining user's friend list, keeping
       the first occurrence so the original order is preserved.
    3. Drop users that have neither friends nor liked pages.
    4. Collapse pages that share an id; the last record for an id wins and
       it keeps the position where that id first appeared.

    Args:
        data: Mapping with a "users" list and a "pages" list. Each user is a
            dict with "friends" and "liked_pages" lists (and normally a
            "name"); each page is a dict with an "id".

    Returns:
        The same ``data`` object, with "users" and "pages" replaced by the
        cleaned lists.
    """

    def has_name(user):
        # .get() + isinstance: a missing key or a null name counts as
        # "no name" instead of raising KeyError / AttributeError.
        name = user.get("name")
        return isinstance(name, str) and bool(name.strip())

    named_users = [user for user in data["users"] if has_name(user)]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (a set round-trip is not order-preserving).
    for user in named_users:
        user["friends"] = list(dict.fromkeys(user["friends"]))

    # A user with no friends and no liked pages is considered inactive.
    data["users"] = [
        user for user in named_users
        if user["friends"] or user["liked_pages"]
    ]

    # Later records overwrite earlier ones with the same id.
    pages_by_id = {page["id"]: page for page in data["pages"]}
    data["pages"] = list(pages_by_id.values())

    return data

# -----------------------------
# Display Functions
# -----------------------------
def display_summary(data):
    """Print an overview of the dataset to standard output.

    Shows the total number of users and pages, then at most the first five
    users (name, id, friend count, liked-page count) and at most the first
    five pages (id and name).

    Args:
        data: Mapping with a "users" list and a "pages" list.
    """
    print("\n===== Dataset Summary =====")
    users = data["users"]
    print(f"Total Users : {len(users)}")
    pages = data["pages"]
    print(f"Total Pages : {len(pages)}")

    # Only a small sample is shown so the summary stays short.
    print("\nSample Users:")
    for person in users[:5]:
        name = person["name"]
        uid = person["id"]
        friend_total = len(person["friends"])
        like_total = len(person["liked_pages"])
        print(f"- {name} (ID: {uid}), Friends: {friend_total}, Liked Pages: {like_total}")

    print("\nSample Pages:")
    for entry in pages[:5]:
        page_id = entry["id"]
        page_name = entry["name"]
        print(f"- {page_id}: {page_name}")

# -----------------------------
# Analysis Functions
# -----------------------------
def most_connected_user(data):
    """Return the name and friend count of the user with the most friends.

    When several users share the highest count, the one that appears first
    in ``data["users"]`` is returned.

    Args:
        data: Mapping with a "users" list; each user has "name" and "friends".

    Returns:
        A ``(name, friend_count)`` tuple, or ``(None, 0)`` when there are no
        users at all.
    """
    # default=None keeps max() from raising ValueError on an empty user list
    # (which can happen after cleaning removes every user).
    top = max(data["users"], key=lambda u: len(u["friends"]), default=None)
    if top is None:
        return None, 0
    return top["name"], len(top["friends"])

def most_popular_page(data):
    """Return the name of the page liked by the most users, with that count.

    Each user contributes at most one like per page, even if the same page id
    is repeated in their "liked_pages" list. On a tie, the page that was
    encountered first while scanning the users wins.

    Args:
        data: Mapping with a "users" list (each user has "liked_pages", a
            list of page ids) and a "pages" list (each page has "id" and
            "name").

    Returns:
        A ``(page_name, user_count)`` tuple. ``(None, 0)`` when nobody likes
        any page. If the winning id has no entry in ``data["pages"]`` the
        name is the placeholder ``"Unknown page <id>"``.
    """
    like_counts = {}
    for user in data["users"]:
        # dict.fromkeys drops repeated ids (order kept) so the total really is
        # "number of users", not "number of list entries".
        for page_id in dict.fromkeys(user["liked_pages"]):
            like_counts[page_id] = like_counts.get(page_id, 0) + 1

    if not like_counts:
        return None, 0

    top_id = max(like_counts, key=like_counts.get)
    # Supplying a default prevents StopIteration from escaping when a liked
    # id has no matching page record.
    page_name = next(
        (page["name"] for page in data["pages"] if page["id"] == top_id),
        f"Unknown page {top_id}",
    )
    return page_name, like_counts[top_id]

# -----------------------------
# Recommendation: People You May Know
# -----------------------------
def find_people_you_may_know(user_id, data):
    """Suggest friends-of-friends for a user, ranked by mutual friend count.

    A candidate is anyone listed as a friend of one of the user's friends,
    excluding the user and the people they are already friends with. The
    score of a candidate is the number of the user's friends that list them.

    Args:
        user_id: Id of the user to compute suggestions for.
        data: Mapping with a "users" list; each user has "id" and "friends".

    Returns:
        A list of ``(candidate_id, mutual_count)`` tuples sorted by count,
        highest first. Empty when ``user_id`` is not in the dataset.
    """
    friends_of = {}
    for person in data["users"]:
        friends_of[person["id"]] = set(person["friends"])

    if user_id not in friends_of:
        return []

    own_friends = friends_of[user_id]
    mutual_counts = {}

    for friend_id in own_friends:
        # Friends that are not themselves in the dataset contribute nothing.
        for candidate in friends_of.get(friend_id, []):
            if candidate == user_id or candidate in own_friends:
                continue
            mutual_counts[candidate] = mutual_counts.get(candidate, 0) + 1

    ranked = list(mutual_counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# -----------------------------
# Recommendation: Pages You Might Like
# -----------------------------
def find_pages_you_might_like(user_id, data):
    """Suggest pages for a user based on overlap with other users' likes.

    Every page liked by another user but not by the target user becomes a
    candidate. Each other user adds, to each of their candidate pages, the
    number of pages they have in common with the target user; this can be
    zero, so candidates with a score of 0 are still returned.

    Args:
        user_id: Id of the user to compute suggestions for.
        data: Mapping with a "users" list; each user has "id" and
            "liked_pages".

    Returns:
        A list of ``(page_id, score)`` tuples sorted by score, highest first.
        Empty when ``user_id`` is not in the dataset.
    """
    likes_by_user = {}
    for person in data["users"]:
        likes_by_user[person["id"]] = set(person["liked_pages"])

    if user_id not in likes_by_user:
        return []

    own_likes = likes_by_user[user_id]
    scores = {}

    for other_id, other_likes in likes_by_user.items():
        if other_id == user_id:
            continue
        # Weight of this user's opinion = how many likes we have in common.
        overlap = len(own_likes & other_likes)
        for page_id in other_likes:
            if page_id in own_likes:
                continue
            scores[page_id] = scores.get(page_id, 0) + overlap

    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# -----------------------------
# Run the Analysis
# -----------------------------
if __name__ == "__main__":
    # Load the raw dataset and normalise it before any analysis.
    data = load_data("massive_data.json")  # ✅ load your dataset
    data = clean_data(data)

    # Summary
    display_summary(data)

    # Insights
    user, friends = most_connected_user(data)
    print(f"\nMost Connected User: {user} ({friends} friends)")

    page, count = most_popular_page(data)
    print(f"Most Popular Page : {page} (liked by {count} users)")

    # Recommendations for a sample user
    user_id = 1  # Example: Amit
    print(f"\n--- Recommendations for User {user_id} ---")

    # Build id -> name lookups once instead of rescanning the lists for every
    # printed row. Iterating in reverse makes the first record for an id win,
    # matching a front-to-back search.
    user_names = {u["id"]: u["name"] for u in reversed(data["users"])}
    page_names = {p["id"]: p["name"] for p in reversed(data["pages"])}

    people = find_people_you_may_know(user_id, data)
    print("People You May Know:")
    for pid, score in people[:5]:
        # Friend lists may reference ids that are not (or no longer) in the
        # cleaned user list; fall back to a placeholder instead of crashing.
        name = user_names.get(pid, "Unknown")
        print(f"- {name} (ID {pid}), Mutual Friends: {score}")

    pages = find_pages_you_might_like(user_id, data)
    print("\nPages You Might Like:")
    for pgid, score in pages[:5]:
        # Same fallback for liked page ids that have no page record.
        name = page_names.get(pgid, "Unknown")
        print(f"- {name} (ID {pgid}), Score: {score}")



===== Dataset Summary =====
Total Users : 30
Total Pages : 27

Sample Users:
- Amit (ID: 1), Friends: 5, Liked Pages: 2
- Priya (ID: 2), Friends: 5, Liked Pages: 2
- Rahul (ID: 3), Friends: 5, Liked Pages: 2
- Sara (ID: 4), Friends: 5, Liked Pages: 1
- Neha (ID: 5), Friends: 5, Liked Pages: 2

Sample Pages:
- 101: Python Developers
- 102: Data Science Enthusiasts
- 103: AI & ML Community
- 104: Web Dev Hub
- 105: Blockchain Innovators

Most Connected User: Amit (5 friends)
Most Popular Page : Python Developers (liked by 3 users)

--- Recommendations for User 1 ---
People You May Know:
- Kunal (ID 7), Mutual Friends: 2
- Anjali (ID 8), Mutual Friends: 2
- Ravi (ID 9), Mutual Friends: 1
- Sneha (ID 10), Mutual Friends: 1
- Arjun (ID 11), Mutual Friends: 1

Pages You Might Like:
- AI & ML Community (ID 103), Score: 2
- Blockchain Innovators (ID 105), Score: 1
- Cloud Computing Pros (ID 107), Score: 1
- Web Dev Hub (ID 104), Score: 0
- Cybersecurity Experts (ID 106), Score: 0
