In [40]:
username="kojied"

In [35]:
from data_fetcher import fetch_user_posts, fetch_user_comments
from reddit_client import reddit
import pandas as pd
user = reddit.redditor(username)
posts=fetch_user_posts(user, limit=None)
comments=fetch_user_comments(user, limit=None)
activity_df = pd.DataFrame(posts + comments)
activity_df.to_csv(f"Data/{username}_activity.csv")

In [36]:
import pandas as pd
import re
import nltk
from datetime import datetime
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nrclex import NRCLex
from detoxify import Detoxify
from bertopic import BERTopic
from collections import defaultdict, Counter

# === NLTK Resources ===
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("vader_lexicon")

# === Load & Clean ===
df = pd.read_csv("Data/kojied_activity.csv")
df["text"] = df["text"].fillna("")
df["title"] = df["title"].fillna("")
df["content"] = df["title"] + ". " + df["text"]
stop_words = set(stopwords.words("english"))

# === Sentiment & Emotion ===
vader = SentimentIntensityAnalyzer()
df["sentiment_score"] = df["content"].apply(lambda x: vader.polarity_scores(x)["compound"])
df["toxicity"] = df["content"].apply(lambda x: Detoxify("original").predict(x)["toxicity"])

def extract_emotion(text):
    emotions = NRCLex(text).top_emotions
    return emotions[0] if emotions else ("neutral", 0)

df[["top_emotion", "emotion_score"]] = df["content"].apply(lambda x: pd.Series(extract_emotion(x)))

# === Topic Modeling ===
print("Running topic modeling...")
topic_model = BERTopic(verbose=False)
topics, _ = topic_model.fit_transform(df["content"].tolist())
df["topic"] = topics

# === Tokenize for Trait Analysis ===
def preprocess(text):
    text = text.lower()
    tokens = word_tokenize(text)
    return [w for w in tokens if w.isalpha() and w not in stop_words]

df["tokens"] = df["content"].apply(preprocess)
tokens = [word for row in df["tokens"] for word in row]
word_counts = Counter(tokens)

# === Big Five Traits (OCEAN) ===
OCEAN = {
    "openness": {"imagine", "explore", "curious", "creative", "adventure", "philosophy", "novel"},
    "conscientiousness": {"plan", "organize", "routine", "discipline", "goal", "work", "task"},
    "extraversion": {"party", "fun", "people", "chat", "friends", "energy", "meet"},
    "agreeableness": {"kind", "thank", "help", "please", "cooperate", "gentle", "appreciate"},
    "neuroticism": {"worry", "anxious", "sad", "angry", "regret", "fear", "nervous"},
}
trait_scores = {trait: sum(word_counts.get(w, 0) for w in words) for trait, words in OCEAN.items()}
total_words = len(tokens)
trait_normalized = {k: round(v / total_words, 4) for k, v in trait_scores.items()}

# === MBTI ===
MBTI = {"I": 0, "E": 0, "S": 0, "N": 0, "T": 0, "F": 0, "J": 0, "P": 0}
patterns = {
    "I": {"alone", "quiet", "reflect"},
    "E": {"chat", "talk", "meet", "group"},
    "S": {"facts", "detail", "experience"},
    "N": {"imagine", "abstract", "dream", "future"},
    "T": {"think", "logic", "reason"},
    "F": {"feel", "care", "emotion"},
    "J": {"plan", "organized", "decide"},
    "P": {"flexible", "explore", "spontaneous"},
}
for key, words in patterns.items():
    MBTI[key] = sum(word_counts.get(w, 0) for w in words)
mbti_final = (
    "I" if MBTI["I"] >= MBTI["E"] else "E"
) + (
    "S" if MBTI["S"] >= MBTI["N"] else "N"
) + (
    "T" if MBTI["T"] >= MBTI["F"] else "F"
) + (
    "J" if MBTI["J"] >= MBTI["P"] else "P"
)

# === Find Extremes ===
most_pos = df.loc[df["sentiment_score"].idxmax()]
most_neg = df.loc[df["sentiment_score"].idxmin()]
most_toxic = df.loc[df["toxicity"].idxmax()]
top_emotion = df.loc[df["emotion_score"].idxmax()]

# === Extract Age, Occupation from text if possible ===
def extract_age(text):
    match = re.search(r"\b(\d{1,2})\s*(years old|yo)\b", text.lower())
    return match.group(1) if match else None

def extract_occupation(text):
    jobs = ["student", "developer", "designer", "engineer", "teacher", "artist", "manager"]
    for word in text.lower().split():
        if word in jobs:
            return word
    return None

age = next((extract_age(c) for c in df["content"] if extract_age(c)), "Unknown")
occupation = next((extract_occupation(c) for c in df["content"] if extract_occupation(c)), "Unknown")

# === Save Qualitative Persona ===
with open("UserProfile/qualitative_persona.txt", "w", encoding="utf-8") as f:
    f.write("=== Qualitative Persona ===\n")
    f.write(f"Generated: {datetime.utcnow().isoformat()}\n\n")
    f.write(f"🧑 Name: {df.get('author', pd.Series(['Unknown'])).iloc[0]}\n")
    f.write(f"📍 Age: {age or 'Unknown'}\n")
    f.write(f"💼 Occupation: {occupation or 'Unknown'}\n")
    f.write("🌍 Location: Unknown\n\n")

    f.write("📝 Background:\n")
    f.write("User appears active on Reddit, posting about diverse themes like ")
    top_words = [word for word, _ in word_counts.most_common(5)]
    f.write(", ".join(top_words) + ".\n\n")

    f.write("🎯 Goals & Needs:\n")
    f.write("- Personal growth and self-improvement.\n" if "improve" in tokens else "- Unknown\n")
    f.write("- Possibly career-focused.\n" if "work" in tokens or "study" in tokens else "")
    f.write("\n\n")

    f.write("😣 Pain Points:\n")
    f.write(f"- \"{most_neg['content'][:100]}...\"\n" if most_neg["sentiment_score"] < -0.4 else "- Not strongly negative.\n")
    f.write(f"- \"{most_toxic['content'][:100]}...\"\n" if most_toxic["toxicity"] > 0.5 else "")
    f.write("\n\n")

    f.write("🧠 Personality Traits (OCEAN):\n")
    for trait, score in trait_normalized.items():
        f.write(f"- {trait.title()}: {score}\n")
    f.write(f"- MBTI Estimate: {mbti_final}\n\n")

    f.write("💡 Motivations:\n")
    if top_emotion["emotion_score"] > 0.3:
        f.write(f"- Dominant Emotion: {top_emotion['top_emotion']} – \"{top_emotion['content'][:80]}...\"\n")
    else:
        f.write("- No strong emotion found.\n")

    f.write("\n📱 Tech Behavior:\n")
    f.write("- Uses Reddit actively. Mentions YouTube, Instagram if found.\n")
    if "reddit" in tokens:
        f.write("- Engages in online discussions.\n")
    f.write("\n")
    f.write("🗣️ Representative Quote:\n")
    f.write(f"\"{most_pos['content'][:180]}...\"\n")

print("✅ Qualitative persona saved to qualitative_persona.txt")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sibaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sibaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sibaj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Running topic modeling...
✅ Qualitative persona saved to qualitative_persona.txt


In [37]:
import pandas as pd
from datetime import datetime
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from detoxify import Detoxify
from nrclex import NRCLex
from bertopic import BERTopic
import nltk

nltk.download("vader_lexicon")
nltk.download("punkt")

# === Load CSV ===
df = pd.read_csv("Data/kojied_activity.csv")  # Replace with your filename
df["text"] = df["text"].fillna("")  # Ensure no NaN

# === Combine title and text for better topic modeling ===
df["content"] = df["title"].fillna("") + ". " + df["text"].fillna("")

# === Prepare a helper to fetch ID ===
def get_id(row):
    return row["id"]

# === Sentiment Analysis ===
vader = SentimentIntensityAnalyzer()
df["sentiment_score"] = df["content"].apply(lambda x: vader.polarity_scores(str(x))["compound"])

# === Toxicity Detection ===
df["toxicity"] = df["content"].apply(lambda x: Detoxify("original").predict(str(x))["toxicity"])

# === Emotion Detection ===
def top_emotion(text):
    emotions = NRCLex(text).top_emotions
    return emotions[0] if emotions else ("neutral", 0)

df[["top_emotion", "emotion_score"]] = df["content"].apply(lambda x: pd.Series(top_emotion(x)))

# === Topic Modeling ===
print("Running BERTopic topic modeling...")
topic_model = BERTopic(verbose=True)
topics, _ = topic_model.fit_transform(df["content"].tolist())
df["topic"] = topics
topic_info = topic_model.get_topic_info()

# === Extract Most Relevant Posts/Comments Per Trait ===
most_pos = df.loc[df["sentiment_score"].idxmax()]
most_neg = df.loc[df["sentiment_score"].idxmin()]
most_toxic = df.loc[df["toxicity"].idxmax()]

emotions = {}
for emotion in df["top_emotion"].unique():
    top_row = df[df["top_emotion"] == emotion].sort_values("emotion_score", ascending=False).head(1)
    if not top_row.empty:
        top = top_row.iloc[0]
        emotions[emotion] = {
            "text": top["content"],
            "id": get_id(top),
            "score": round(top["emotion_score"], 3)
        }

topic_examples = {}
for topic_id in df["topic"].unique():
    if topic_id == -1:
        continue
    top_topic = df[df["topic"] == topic_id].sort_values("score", ascending=False).head(1)
    if not top_topic.empty:
        top = top_topic.iloc[0]
        topic_examples[str(topic_id)] = {
            "topic_words": topic_model.get_topic(topic_id),
            "example_text": top["content"],
            "id": get_id(top),
            "score": top["score"]
        }

# === Save to TXT instead of JSON ===
with open("UserProfile/persona_profile.txt", "w", encoding="utf-8") as f:
    f.write("=== Persona Profile Report ===\n")
    f.write(f"Generated on: {datetime.utcnow().isoformat()}\n\n")
    
    f.write(f"Posts Analyzed: {len(df)}\n\n")

    # Sentiment
    f.write(">> Sentiment Analysis:\n")
    f.write(f"Average Sentiment Score: {round(df['sentiment_score'].mean(), 3)}\n")
    f.write(f"Most Positive (Score {most_pos['sentiment_score']}): [{get_id(most_pos)}]\n{most_pos['content']}\n\n")
    f.write(f"Most Negative (Score {most_neg['sentiment_score']}): [{get_id(most_neg)}]\n{most_neg['content']}\n\n")

    # Toxicity
    f.write(">> Toxicity:\n")
    f.write(f"Average Toxicity Score: {round(df['toxicity'].mean(), 3)}\n")
    f.write(f"Most Toxic (Score {most_toxic['toxicity']}): [{get_id(most_toxic)}]\n{most_toxic['content']}\n\n")

    # Emotions
    f.write(">> Emotion Highlights:\n")
    for emotion, details in emotions.items():
        f.write(f"{emotion.title()} (Score {details['score']}): [{details['id']}]\n{details['text']}\n\n")

    # Topics
    f.write(">> Topic Insights:\n")
    for topic_id, details in topic_examples.items():
        f.write(f"Topic {topic_id} Keywords: {', '.join([word for word, _ in details['topic_words']])}\n")
        f.write(f"Top Example (Score {details['score']}): [{details['id']}]\n{details['example_text']}\n\n")

print("✅ Profile saved to persona_profile.txt")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sibaj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sibaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2025-07-15 21:51:38,368 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic topic modeling...


Batches: 100%|██████████| 12/12 [00:02<00:00,  4.41it/s]
2025-07-15 21:51:44,916 - BERTopic - Embedding - Completed ✓
2025-07-15 21:51:44,916 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-15 21:51:45,145 - BERTopic - Dimensionality - Completed ✓
2025-07-15 21:51:45,147 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-15 21:51:45,166 - BERTopic - Cluster - Completed ✓
2025-07-15 21:51:45,169 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-15 21:51:45,219 - BERTopic - Representation - Completed ✓


✅ Profile saved to persona_profile.txt


In [38]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict, Counter
from datetime import datetime

nltk.download("punkt")
nltk.download("stopwords")

# === Load CSV ===
df = pd.read_csv("Data/kojied_activity.csv")
df["text"] = df["text"].fillna("")
df["content"] = df["title"].fillna("") + ". " + df["text"].fillna("")

# === Preprocessing ===
stop_words = set(stopwords.words("english"))

def preprocess(text):
    tokens = word_tokenize(str(text).lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]

df["tokens"] = df["content"].apply(preprocess)

# === Define Behavior Categories ===
behaviors = {
    "work_study": {"study", "exam", "task", "goal", "work", "deadline", "project", "submit", "assignment", "focus"},
    "home_life": {"home", "family", "clean", "cook", "chores", "house", "room", "kitchen", "laundry"},
    "entertainment": {"movie", "anime", "game", "binge", "netflix", "series", "watch", "film", "tv", "play", "meme"},
    "social_life": {"party", "hangout", "friend", "group", "talk", "meet", "invite", "event"},
    "values_beliefs": {"god", "faith", "religion", "ethics", "moral", "belief", "spiritual"},
    "health_fitness": {"gym", "workout", "exercise", "diet", "run", "protein", "calorie", "fitness", "training"},
    "brand_behavior": {"nike", "apple", "amazon", "brand", "shop", "purchase", "wear", "logo", "store"},
    "online_activity": {"reddit", "youtube", "scroll", "comment", "post", "stream", "instagram", "social", "media"},
    "travel": {"trip", "travel", "flight", "train", "road", "vacation", "passport", "journey", "airport", "adventure"},
    "self_improvement": {"motivation", "growth", "improve", "discipline", "change", "habit", "learn"},
    "emotions": {"happy", "sad", "angry", "regret", "anxious", "depressed", "excited", "lonely"},
    "finance": {"money", "buy", "spend", "save", "budget", "stock", "invest", "purchase", "salary", "debt", "bank"},
    "relationships": {"love", "relationship", "girlfriend", "boyfriend", "dating", "crush", "romantic", "partner", "ex"},
    "daily_routine": {"breakfast", "coffee", "sleep", "nap", "wake", "bed", "dream", "morning", "night", "alarm"},
    "identity_expression": {"gender", "race", "lgbtq", "identity", "culture", "indian", "asian", "american"},
}

# === Analyze Behavior ===
behavior_matches = defaultdict(list)
behavior_scores = defaultdict(int)

for idx, row in df.iterrows():
    tokens = set(row["tokens"])
    for category, keywords in behaviors.items():
        match_count = len(tokens & keywords)
        if match_count > 0:
            behavior_scores[category] += match_count
            behavior_matches[category].append((row["id"], row["content"], match_count))

# === Normalize Frequencies ===
total_hits = sum(behavior_scores.values())
behavior_normalized = {
    k: round(v / total_hits, 3) if total_hits > 0 else 0
    for k, v in behavior_scores.items()
}

# === Sort & select top 3 examples per category ===
top_examples = {
    category: sorted(posts, key=lambda x: -x[2])[:3]
    for category, posts in behavior_matches.items()
}

# === Save to TXT Report ===
with open("UserProfile/behavior_profile.txt", "w", encoding="utf-8") as f:
    f.write("=== Behavior & Habit Profile ===\n")
    f.write(f"Generated: {datetime.utcnow().isoformat()}\n")
    f.write(f"Posts Analyzed: {len(df)}\n\n")

    f.write(">> Behavioral Category Frequency (normalized):\n")
    for category in sorted(behavior_normalized, key=behavior_normalized.get, reverse=True):
        f.write(f"{category.replace('_', ' ').title():<22}: {behavior_normalized[category]}\n")
    f.write("\n")

    for category in sorted(behaviors.keys()):
        f.write(f">> {category.upper()} ({behavior_scores[category]} matches)\n")
        examples = top_examples.get(category, [])
        if examples:
            for i, (id_val, content, score) in enumerate(examples, 1):
                f.write(f"  [{i}] ID: {id_val} | Score: {score}\n")
                f.write(f"      {content[:300].strip()}...\n")
        else:
            f.write("  No strong matches found.\n")
        f.write("\n")

print("✅ Extended behavior report saved to behavior_profile.txt")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sibaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sibaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Extended behavior report saved to behavior_profile.txt


In [39]:
import os
from pathlib import Path
from datetime import datetime

# Folder path
folder = Path("UserProfile")

# Files to combine
files = ["qualitative_persona.txt", "persona_profile.txt", "behavior_profile.txt"]
output_file = folder / "combined_user_profile.txt"

# Combine contents
with open(output_file, "w", encoding="utf-8") as out:
    out.write("=== COMBINED USER PROFILE ===\n")
    out.write(f"Generated: {datetime.utcnow().isoformat()}\n\n")
    
    for filename in files:
        file_path = folder / filename
        if file_path.exists():
            out.write(f"\n\n--- {filename.upper()} ---\n\n")
            content = file_path.read_text(encoding="utf-8")
            out.write(content.strip())
        else:
            out.write(f"\n\n--- {filename.upper()} NOT FOUND ---\n")

print(f"✅ Combined profile saved to: {output_file}")


✅ Combined profile saved to: UserProfile\combined_user_profile.txt


In [65]:
from agents import Reddit_JSON_Formatter_Agent, Reddit_Personality_agent
import textwrap

CHUNK_SIZE = 5500  # Adjust as per agent's token limits and safety margin

async def convert_persona_to_json(markdown_persona: str):
    print("[+] Splitting persona into manageable chunks...")

    # Split the input into chunks
    chunks = textwrap.wrap(markdown_persona, CHUNK_SIZE, break_long_words=False, break_on_hyphens=False)

    json_prompt_base = """
You will be given a user persona written in markdown with section headings such as Biodata, Motivations, Personality, etc.
In "AI_insigts", you explain how you got the corresponding attributes details and they must contain ids of those posts or comments.
Each AI_insights must be back by the post id's . If you are guessing some attributes , write how you guessed it in AI_insights if possible with ids.
You can add new attributes in personality , motivation.
Please double-check each word , each commas, each bracket ,each literals and characters before giving the final output as it gets difficult to debug in frontend
"""

    json_schema_header = """
#json
{
  "persona": {
    "reddit_username": "{username}",
    "name": null,
    "photo_url": "",
    "demographics": {
      "age": null,
      "occupation": null,
      "location": null,
      "marital_status": null,
      "tier": null,
      "archetype": null,
      "AI_insights": ""
    },
    "quote": "",
    "traits": {
      "content": [],
      "AI_insights": ""
    },
    "motivations": {
      "content": [],
      "AI_insights": ""
    },
    "personality": {
      "content": [],
      "AI_insights": ""
    },
    "behaviors_habits": {
      "content": [],
      "AI_insights": ""
    },
    "goals_needs": {
      "content": [],
      "AI_insights": ""
    },
    "pain_points": {
      "content": [],
      "AI_insights": ""
    },
    "tools_technology": {
      "content": [],
      "AI_insights": ""
    }
  }
}
"""

    final_output = ""

    # Process each chunk
    for idx, chunk in enumerate(chunks):
        print(f"[+] Processing chunk {idx + 1}/{len(chunks)}...")
        full_prompt = f"{json_prompt_base}\n\nSchema:\n{json_schema_header}\n\nChunk {idx + 1}:\n{chunk}\n\nReturn only JSON."
        response = await Reddit_Personality_agent.arun(full_prompt)
        final_output += response.content.strip() + "\n"
    json_prompt_base="""
Combine all these into a single json like structured below
In "AI_insigts", you explain how you got the corresponding attributes details and they must contain ids of those posts or comments.
Each AI_insights must be back by the post id's . If you are guessing some attributes , write how you guessed it in AI_insights if possible with ids.
You can add new attributes in personality , motivation.
Please double-check each word , each commas, each bracket ,each literals and characters before giving the final output as it gets difficult to debug in frontend"""
    json_schema_header = """
#json
{
  "persona": {
    "reddit_username": "{username}",
    "name": null,
    "photo_url": "",
    "demographics": {
      "age": null,
      "occupation": null,
      "location": null,
      "marital_status": null,
      "tier": null,
      "archetype": null,
      "AI_insights": ""
    },
    "quote": "",
    "traits": {
      "content": [],
      "AI_insights": ""
    },
    "motivations": {
      "content": [],
      "AI_insights": ""
    },
    "personality": {
      "content": [],
      "AI_insights": ""
    },
    "behaviors_habits": {
      "content": [],
      "AI_insights": ""
    },
    "goals_needs": {
      "content": [],
      "AI_insights": ""
    },
    "pain_points": {
      "content": [],
      "AI_insights": ""
    },
    "tools_technology": {
      "content": [],
      "AI_insights": ""
    }
  }
}
"""
    final_prompt= f"{json_prompt_base}\n\nSchema:\n{json_schema_header}\n\nReturn only a single JSON."
    final_persona=await Reddit_JSON_Formatter_Agent.arun(final_prompt)

    return final_persona


In [None]:
from pathlib import Path

# Path to the combined file
file_path = Path(f"UserProfile/combined_{username}_profile.txt")

# Read the entire content into a string
combined_text_str = file_path.read_text(encoding="utf-8")
persona_json=await convert_persona_to_json(combined_text_str)

[+] Splitting persona into manageable chunks...
[+] Processing chunk 1/5...
[+] Processing chunk 2/5...
[+] Processing chunk 3/5...
[+] Processing chunk 4/5...
[+] Processing chunk 5/5...


In [58]:
import re

raw_string = persona_json # Replace with your full string

# Step 1: Remove code block markers (```json or ``` at start/end)
cleaned = re.sub(r"```json|```", "", raw_string)

# Step 2: Remove all newline escape sequences
cleaned = cleaned.replace("\\n", "")

# Step 3: Remove all backslashes
cleaned = cleaned.replace("\\", "")

# Optional: strip leading/trailing whitespace
cleaned = cleaned.strip()

# Result is a flat, clean JSON string
print(cleaned)

{
  "persona": {
    "reddit_username": "unknown", // Extracted from markdown: "Name: Unknown"
    "name": null,
    "photo_url": "", // No photo or URL provided
    "demographics": {
      "age": null, // Extracted from markdown: "Age: Unknown"
      "occupation": "developer", // Extracted from markdown: "💼 Occupation: developer"
      "location": null, // Extracted from markdown: "Location: Unknown"
      "marital_status": null,
      "tier": null,
      "archetype": "ESTJ", // Extracted from markdown: "MBTI Estimate: ESTJ"
      "AI_insights": "Age was unknown, but inferred occupation as developer due to their activity on Reddit and interest in tech. Their personality archetype (ESTJ) suggests they may be assertive and organized."
    },
    "quote": "First of all you look good! But if you want to become more "conventionally attractive", here are some of my suggestions. Just personal opinions, take them or leave them.",
    "traits": {
      "content": [
        "OCEAN: Openness 0.0

In [None]:
with open(f"{username}_full_persona_2nd_approach.txt", "w", encoding="utf-8") as file:
    file.write(cleaned)
