In [None]:
import praw
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import re
from dotenv import load_dotenv

# Read the Reddit API credentials from the environment; load_dotenv() runs first
# so values kept in a local .env file are picked up as well.
load_dotenv()

REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT = (
    os.getenv(var_name)
    for var_name in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT")
)

# Shared PRAW client used by the scrape_* functions in this cell.
reddit = praw.Reddit(
    user_agent=REDDIT_USER_AGENT,
    client_secret=REDDIT_CLIENT_SECRET,
    client_id=REDDIT_CLIENT_ID,
)

def is_image(url):
    """Return True when ``url`` points at a file with a known image extension.

    Only the path component of the URL is inspected, and the comparison is
    case-insensitive, so ``https://host/pic.JPG`` and
    ``https://host/pic.png?width=640`` are both recognised.

    Args:
        url: The link to test. Non-string or empty values yield False.

    Returns:
        bool: True for ``.jpg``, ``.jpeg``, ``.png`` and ``.gif`` paths.
    """
    from urllib.parse import urlparse  # local import keeps the helper self-contained

    if not isinstance(url, str) or not url:
        return False
    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects a few malformed URLs (e.g. a broken IPv6 host).
        return False
    return path.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))

def download_image(url, save_dir='images'):
    """Download ``url`` into ``save_dir`` and return the local file path.

    The download is best-effort: any network, HTTP or filesystem failure is
    reported with a short message and ``None`` is returned instead of raising.

    Args:
        url: Direct link to the image.
        save_dir: Directory the file is written to; created when missing.

    Returns:
        str | None: Path of the saved file, or None when the download failed
        or no file name could be derived from the URL.
    """
    from urllib.parse import urlparse  # local import keeps the helper self-contained

    os.makedirs(save_dir, exist_ok=True)

    # Name the file after the URL *path* only, so a query string such as
    # "?width=640" never ends up inside the file name.
    try:
        name = os.path.basename(urlparse(url).path)
    except ValueError:
        return None
    if not name:
        return None

    filename = os.path.join(save_dir, name)
    try:
        # The timeout stops a stalled server from hanging the whole scrape, and
        # the context manager releases the connection even on errors.
        with requests.get(url, stream=True, timeout=10) as r:
            # Without this an HTTP error page would be saved as if it were an image.
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
        return filename
    except (requests.RequestException, OSError) as exc:
        print(f"Could not download {url}: {exc}")
        # Do not leave a truncated file behind.
        try:
            os.remove(filename)
        except OSError:
            pass
        return None

def extract_post_data(post, data_type):
    """Flatten a Reddit item into a plain dict record.

    Args:
        post: Object exposing ``score``, ``url`` and ``subreddit``; ``title``,
            ``body`` and ``selftext`` are optional and default to ''.
        data_type: Label stored under the ``'type'`` key.

    Returns:
        dict: The record. ``'image_path'`` holds the result of
        ``download_image`` when ``is_image`` accepts the URL, otherwise None.
    """
    record = {'type': data_type}
    record['title'] = getattr(post, 'title', '')
    record['body'] = getattr(post, 'body', '')
    record['score'] = post.score
    record['url'] = post.url
    record['selftext'] = getattr(post, 'selftext', '')
    record['subreddit'] = str(post.subreddit)
    if is_image(post.url):
        record['image_path'] = download_image(post.url)
    else:
        record['image_path'] = None
    return record

def scrape_subreddit(subreddit_name, limit=10):
    """Collect the "hot" listing of a subreddit together with each post's comments.

    Args:
        subreddit_name: Name passed to ``reddit.subreddit``.
        limit: Maximum number of posts requested from the listing.

    Returns:
        tuple: ``(records, None)``. Each record is the ``extract_post_data``
        dict extended with ``'comments'``, ``'author'`` and ``'num_comments'``.
    """
    records = []
    hot_posts = reddit.subreddit(subreddit_name).hot(limit=limit)
    for submission in tqdm(hot_posts):
        record = extract_post_data(submission, 'subreddit_post')
        # Resolve the comment forest before flattening it into a list of bodies.
        submission.comments.replace_more(limit=0)
        record.update(
            comments=[reply.body for reply in submission.comments.list()],
            author=str(submission.author),
            num_comments=submission.num_comments,
        )
        records.append(record)
    return records, None

def scrape_user(username, limit=10):
    """Collect a redditor's newest submissions followed by their newest comments.

    Args:
        username: Account name passed to ``reddit.redditor``.
        limit: Maximum number of submissions and, separately, of comments.

    Returns:
        tuple: ``(records, username)``. Submission records come first (type
        ``'user_post'``), then comment records (type ``'user_comment'``).
    """
    redditor = reddit.redditor(username)
    records = []

    for submission in tqdm(redditor.submissions.new(limit=limit)):
        record = extract_post_data(submission, 'user_post')
        record['username'] = username
        records.append(record)

    records.extend(
        {
            'type': 'user_comment',
            'body': reply.body,
            'score': reply.score,
            'link_title': reply.link_title,
            'subreddit': str(reply.subreddit),
            'username': username,
        }
        for reply in tqdm(redditor.comments.new(limit=limit))
    )
    return records, username

def scrape_post(post_url):
    """Fetch a single submission by URL, including its flattened comment bodies.

    Args:
        post_url: Full URL of the submission, passed to ``reddit.submission``.

    Returns:
        tuple: ``(record, None)`` where ``record`` is a dict of type ``'post'``.
        ``'image_path'`` is the ``download_image`` result when the submission
        URL passes ``is_image``, otherwise None.
    """
    submission = reddit.submission(url=post_url)
    submission.comments.replace_more(limit=0)
    comment_bodies = [reply.body for reply in submission.comments.list()]

    record = {'type': 'post'}
    record['title'] = submission.title
    record['author'] = str(submission.author)
    record['selftext'] = submission.selftext
    record['url'] = submission.url
    record['score'] = submission.score
    record['subreddit'] = str(submission.subreddit)
    if is_image(submission.url):
        record['image_path'] = download_image(submission.url)
    else:
        record['image_path'] = None
    record['comments'] = comment_bodies
    return record, None

def scrape_reddit_from_url(url):
    """Dispatch a Reddit URL to the matching scraper.

    Supported shapes (any ``reddit.com`` subdomain, optional trailing slash):
      * ``/r/<subreddit>``                     -> ``scrape_subreddit``
      * ``/user/<name>`` or ``/u/<name>``,
        optionally followed by ``/comments``   -> ``scrape_user``
      * ``/r/<subreddit>/comments/<id>/...``   -> ``scrape_post``

    Subreddit and user names are taken from regex capture groups, so a
    trailing ``/comments`` segment or a query string never leaks into the name.

    Args:
        url: The URL entered by the user.

    Returns:
        tuple: ``(records, username)``. ``username`` is None except for user
        URLs. Unsupported URLs print a message and return ``([], None)``.
    """
    host = r'https?://(?:[\w-]+\.)?reddit\.com'
    tail = r'/?(?:[?#].*)?'  # optional trailing slash, then optional query/fragment

    subreddit_match = re.fullmatch(host + r'/r/([^/?#]+)' + tail, url)
    if subreddit_match:
        return scrape_subreddit(subreddit_match.group(1), limit=10)

    user_match = re.fullmatch(host + r'/(?:user|u)/([^/?#]+)(?:/comments)?' + tail, url)
    if user_match:
        return scrape_user(user_match.group(1), limit=10)

    if re.fullmatch(host + r'/r/[^/]+/comments/.+', url):
        post, _ = scrape_post(url)
        return [post], None

    print("Unsupported URL format")
    return [], None

# Ask for a target URL, scrape it, and persist the flattened records as CSV
# so the persona-generation cell can read them back.
reddit_url = input("Paste Reddit URL (user / subreddit / post): ").strip()
scrape_result = scrape_reddit_from_url(reddit_url)
scraped_data = scrape_result[0]
extracted_username = scrape_result[1]
df = pd.json_normalize(scraped_data)
df.to_csv(path_or_buf="scraped_reddit_data.csv", index=False)
print("Data saved to scraped_reddit_data.csv")


Paste Reddit URL (user / subreddit / post): https://www.reddit.com/user/spez/


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Data saved to scraped_reddit_data.csv





In [None]:
import pandas as pd
import requests
import json
import re
import textwrap
from PIL import Image, ImageDraw, ImageFont
import os

# Read the file the scraping cell writes, relative to the working directory.
# The previous literal "D:\reddit_..." was not a raw string, so "\r" was parsed
# as a carriage return and the path never pointed at the intended file.
CSV_PATH = "scraped_reddit_data.csv"

MODEL = "mistralai/mistral-7b-instruct:free"

# Imported here so this cell also works on a fresh kernel, without relying on
# the scraping cell having been executed first.
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv("API_KEY")

df = pd.read_csv(CSV_PATH)

def extract_username(df):
    """Infer the single Reddit account a scraped DataFrame refers to.

    The ``'author'`` column is checked first, then ``'username'``. A column
    qualifies only if it exists and holds exactly one distinct non-null value.

    Args:
        df: DataFrame of scraped records.

    Returns:
        The single distinct value of the first qualifying column, or the
        placeholder ``"Redditor"`` when no column qualifies.
    """
    present = (name for name in ('author', 'username') if name in df.columns)
    for name in present:
        distinct = df[name].dropna().unique()
        if len(distinct) == 1:
            return distinct[0]
    return "Redditor"

def _first_nonempty_column(df, candidates):
    """Return the non-null, non-blank values of the first candidate column that has any."""
    for col in candidates:
        if col not in df.columns:
            continue
        values = [v for v in df[col].dropna().tolist() if str(v).strip()]
        if values:
            return values
    return []


def prepare_prompt(df, forced_username=None):
    """Build the persona-generation prompt from scraped Reddit activity.

    Args:
        df: DataFrame of scraped records. All columns are optional; the ones
            used are ``title``/``post_title``, ``body``/``selftext``/
            ``comment_body``/``text``/``content``, ``subreddit`` and ``score``.
        forced_username: Name to use instead of inferring one from ``df``.

    Returns:
        tuple: ``(prompt, username)``.

    Notes:
        * Text columns are tried in order and the first one that actually
          contains data wins, so an empty ``body`` column no longer hides
          ``selftext``.
        * A missing, empty or all-NaN ``score`` column yields an average of 0
          instead of raising on ``int(nan)``.
    """
    titles = _first_nonempty_column(df, ['title', 'post_title'])
    comments = _first_nonempty_column(df, ['body', 'selftext', 'comment_body', 'text', 'content'])
    subreddits = df['subreddit'].dropna().unique().tolist() if 'subreddit' in df.columns else []

    avg_score = 0
    if 'score' in df.columns:
        # Coerce so stray non-numeric cells become NaN rather than raising.
        mean_score = pd.to_numeric(df['score'], errors='coerce').mean()
        if pd.notna(mean_score):
            avg_score = int(mean_score)

    username = forced_username if forced_username else extract_username(df)
    sample_activity = "\n".join([f"Title: {t}" for t in titles[:5]] + [f"Comment: {c}" for c in comments[:5]])
    # str() guards against subreddit names that pandas parsed as numbers.
    subreddit_str = ", ".join(str(name) for name in subreddits[:5])

    prompt = f"""
You are a senior user researcher and behavioral analyst working for a UX team. Based on a Reddit user's recent activity, generate a detailed persona.

### USER DATA
- Reddit Username: {username}
- Subreddits Participated: {subreddit_str}
- Average Upvote Score: {avg_score}
- Sample Activity:
{sample_activity}

### TASK
Generate a realistic persona including:

{{
  "name": "{username}",
  "age": "Estimated Age or Unknown",
  "location": "Location or Unknown",
  "motivations": ["..."],
  "frustrations": ["..."],
  "goals": ["..."],
  "interests": ["..."],
  "personality": {{
    "Introvert": 70,
    "Intuitive": 65,
    "Thinking": 85,
    "Perceiving": 60
  }},
  "tone": "...",
  "behavior": "..."
}}
Only return valid JSON.
"""
    return prompt, username

def call_mistral(prompt):
    """Send ``prompt`` to the OpenRouter chat-completions API and parse the JSON reply.

    Args:
        prompt: The user message sent to the model named by ``MODEL``.

    Returns:
        The Python object decoded from the first ``{ ... }`` span found in the
        model's reply.

    Raises:
        ValueError: The reply contains no JSON object, or it is not valid JSON.
        Exception: The API answered with a non-200 status, or with a body that
            lacks the expected ``choices[0].message.content`` field.
        requests.RequestException: Network failure or timeout.
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}]
    }
    # A timeout keeps a stalled request from blocking the notebook indefinitely.
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    if response.status_code != 200:
        raise Exception(f"API Error: {response.status_code}\n{response.text}")

    # A 200 response can still carry an error payload instead of "choices";
    # surface the body rather than an opaque KeyError.
    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        raise Exception(f"API Error: {response.status_code}\n{response.text}") from exc

    # Models often wrap the JSON in prose or code fences; keep the outermost braces.
    match = re.search(r'\{[\s\S]+\}', content or "")
    if not match:
        raise ValueError("Could not parse JSON.")
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError as exc:
        raise ValueError(f"Could not parse JSON. {exc}") from exc

def _persona_lines(value):
    """Normalise a persona field into a list of non-blank strings."""
    if value is None:
        return []
    # A bare string (or any scalar) is one entry, not a sequence of characters.
    if isinstance(value, str) or not isinstance(value, (list, tuple)):
        value = [value]
    return [str(item) for item in value if item is not None and str(item).strip()]


def _trait_percent(value, default=50):
    """Convert a trait score (int, float or text such as "70%") to an int in 0..100."""
    try:
        number = float(str(value).strip().rstrip("%"))
    except ValueError:
        return default
    if number != number:  # NaN never equals itself
        return default
    return int(max(0.0, min(100.0, number)))


def draw_full_persona_card(persona, username, filename="persona_card.png"):
    """Render the persona dict as a 1600x1200 PNG card and save it.

    The persona comes from a language model, so its fields are treated as
    untrusted: list fields may arrive as a single string, items may be
    non-strings, and personality scores may be floats, strings or out of range.

    Args:
        persona: Dict with optional keys ``name``, ``age``, ``location``,
            ``motivations``, ``frustrations``, ``goals``, ``interests``,
            ``personality``, ``tone`` and ``behavior``.
        username: Fallback shown when ``persona`` has no usable ``name``.
        filename: Output path of the image.

    Returns:
        None. The image is written to ``filename`` and a message is printed.
    """
    W, H = 1600, 1200
    img = Image.new("RGB", (W, H), "white")
    draw = ImageDraw.Draw(img)

    FONT_DIR = "fonts"
    try:
        title_font = ImageFont.truetype(f"{FONT_DIR}/Poppins-Bold.ttf", 40)
        section_font = ImageFont.truetype(f"{FONT_DIR}/Poppins-Bold.ttf", 28)
        text_font = ImageFont.truetype(f"{FONT_DIR}/Poppins-Regular.ttf", 18)
        small_font = ImageFont.truetype(f"{FONT_DIR}/Poppins-Regular.ttf", 15)
    except OSError:
        # The Poppins files are optional; fall back to Pillow's built-in font.
        title_font = ImageFont.load_default()
        section_font = title_font
        text_font = title_font
        small_font = title_font

    spacing = 35
    # Placeholder circle where an avatar would go.
    draw.ellipse((80, 80, 380, 380), fill="#f2f2f2", outline="#ccc")

    def field(key, fallback):
        """Return persona[key] as text, using ``fallback`` for missing/null/blank values."""
        value = persona.get(key)
        if value is None or not str(value).strip():
            return str(fallback)
        return str(value)

    name = field("name", username)
    age = field("age", "Unknown")
    location = field("location", "Unknown")

    draw.text((420, 100), name, font=title_font, fill="#ec6608")
    draw.text((420, 170), "AGE", font=section_font, fill="#ec6608")
    draw.text((520, 170), age, font=text_font, fill="black")
    draw.text((420, 220), "LOCATION", font=section_font, fill="#ec6608")
    draw.text((580, 220), location, font=text_font, fill="black")

    y_start = 400
    x_left = 80
    x_right = 850

    def draw_section(title, lines, x, y):
        """Draw a heading plus wrapped bullet lines; return the y for the next section."""
        draw.text((x, y), title.upper(), font=section_font, fill="#ec6608")
        y += spacing
        wrapper = textwrap.TextWrapper(width=50)
        for item in _persona_lines(lines):
            for line in wrapper.wrap(item):
                draw.text((x, y), f"- {line}", font=text_font, fill="black")
                y += spacing
        return y + spacing // 2

    def draw_personality(p, x, y):
        """Draw one horizontal percentage bar per trait; return the y for the next section."""
        draw.text((x, y), "PERSONALITY", font=section_font, fill="#ec6608")
        y += spacing
        traits = p if isinstance(p, dict) else {}
        for trait, value in traits.items():
            draw.text((x, y), str(trait).upper(), font=small_font, fill="black")
            bx, by = x + 160, y + 8
            bw = 300
            # Clamped to 0..100 so the filled bar can never be inverted or overflow
            # the track (recent Pillow versions reject rectangles with x1 < x0).
            val = _trait_percent(value)
            draw.rectangle([bx, by, bx + bw, by + 15], fill="#ddd")
            draw.rectangle([bx, by, bx + int(bw * val / 100), by + 15], fill="#ec6608")
            y += spacing
        return y + spacing // 2

    # Left column: list-style sections.
    y_left = y_start
    y_left = draw_section("Motivations", persona.get("motivations", []), x_left, y_left)
    y_left = draw_section("Frustrations", persona.get("frustrations", []), x_left, y_left)
    y_left = draw_section("Goals", persona.get("goals", []), x_left, y_left)
    y_left = draw_section("Interests", persona.get("interests", []), x_left, y_left)

    # Right column: personality bars followed by free-text sections.
    y_right = y_start
    y_right = draw_personality(persona.get("personality", {}), x_right, y_right)
    y_right = draw_section("Tone", [persona.get("tone", "")], x_right, y_right)
    y_right = draw_section("Behaviour", [persona.get("behavior", "")], x_right, y_right)

    img.save(filename)
    print(f"Persona card saved as {filename}")

# Run the pipeline: build the prompt, query the model, render the card.
prompt_and_name = prepare_prompt(df)
prompt = prompt_and_name[0]
username = prompt_and_name[1]
persona = call_mistral(prompt)
draw_full_persona_card(persona, username)


Persona card saved as persona_card.png


In [14]:
# Persist the persona as pretty-printed JSON in a file named after the user.
persona_json = json.dumps(persona, indent=2)
with open(f"{username}_persona.txt", "w") as persona_file:
    persona_file.write(persona_json)
print(f"Persona text saved as {username}_persona.txt")

Persona text saved as spez_persona.txt
