In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the driver
driver = webdriver.Chrome(options=chrome_options)

try:
    # Navigate to the URL
    driver.get("https://bsky.app/profile/aoc.bsky.social/post/3limt2yh6sk2j")
    
    # Wait for content to load (adjust timeout as needed)
    time.sleep(5)  # Allow time for dynamic content to load
    
    # Find all text elements with their containing divs
    elements = driver.find_elements(By.XPATH, "//div[string-length(text()) > 0]")
    
    # Print text and div info for each element
    for element in elements:
        try:
            text = element.text.strip()
            if text:  # Only print if there's actual text
                print("Text:", text)
                print("Div class:", element.get_attribute("class"))
                print("-" * 50)
        except:
            continue

except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    # Close the browser
    driver.quit()


Text: Post
Div class: css-146c3p1 r-8akbws r-krxsd3 r-dnmrzs r-1udh08x r-1udbk01
--------------------------------------------------
Text: Alexandria Ocasio-Cortez
Div class: css-146c3p1 r-dnmrzs r-1udh08x r-1udbk01 r-3s2u2q r-1iln25a
--------------------------------------------------
Text: ‪@aoc.bsky.social‬
Div class: css-146c3p1 r-dnmrzs r-1udh08x r-1udbk01 r-3s2u2q r-1iln25a
--------------------------------------------------
Text: Follow
Div class: css-146c3p1
--------------------------------------------------
Text: There are moments in these deportation fights when I can just FEEL how pissed off they are that I’m Puerto Rican 😂 they want to threaten me with it so bad 😂
Div class: css-146c3p1 r-1xnzce8
--------------------------------------------------
Text: February 20, 2025 at 12:32 PM
Div class: css-146c3p1
--------------------------------------------------
Text: Everybody can reply
Div class: css-146c3p1
--------------------------------------------------
Text: 4.2K reposts
Div c

In [2]:
import json

# Set up Chrome options and initialize driver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode without opening browser window
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=chrome_options)

try:
    # Navigate to the URL and wait for content to load dynamically
    driver.get("https://bsky.app/profile/aoc.bsky.social/post/3limt2yh6sk2j")
    
    time.sleep(5)  # Allow time for dynamic content to load
    
    # Find elements with specific classes for posts and comments
    primary_posts = driver.find_elements(By.CLASS_NAME, "r-1xnzce8")
    potential_comments = driver.find_elements(By.CLASS_NAME, "css-146c3p1")

    # Initialize data dictionary
    data = {}
    primary_post = ""
    comments = []
    current_user = None

    # First find the main post
    for post in primary_posts:
        try:
            text = post.text.strip()
            if text:
                # Remove backslashes and following characters until space
                text = ' '.join(word.split('\\')[0] for word in text.split())
                primary_post = text
                break
        except:
            continue

    # Then collect comments
    seen_texts = set()  # To avoid duplicates
    for comment in potential_comments:
        try:
            text = comment.text.strip()
            # Skip if text contains bsky.social
            if "bsky.social" in text:
                continue
                
            # Skip empty texts, numeric values, metadata-like short texts, likes and reposts
            if (text and not text.replace(".", "").isdigit() and 
                len(text) > 10 and text not in seen_texts and
                "reply" not in text.lower() and
                not text.startswith("February") and
                not text.endswith("likes") and 
                not text.endswith("reposts")):
                
                # Remove backslashes and following characters until space
                text = ' '.join(word.split('\\')[0] for word in text.split())
                comments.append(text)
                seen_texts.add(text)
        except:
            continue

    # Create JSON structure  
    data = {
        "primary_post": primary_post,
        "comments": comments
    }

    # Add current date and time to data
    from datetime import datetime
    now = datetime.now()
    data["timestamp"] = now.isoformat()
    
    # Create filename with timestamp
    filename = f'bsky_data_{now.strftime("%Y%m%d_%H%M%S")}.json'
    
    # Write to JSON file
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    # Close the browser to free up system resources
    driver.quit()


In [3]:
import gradio as gr

def scrape_bsky(url):
    try:
        # Initialize webdriver with same options as working code
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox") 
        chrome_options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=chrome_options)
        
        # Get the page
        driver.get(url)
        time.sleep(5)  # Wait for page to load
        
        # Get primary post using same class as working code
        primary_posts = driver.find_elements(By.CLASS_NAME, "r-1xnzce8")
        primary_post = ""
        for post in primary_posts:
            try:
                text = post.text.strip()
                if text:
                    text = ' '.join(word.split('\\')[0] for word in text.split())
                    primary_post = text
                    break
            except:
                continue
        
        # Get comments using same class as working code
        comments = []
        seen_texts = set()
        potential_comments = driver.find_elements(By.CLASS_NAME, "css-146c3p1")
        
        for comment in potential_comments:
            try:
                text = comment.text.strip()
                if "bsky.social" in text:
                    continue
                    
                if (text and not text.replace(".", "").isdigit() and 
                    len(text) > 10 and text not in seen_texts and
                    "reply" not in text.lower() and
                    not text.startswith("February") and
                    not text.endswith("likes") and 
                    not text.endswith("reposts")):
                    
                    text = ' '.join(word.split('\\')[0] for word in text.split())
                    comments.append(text)
                    seen_texts.add(text)
            except:
                continue

        # Create JSON structure
        data = {
            "primary_post": primary_post,
            "comments": comments,
            "timestamp": datetime.now().isoformat()
        }
        
        # Format output string
        output = f"Primary Post:\n{primary_post}\n\nFirst 3 Comments:\n"
        for i, comment in enumerate(comments[1:4], 1):
            output += f"{i}. {comment}\n"
            
        return output, data
        
    except Exception as e:
        return f"An error occurred: {str(e)}", None
    
    finally:
        driver.quit()

# Create Gradio interface
iface = gr.Interface(
    fn=scrape_bsky,
    inputs=gr.Textbox(label="Enter Bluesky Post URL"),
    outputs=[
        gr.Textbox(label="Output Text"),
        gr.JSON(label="Full JSON Data")
    ],
    title="Bluesky Post Scraper",
    description="Enter a Bluesky post URL to scrape the post and its comments"
)

iface.launch()


  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




In [4]:
%pip install diffusers transformers
import torch
from diffusers import DiffusionPipeline

torch.device("mps")

pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16)
pipe = pipe.to("mps")

prompt = "Extract the sentiment from the following social media post, and visualize that sentiment as pixel art: There are moments in these deportation fights when I can just FEEL how pissed off they are that I'm Puerto Rican 😂 they want to threaten me with it so bad 😂"
image = pipe(prompt).images[0]


Note: you may need to restart the kernel to use updated packages.


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


Loading pipeline components...: 100%|██████████| 6/6 [00:05<00:00,  1.02it/s]
100%|██████████| 50/50 [01:21<00:00,  1.64s/it]


In [6]:
import gradio as gr

def scrape_bsky(url):
    try:
        # Initialize webdriver with same options as working code
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox") 
        chrome_options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=chrome_options)
        
        # Get the page
        driver.get(url)
        time.sleep(5)  # Wait for page to load
        
        # Get primary post using same class as working code
        primary_posts = driver.find_elements(By.CLASS_NAME, "r-1xnzce8")
        primary_post = ""
        for post in primary_posts:
            try:
                text = post.text.strip()
                if text:
                    text = ' '.join(word.split('\\')[0] for word in text.split())
                    primary_post = text
                    break
            except:
                continue
        
        # Get comments using same class as working code
        comments = []
        seen_texts = set()
        potential_comments = driver.find_elements(By.CLASS_NAME, "css-146c3p1")
        
        for comment in potential_comments:
            try:
                text = comment.text.strip()
                if "bsky.social" in text:
                    continue
                    
                if (text and not text.replace(".", "").isdigit() and 
                    len(text) > 10 and text not in seen_texts and
                    "reply" not in text.lower() and
                    not text.startswith("February") and
                    not text.endswith("likes") and 
                    not text.endswith("reposts")):
                    
                    text = ' '.join(word.split('\\')[0] for word in text.split())
                    comments.append(text)
                    seen_texts.add(text)
            except:
                continue

        # Create JSON structure
        data = {
            "primary_post": primary_post,
            "comments": comments,
            "timestamp": datetime.now().isoformat()
        }
        
        # Format output string
        output = f"Primary Post:\n{primary_post}\n\nFirst 3 Comments:\n"
        for i, comment in enumerate(comments[1:4], 1):
            output += f"{i}. {comment}\n"
            
        return output, data
        
    except Exception as e:
        return f"An error occurred: {str(e)}", None
    
    finally:
        driver.quit()

# Use the primary post as prompt for stable diffusion
try:
    prompt = f"Extract the sentiment from the following social media post, and visualize that sentiment as pixel art: {primary_post}"
    image = pipe(prompt).images[0]
    return output, data, image
except Exception as e:
    print(f"Error generating image: {str(e)}")
    return output, data, None

# Create Gradio interface
iface = gr.Interface(
    fn=scrape_bsky,
    inputs=gr.Textbox(label="Enter Bluesky Post URL"),
    outputs=[
        gr.Textbox(label="Output Text"),
        gr.JSON(label="Full JSON Data")
    ],
    title="Bluesky Post Scraper",
    description="Enter a Bluesky post URL to scrape the post and its comments"
)

iface.launch()


SyntaxError: 'return' outside function (3021323447.py, line 77)