In [1]:
import requests
import pandas as pd
from datetime import datetime
import time

# 📚 Define keywords for coins
coin_keywords = {
    "bitcoin": ["bitcoin", "btc"],
    "ethereum": ["ethereum", "eth"],
    "solana": ["solana", "sol"],
    "cardano": ["cardano", "ada"],
    "dogecoin": ["dogecoin", "doge"],
    "ripple": ["ripple", "xrp"],
    "litecoin": ["litecoin", "ltc"],
    "polkadot": ["polkadot", "dot"],
    "chainlink": ["chainlink", "link"],
    "polygon": ["polygon", "matic"]
}

def detect_coin(title):
    title_lower = title.lower()
    detected_coins = []
    for coin, keywords in coin_keywords.items():
        for keyword in keywords:
            if keyword in title_lower:
                detected_coins.append(coin)
                break  # No need to check further if one keyword matched
    return detected_coins if detected_coins else ["unknown"]

def scrape_reddit(subreddit="CryptoCurrency", pages=20, delay=2):
    headers = {"User-Agent": "Mozilla/5.0"}
    after = None
    posts = []

    for page in range(pages):
        url = f"https://www.reddit.com/r/{subreddit}/.json"
        if after:
            url += f"?after={after}"

        print(f"📦 Fetching r/{subreddit} page {page + 1}")
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"❌ Failed at page {page + 1} | Status:", response.status_code)
            break

        data = response.json()
        children = data["data"]["children"]
        after = data["data"].get("after", None)

        for post in children:
            p = post["data"]
            title = p.get("title", "")
            coin_tags = detect_coin(title)
            posts.append({
                "subreddit": subreddit,
                "title": title,
                "author": p.get("author", ""),
                "created_utc": datetime.utcfromtimestamp(p.get("created_utc", 0)).strftime('%Y-%m-%d %H:%M:%S'),
                "score": p.get("score", 0),
                "num_comments": p.get("num_comments", 0),
                "url": p.get("url", ""),
                "detected_coins": ", ".join(coin_tags)  # Store detected coins
            })

        if not after:
            print(f"🚫 No more pages for r/{subreddit}")
            break

        time.sleep(delay)

    return posts

# 🔁 Combine from multiple subreddits
if __name__ == "__main__":
    all_subreddits = ["CryptoCurrency", "Bitcoin", "CryptoMarkets", "CryptoTechnology", "CryptoNews"]
    all_posts = []

    for sub in all_subreddits:
        posts = scrape_reddit(subreddit=sub, pages=20)
        all_posts.extend(posts)

    df = pd.DataFrame(all_posts)
    df.drop_duplicates(subset=["title", "url"], inplace=True)
    df.to_csv("reddit_crypto_bulk_tagged.csv", index=False)

    print(f"✅ Collected {len(df)} total posts across {len(all_subreddits)} subreddits.")
    print(df.head())


📦 Fetching r/CryptoCurrency page 1


  "created_utc": datetime.utcfromtimestamp(p.get("created_utc", 0)).strftime('%Y-%m-%d %H:%M:%S'),


📦 Fetching r/CryptoCurrency page 2
📦 Fetching r/CryptoCurrency page 3
📦 Fetching r/CryptoCurrency page 4
📦 Fetching r/CryptoCurrency page 5
📦 Fetching r/CryptoCurrency page 6
📦 Fetching r/CryptoCurrency page 7
📦 Fetching r/CryptoCurrency page 8
📦 Fetching r/CryptoCurrency page 9
📦 Fetching r/CryptoCurrency page 10
📦 Fetching r/CryptoCurrency page 11
📦 Fetching r/CryptoCurrency page 12
📦 Fetching r/CryptoCurrency page 13
📦 Fetching r/CryptoCurrency page 14
🚫 No more pages for r/CryptoCurrency
📦 Fetching r/Bitcoin page 1
📦 Fetching r/Bitcoin page 2
📦 Fetching r/Bitcoin page 3
📦 Fetching r/Bitcoin page 4
📦 Fetching r/Bitcoin page 5
📦 Fetching r/Bitcoin page 6
📦 Fetching r/Bitcoin page 7
📦 Fetching r/Bitcoin page 8
📦 Fetching r/Bitcoin page 9
📦 Fetching r/Bitcoin page 10
📦 Fetching r/Bitcoin page 11
📦 Fetching r/Bitcoin page 12
📦 Fetching r/Bitcoin page 13
📦 Fetching r/Bitcoin page 14
📦 Fetching r/Bitcoin page 15
📦 Fetching r/Bitcoin page 16
📦 Fetching r/Bitcoin page 17
📦 Fetching r/Bitcoi

In [2]:
import requests
import pandas as pd
from datetime import datetime
import time

# Define keyword-to-crypto mappings
CRYPTO_KEYWORDS = {
    "Bitcoin": ["bitcoin", "btc"],
    "Ethereum": ["ethereum", "eth"],
    "Solana": ["solana", "sol"],
    "Cardano": ["cardano", "ada"],
    "Ripple": ["ripple", "xrp"],
    "Dogecoin": ["dogecoin", "doge"],
    "Polkadot": ["polkadot", "dot"],
    "Chainlink": ["chainlink", "link"],
    "Litecoin": ["litecoin", "ltc"]
}

def detect_crypto(title):
    title_lower = title.lower()
    for crypto_name, keywords in CRYPTO_KEYWORDS.items():
        if any(keyword in title_lower for keyword in keywords):
            return crypto_name
    return "Unknown"

def scrape_reddit_bulk(subreddits, pages_per_sub=20, delay=2):
    headers = {"User-Agent": "Mozilla/5.0"}
    all_posts = []

    for subreddit in subreddits:
        after = None
        print(f"\n🚀 Starting r/{subreddit}...")
        for page in range(pages_per_sub):
            url = f"https://www.reddit.com/r/{subreddit}/.json"
            if after:
                url += f"?after={after}"

            print(f"📦 Fetching page {page + 1} of r/{subreddit}")
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                print(f"❌ Failed at page {page + 1} | Status Code:", response.status_code)
                break

            data = response.json()
            posts = data["data"]["children"]
            after = data["data"].get("after", None)

            for post in posts:
                p = post["data"]
                title = p.get("title", "")
                crypto_detected = detect_crypto(title)

                all_posts.append({
                    "subreddit": subreddit,
                    "title": title,
                    "author": p.get("author", ""),
                    "created_utc": datetime.utcfromtimestamp(p.get("created_utc", 0)).strftime('%Y-%m-%d %H:%M:%S'),
                    "score": p.get("score", 0),
                    "num_comments": p.get("num_comments", 0),
                    "url": p.get("url", ""),
                    "crypto_name": crypto_detected
                })

            if not after:
                print("🚫 No more pages.")
                break
            time.sleep(delay)

    return pd.DataFrame(all_posts)

# 🔁 Run the collector
if __name__ == "__main__":
    target_subreddits = [
        "CryptoCurrency",
        "Bitcoin",
        "CryptoMarkets",
        "CryptoTechnology",
        "CryptoNews"
    ]

    df = scrape_reddit_bulk(target_subreddits, pages_per_sub=20)
    df.drop_duplicates(subset=["title", "url"], inplace=True)
    df.to_csv("reddit_crypto_2000_posts.csv", index=False)

    print(f"\n✅ DONE: Collected {len(df)} Reddit posts total across {len(target_subreddits)} subs.")
    print(df.sample(5))



🚀 Starting r/CryptoCurrency...
📦 Fetching page 1 of r/CryptoCurrency


  "created_utc": datetime.utcfromtimestamp(p.get("created_utc", 0)).strftime('%Y-%m-%d %H:%M:%S'),


📦 Fetching page 2 of r/CryptoCurrency
📦 Fetching page 3 of r/CryptoCurrency
📦 Fetching page 4 of r/CryptoCurrency
📦 Fetching page 5 of r/CryptoCurrency
📦 Fetching page 6 of r/CryptoCurrency
📦 Fetching page 7 of r/CryptoCurrency
📦 Fetching page 8 of r/CryptoCurrency
📦 Fetching page 9 of r/CryptoCurrency
📦 Fetching page 10 of r/CryptoCurrency
📦 Fetching page 11 of r/CryptoCurrency
📦 Fetching page 12 of r/CryptoCurrency
📦 Fetching page 13 of r/CryptoCurrency
🚫 No more pages.

🚀 Starting r/Bitcoin...
📦 Fetching page 1 of r/Bitcoin
📦 Fetching page 2 of r/Bitcoin
📦 Fetching page 3 of r/Bitcoin
📦 Fetching page 4 of r/Bitcoin
📦 Fetching page 5 of r/Bitcoin
📦 Fetching page 6 of r/Bitcoin
📦 Fetching page 7 of r/Bitcoin
📦 Fetching page 8 of r/Bitcoin
📦 Fetching page 9 of r/Bitcoin
📦 Fetching page 10 of r/Bitcoin
📦 Fetching page 11 of r/Bitcoin
📦 Fetching page 12 of r/Bitcoin
📦 Fetching page 13 of r/Bitcoin
📦 Fetching page 14 of r/Bitcoin
📦 Fetching page 15 of r/Bitcoin
📦 Fetching page 16 of r/Bit

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

# Keyword mapping to detect crypto name from headline
CRYPTO_KEYWORDS = {
    "Bitcoin": ["bitcoin", "btc"],
    "Ethereum": ["ethereum", "eth"],
    "Solana": ["solana", "sol"],
    "Cardano": ["cardano", "ada"],
    "Ripple": ["ripple", "xrp"],
    "Dogecoin": ["dogecoin", "doge"],
    "Polkadot": ["polkadot", "dot"],
    "Chainlink": ["chainlink", "link"],
    "Litecoin": ["litecoin", "ltc"]
}

def detect_crypto(text):
    text = text.lower()
    for name, keywords in CRYPTO_KEYWORDS.items():
        if any(keyword in text for keyword in keywords):
            return name
    return "Unknown"

def scroll_to_bottom(driver, scrolls=100, delay=2):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for i in range(scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(delay)
        new_height = driver.execute_script("return document.body.scrollHeight")
        print(f"📜 Scroll {i+1}/{scrolls}")
        if new_height == last_height:
            print("🚫 Reached end of page.")
            break
        last_height = new_height

def scrape_cointelegraph_bitcoin_articles():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0")

    driver_path = "/opt/homebrew/bin/chromedriver"  # Adjust if needed
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    url = "https://cointelegraph.com/tags/bitcoin"
    print(f"🌐 Opening: {url}")
    driver.get(url)
    time.sleep(5)

    scroll_to_bottom(driver, scrolls=150, delay=2)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    links = soup.select("a.post-card-inline__title-link")

    articles = []
    for a in links:
        headline = a.text.strip()
        href = a.get("href", "")
        if headline and href:
            full_url = "https://cointelegraph.com" + href if href.startswith("/") else href
            crypto_detected = detect_crypto(headline)
            articles.append({
                "source": "CoinTelegraph",
                "tag": "bitcoin",
                "headline": headline,
                "url": full_url,
                "scraped_at": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
                "crypto_name": crypto_detected
            })

    driver.quit()
    return pd.DataFrame(articles)

# Run it
if __name__ == "__main__":
    df = scrape_cointelegraph_bitcoin_articles()
    df.drop_duplicates(subset="url", inplace=True)
    df.to_excel("cointelegraph_bitcoin_scroll_fixed.xlsx", index=False)

    print(f"\n✅ Scraped {len(df)} Bitcoin articles from CoinTelegraph")
    print(df.sample(min(5, len(df))))


🌐 Opening: https://cointelegraph.com/tags/bitcoin
📜 Scroll 1/150
📜 Scroll 2/150
📜 Scroll 3/150
📜 Scroll 4/150
📜 Scroll 5/150
📜 Scroll 6/150
📜 Scroll 7/150
📜 Scroll 8/150
📜 Scroll 9/150
📜 Scroll 10/150
📜 Scroll 11/150
📜 Scroll 12/150
📜 Scroll 13/150
📜 Scroll 14/150
📜 Scroll 15/150
📜 Scroll 16/150
📜 Scroll 17/150
📜 Scroll 18/150
📜 Scroll 19/150
📜 Scroll 20/150
📜 Scroll 21/150
📜 Scroll 22/150
📜 Scroll 23/150
📜 Scroll 24/150
📜 Scroll 25/150
📜 Scroll 26/150
📜 Scroll 27/150
📜 Scroll 28/150
📜 Scroll 29/150
📜 Scroll 30/150
🚫 Reached end of page.


  "scraped_at": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),



✅ Scraped 450 Bitcoin articles from CoinTelegraph
            source      tag  \
381  CoinTelegraph  bitcoin   
379  CoinTelegraph  bitcoin   
357  CoinTelegraph  bitcoin   
46   CoinTelegraph  bitcoin   
440  CoinTelegraph  bitcoin   

                                              headline  \
381  Bitcoin breaks $86K as US tariff 'Liberation D...   
379  Price analysis 4/2: BTC, ETH, XRP, BNB, SOL, D...   
357  Malta regulator fines OKX crypto exchange $1.2...   
46   Bitcoin ETFs on $3B ‘bender,’ log first full w...   
440  Bitcoin price drops 3% on hot US PCE data as a...   

                                                   url           scraped_at  \
381  https://cointelegraph.com/news/bitcoin-price-8...  2025-04-30 03:57:17   
379  https://cointelegraph.com/news/price-analysis-...  2025-04-30 03:57:17   
357  https://cointelegraph.com/news/malta-fines-okx...  2025-04-30 03:57:17   
46   https://cointelegraph.com/news/us-spot-bitcoin...  2025-04-30 03:57:17   
440  https://coint

In [4]:
import requests
import pandas as pd
from datetime import datetime
import time

# Keyword mapping to detect crypto names
CRYPTO_KEYWORDS = {
    "Bitcoin": ["bitcoin", "btc"],
    "Ethereum": ["ethereum", "eth"],
    "Solana": ["solana", "sol"],
    "Cardano": ["cardano", "ada"],
    "Ripple": ["ripple", "xrp"],
    "Dogecoin": ["dogecoin", "doge"],
    "Polkadot": ["polkadot", "dot"],
    "Chainlink": ["chainlink", "link"],
    "Litecoin": ["litecoin", "ltc"],
    "Binance Coin": ["binance", "bnb"]
}

def detect_crypto(title):
    """Detect crypto name based on title text."""
    title = title.lower()
    for crypto_name, keywords in CRYPTO_KEYWORDS.items():
        if any(keyword in title for keyword in keywords):
            return crypto_name
    return "Unknown"

def scrape_cryptoslate_wpapi(pages=10):
    base_url = "https://cryptoslate.com/wp-json/wp/v2/posts?page={}"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    all_articles = []

    for page in range(1, pages + 1):
        url = base_url.format(page)
        print(f"🔄 Fetching page {page}: {url}")
        resp = requests.get(url, headers=headers)

        if resp.status_code != 200:
            print(f"❌ Failed at page {page} | Status {resp.status_code}")
            break

        data = resp.json()
        if not data:
            print("🚫 No data returned.")
            break

        for item in data:
            title = item.get("title", {}).get("rendered", "")
            detected = detect_crypto(title)
            all_articles.append({
                "title": title,
                "link": item.get("link", ""),
                "date": item.get("date", ""),
                "scraped_at": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
                "crypto_name": detected
            })

        time.sleep(1)

    return pd.DataFrame(all_articles)

# ✅ Run it
if __name__ == "__main__":
    df = scrape_cryptoslate_wpapi(pages=25)
    df.to_excel("cryptoslate_articles_wpapi.xlsx", index=False)
    print(f"✅ Saved {len(df)} articles to cryptoslate_articles_wpapi.xlsx")
    print(df.sample(5))


🔄 Fetching page 1: https://cryptoslate.com/wp-json/wp/v2/posts?page=1


  "scraped_at": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),


🔄 Fetching page 2: https://cryptoslate.com/wp-json/wp/v2/posts?page=2
🔄 Fetching page 3: https://cryptoslate.com/wp-json/wp/v2/posts?page=3
🔄 Fetching page 4: https://cryptoslate.com/wp-json/wp/v2/posts?page=4
🔄 Fetching page 5: https://cryptoslate.com/wp-json/wp/v2/posts?page=5
🔄 Fetching page 6: https://cryptoslate.com/wp-json/wp/v2/posts?page=6
🔄 Fetching page 7: https://cryptoslate.com/wp-json/wp/v2/posts?page=7
🔄 Fetching page 8: https://cryptoslate.com/wp-json/wp/v2/posts?page=8
🔄 Fetching page 9: https://cryptoslate.com/wp-json/wp/v2/posts?page=9
🔄 Fetching page 10: https://cryptoslate.com/wp-json/wp/v2/posts?page=10
🔄 Fetching page 11: https://cryptoslate.com/wp-json/wp/v2/posts?page=11
🔄 Fetching page 12: https://cryptoslate.com/wp-json/wp/v2/posts?page=12
🔄 Fetching page 13: https://cryptoslate.com/wp-json/wp/v2/posts?page=13
🔄 Fetching page 14: https://cryptoslate.com/wp-json/wp/v2/posts?page=14
🔄 Fetching page 15: https://cryptoslate.com/wp-json/wp/v2/posts?page=15
🔄 Fetchi

In [5]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

cryptos = {
    "Bitcoin": "bitcoin",
    "Ethereum": "ethereum",
    "Solana": "solana",
    "Cardano": "cardano",
    "Ripple": "ripple",
    "Dogecoin": "dogecoin",
    "Polkadot": "polkadot",
    "Chainlink": "chainlink",
    "Litecoin": "litecoin",
    "Binance Coin": "binancecoin"
}

def get_price(slug):
    url = f"https://www.coingecko.com/en/coins/{slug}"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    price_tag = soup.select_one("span[data-target='price.price']")
    if not price_tag:
        price_tag = soup.select_one(".no-wrap")
    
    try:
        price = price_tag.text.strip().replace("$", "").replace(",", "")
        return float(price)
    except:
        return None

def scrape_prices():
    results = []
    for name, slug in cryptos.items():
        print(f"🔍 Scraping price for {name}...")
        price = get_price(slug)
        results.append({"crypto_name": name, "price_usd": price})
        time.sleep(2)  # to avoid rate-limiting
    return pd.DataFrame(results)

if __name__ == "__main__":
    df = scrape_prices()
    df.to_csv("crypto_prices.csv", index=False)
    print("✅ Prices saved to crypto_prices.csv")
    print(df)


🔍 Scraping price for Bitcoin...
🔍 Scraping price for Ethereum...
🔍 Scraping price for Solana...
🔍 Scraping price for Cardano...
🔍 Scraping price for Ripple...
🔍 Scraping price for Dogecoin...
🔍 Scraping price for Polkadot...
🔍 Scraping price for Chainlink...
🔍 Scraping price for Litecoin...
🔍 Scraping price for Binance Coin...
✅ Prices saved to crypto_prices.csv
    crypto_name price_usd
0       Bitcoin      None
1      Ethereum      None
2        Solana      None
3       Cardano      None
4        Ripple      None
5      Dogecoin      None
6      Polkadot      None
7     Chainlink      None
8      Litecoin      None
9  Binance Coin      None


In [10]:
#!/usr/bin/env python3
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import argparse
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# ─── CONFIGURE LOGGING ─────────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s"
)
logger = logging.getLogger("scraper")

# ─── UTILITIES ──────────────────────────────────────────────────────────────────
def convert_abbrev(value_str):
    """
    Convert strings with K/M/B/T suffix to float.
    E.g. '3.5K'->3500, '2.1M'->2100000, '3.077T'->3077000000000
    """
    multipliers = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}
    if not value_str:
        return None
    suffix = value_str[-1].upper()
    try:
        if suffix in multipliers:
            num = float(value_str[:-1].replace(",", ""))
            return num * multipliers[suffix]
        return float(value_str.replace(",", ""))
    except ValueError:
        return None

# ─── PARSERS ────────────────────────────────────────────────────────────────────
def parse_price(soup):
    """
    Grabs the first <span class="no-wrap">…</span>,
    which always contains the USD price (e.g. "$64,321.12").
    """
    tag = soup.select_one("span.no-wrap")
    if tag:
        match = re.search(r"\$\s*([\d,]+\.?\d*)", tag.text)
        if match:
            return float(match.group(1).replace(",", ""))
    return None

def parse_change_24h(soup):
    tag = soup.select_one('span[data-target="percent-change.percent"]')
    if tag:
        return float(tag.text.strip().replace("%", "").replace(",", ""))
    return None

def parse_market_cap(soup):
    tag = soup.select_one('div[data-target="metric-market-cap.number"], span[data-target="price.market_cap"]')
    if tag:
        text = tag.text.strip().replace("$", "")
        return convert_abbrev(text)
    return None

def parse_volume_24h(soup):
    tag = soup.select_one('div[data-target="metric-volume.number"], span[data-target="price.total_volume"]')
    if tag:
        text = tag.text.strip().replace("$", "")
        return convert_abbrev(text)
    return None

# ─── WORKER ─────────────────────────────────────────────────────────────────────
def get_coin_data(name, slug, session, retries=3):
    url = f"https://www.coingecko.com/en/coins/{slug}"
    backoff = 1
    for attempt in range(1, retries + 1):
        try:
            resp = session.get(url, timeout=10)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            return {
                "crypto_name":    name,
                "price_usd":      parse_price(soup),
                "change_24h":     parse_change_24h(soup),
                "market_cap_usd": parse_market_cap(soup),
                "volume_24h_usd": parse_volume_24h(soup),
            }
        except Exception as e:
            logger.warning(f"{name} (attempt {attempt}/{retries}) failed: {e}")
            time.sleep(backoff)
            backoff *= 2
    logger.error(f"{name} → giving up after {retries} attempts")
    return {
        "crypto_name":    name,
        "price_usd":      None,
        "change_24h":     None,
        "market_cap_usd": None,
        "volume_24h_usd": None,
    }

# ─── MAIN SCRAPER ───────────────────────────────────────────────────────────────
def scrape_all(cryptos, max_workers):
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as exe:
        futures = {
            exe.submit(get_coin_data, name, slug, session): name
            for name, slug in cryptos.items()
        }
        for fut in as_completed(futures):
            results.append(fut.result())
    return pd.DataFrame(results)

# ─── ENTRYPOINT ─────────────────────────────────────────────────────────────────
def main():
    parser = argparse.ArgumentParser(description="Scrape live crypto metrics from CoinGecko")
    parser.add_argument("--input-csv", help="CSV with columns crypto_name,slug")
    parser.add_argument("--workers",   type=int, default=5, help="Number of threads")
    parser.add_argument("--output",    default="crypto_prices.csv", help="Output filename (.csv or .json)")
    args, _ = parser.parse_known_args()  # ignore unexpected flags in Jupyter

    if args.input_csv:
        df = pd.read_csv(args.input_csv)
        cryptos = dict(zip(df["crypto_name"], df["slug"]))
    else:
        cryptos = {
            "Bitcoin":      "bitcoin",
            "Ethereum":     "ethereum",
            "Solana":       "solana",
            "Cardano":      "cardano",
            "Ripple":       "ripple",
            "Dogecoin":     "dogecoin",
            "Polkadot":     "polkadot",
            "Chainlink":    "chainlink",
            "Litecoin":     "litecoin",
            "Binance Coin": "binancecoin"
        }

    logger.info(f"Starting scrape of {len(cryptos)} coins with {args.workers} workers")
    df_out = scrape_all(cryptos, args.workers)

    if args.output.endswith(".json"):
        df_out.to_json(args.output, orient="records", lines=True)
    else:
        df_out.to_csv(args.output, index=False)

    logger.info(f"Done. Results written to {args.output}")

if __name__ == "__main__":
    main()


2025-04-29 23:45:39,525 INFO [scraper] Starting scrape of 10 coins with 5 workers
2025-04-29 23:45:41,161 INFO [scraper] Done. Results written to crypto_prices.csv


In [3]:
import requests
import pandas as pd

# ─── 1) Your coins and their CoinGecko slugs ─────────────────────────────────
COINS = {
    "Bitcoin":      "bitcoin",
    "Ethereum":     "ethereum",
    "Solana":       "solana",
    "Cardano":      "cardano",
    "Ripple":       "ripple",
    "Dogecoin":     "dogecoin",
    "Polkadot":     "polkadot",
    "Chainlink":    "chainlink",
    "Litecoin":     "litecoin",
    "Binance Coin": "binancecoin"
}

# ─── 2) Call the free Simple Price endpoint ────────────────────────────────────
url = "https://api.coingecko.com/api/v3/simple/price"
params = {
    "ids":           ",".join(COINS.values()),
    "vs_currencies": "usd",
    "include_24hr_change":  "true",
    "include_market_cap":   "true",
    "include_24hr_vol":     "true"
}

resp = requests.get(url, params=params)
resp.raise_for_status()
data = resp.json()

# ─── 3) Build a DataFrame from the JSON ────────────────────────────────────────
rows = []
for name, slug in COINS.items():
    info = data.get(slug, {})
    rows.append({
        "crypto_name":    name,
        "price_usd":      info.get("usd"),
        "change_24h":     info.get("usd_24h_change"),
        "market_cap_usd": info.get("usd_market_cap"),
        "volume_24h_usd": info.get("usd_24h_vol"),
    })

df = pd.DataFrame(rows)

# ─── 4) Save or display ─────────────────────────────────────────────────────────
print(df)
df.to_csv("crypto_prices.csv", index=False)


    crypto_name     price_usd  change_24h  market_cap_usd  volume_24h_usd
0       Bitcoin  95048.000000    0.129847    1.887727e+12    2.705251e+10
1      Ethereum   1809.890000    0.167181    2.185740e+11    1.346471e+10
2        Solana    148.690000    0.896299    7.700588e+10    3.343215e+09
3       Cardano      0.689970   -1.293744    2.486668e+10    6.299620e+08
4        Ripple      2.200000   -2.131066    1.287257e+11    2.762327e+09
5      Dogecoin      0.175171   -0.247776    2.611191e+10    9.856175e+08
6      Polkadot      4.100000   -0.849258    6.248930e+09    1.403477e+08
7     Chainlink     14.630000    0.050085    9.612856e+09    3.226683e+08
8      Litecoin     84.670000   -0.932678    6.419429e+09    3.343250e+08
9  Binance Coin    600.710000   -0.492257    8.765211e+10    7.060881e+08


In [19]:
import pandas as pd
# Your fixed paths:
reddit      = pd.read_csv("reddit_crypto_bulk_tagged.csv")
cointele    = pd.read_excel("cointelegraph_bitcoin_scroll_fixed.xlsx")
cryptoslate = pd.read_excel("cryptoslate_articles_wpapi.xlsx")
prices      = pd.read_csv("crypto_prices.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'reddit_crypto_bulk_tagged.csv'