In [None]:

from users import *
import polars as pl

warpcast_hub_key = os.getenv("WARPCAST_HUB_KEY")

users = get_all_users_from_warpcast(warpcast_hub_key)


In [None]:
warpcast_data = [extract_warpcast_user_data(u) for u in users]

# Define the column names and data types for the DataFrame
users_cols = {
    'fid': pl.UInt32,
    'username': pl.Utf8,
    'display_name': pl.Utf8,
    'verified': pl.Boolean,
    'pfp_url': pl.Utf8,
    'follower_count': pl.UInt32,
    'following_count': pl.UInt32,
    'bio_text': pl.Utf8,
    'location_place_id': pl.Utf8,
}


def extract_warpcast_user_data(user):
    return {
        'fid': user['fid'],
        'username': user['username'],
        'display_name': user['displayName'],
        'verified': user['pfp']['verified'] if 'pfp' in user else False,
        'pfp_url': user['pfp']['url'] if 'pfp' in user else '',
        'follower_count': user['followerCount'],
        'following_count': user['followingCount'],
        'bio_text': user['profile']['bio']['text'] if 'bio' in user['profile'] else None,
        'location_place_id': user['profile']['location']['placeId'] if 'location' in user['profile'] else None
    }


users_df = pl.DataFrame(warpcast_data, schema=users_cols)
users_df.write_parquet('users.parquet')


In [None]:
locations_col = {
    'place_id': pl.Utf8,
    'description': pl.Utf8,
}


def get_warpcast_location(user) -> Optional[dict]:
    if 'location' in user['profile']:
        place_id = user['profile']['location'].get('placeId')
        if place_id:
            description = user['profile']['location'].get('description')
            return {'place_id': place_id, 'description': description}
    return None


locations = [get_warpcast_location(u) for u in users]
print(locations)

locations = [l for l in locations if l is not None]

locations_df = pl.DataFrame(locations, schema=locations_col)
locations_df.write_parquet('locations.parquet')


In [18]:
import duckdb
# get users where external_address is not null
df = duckdb.query('''
SELECT * from 'users.parquet' WHERE location_place_id IS NOT NULL;
''').pl()

print(df)


shape: (10607, 9)
┌───────┬───────────┬────────────┬──────────┬───┬────────────┬────────────┬───────────┬────────────┐
│ fid   ┆ username  ┆ display_na ┆ verified ┆ … ┆ follower_c ┆ following_ ┆ bio_text  ┆ location_p │
│ ---   ┆ ---       ┆ me         ┆ ---      ┆   ┆ ount       ┆ count      ┆ ---       ┆ lace_id    │
│ u32   ┆ str       ┆ ---        ┆ bool     ┆   ┆ ---        ┆ ---        ┆ str       ┆ ---        │
│       ┆           ┆ str        ┆          ┆   ┆ u32        ┆ u32        ┆           ┆ str        │
╞═══════╪═══════════╪════════════╪══════════╪═══╪════════════╪════════════╪═══════════╪════════════╡
│ 11066 ┆ mbn       ┆ Michael    ┆ false    ┆ … ┆ 1          ┆ 43         ┆ Constantl ┆            │
│       ┆           ┆            ┆          ┆   ┆            ┆            ┆ y curious ┆            │
│ 11065 ┆ beautiful ┆ José       ┆ false    ┆ … ┆ 1          ┆ 50         ┆ beautiful ┆ ChIJwVPhxK │
│       ┆ brain     ┆ Miguel     ┆          ┆   ┆            ┆           

In [8]:

# connect to test.db sqlite, create session, then run "get all users" with session.Query
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from models import Base, Cast, EthTransaction
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import polars as pl


def extract_cast_data(c):
    return {
        'hash': c.hash,
        'thread_hash': c.thread_hash,
        'parent_hash': c.parent_hash,
        'text': c.text,
        'timestamp': c.timestamp,
        'author_fid': c.author_fid,
    }


engine = create_engine(os.getenv("PLANETSCALE_URL"))
with sessionmaker(bind=engine)() as session:
    # Define the batch size
    batch_size = 10000

    casts_col = {
        'hash': pl.Utf8,
        'thread_hash': pl.Utf8,
        'parent_hash': pl.Utf8,
        'text': pl.Utf8,
        'timestamp': pl.Int64,
        'author_fid': pl.Int64,
    }

    # Create an empty dataframe to hold the results
    casts_df = pl.DataFrame([], schema=casts_col)

    # Initialize the offset
    offset = 0

    while True:
        # Query the database for the next batch of records
        batch = session.query(Cast).offset(offset).limit(batch_size).all()

        batch_dict = [extract_cast_data(c) for c in batch]

        # Exit the loop if no more records are returned
        if len(batch) == 0:
            break

        # Append the batch of records to the dataframe
        casts_df = casts_df.extend(pl.DataFrame(batch_dict, schema=casts_col))

        # Increment the offset for the next batch
        offset += batch_size

        print(f'Processed {offset} records...')

    # Print the final dataframe
    print(casts_df)

    casts_df.write_parquet('casts.parquet')


Processed 10000 records...
Processed 20000 records...
Processed 30000 records...
Processed 40000 records...
Processed 50000 records...
Processed 60000 records...
Processed 70000 records...
Processed 80000 records...
Processed 90000 records...
Processed 100000 records...
Processed 110000 records...
Processed 120000 records...
Processed 130000 records...
Processed 140000 records...
Processed 150000 records...
Processed 160000 records...
Processed 170000 records...
Processed 180000 records...
Processed 190000 records...
Processed 200000 records...
Processed 210000 records...
Processed 220000 records...
Processed 230000 records...
Processed 240000 records...
Processed 250000 records...
Processed 260000 records...
Processed 270000 records...
Processed 280000 records...
Processed 290000 records...
Processed 300000 records...
Processed 310000 records...
Processed 320000 records...
Processed 330000 records...
Processed 340000 records...
Processed 350000 records...
Processed 360000 records...
P

In [19]:
casts_df = pl.read_parquet('casts.parquet')

# write duckdb query to check whether there's duplicate PK (cast hash)

import duckdb
print(duckdb.query('''
SELECT * FROM casts.parquet LIMIT 10;
''').fetchall())

# print(df)

[('0x00000b36420eea02fd4376ca498f6094d409b2da', '0x68b70fdc670a9195cdf82a14c428804fbced4db5', '0x68b70fdc670a9195cdf82a14c428804fbced4db5', 'Almost Famous?', 1658526131698, 528), ('0x00001732145ad29b69ed1618e4c8ca30ad4aeabd', '0xdd8be11ca5d988305693d0534f46cbe468c6a773', '', 'recast:farcaster://casts/0xf09b94460d519e2644f54e3027902dfcb5cbf53d8ca68dabcb152a1143be24d8', 1666722965031, 473), ('0x000024c7bd518bdcf77df364b36221fe33e919b2', '0x000024c7bd518bdcf77df364b36221fe33e919b2', '', '💎🤲', 1665794567820, 3542), ('0x000034a1ba77caa6d853b2370218535bc80df366', '0x01b1f2074e747c2ea450aaed39c4d01307f76776', '0x2db5f4bc66bc88e9cfd8416496cfc08fd8c252f3', 'Warpcast clients will only render the one image per cast at this time. However, other clients may handle it differently and render all.', 1677860563000, 302), ('0x0000521e3c2e35501e34e0f59cbbc9823d2cbda6', '0x5ec66df926511e22522baf6214b8ee615d2ced2e', '0x5ec66df926511e22522baf6214b8ee615d2ced2e', 'would be interested as well 👀', 165948463440