## Database

---


In [1]:
# PATHs

# The directory where the dune data is stored
ORI_DATA_PATH = './ori_data'

# The directory where the three databases are stored
DATABASE_PATH = './database'

# The directory where the scraped tweets data is stored
TWEET_PATH = './ori_data/tweets'


In [2]:
import time
import os
import json
import pandas as pd
import numpy as np
from datetime import date


In [3]:
today = str(date.today())

dune_bt_dict = json.load(
    open('{}/punkBought_{}.json'.format(ORI_DATA_PATH, '2022-05-08')))
dune_tf_dict = json.load(
    open('{}/punkTransfer_{}.json'.format(ORI_DATA_PATH, '2022-05-08')))


In [7]:
# read transaction database
tx_db = pd.read_csv('{}/tx_db.csv'.format(DATABASE_PATH), index_col=0)

# read cryptopunk database
punk_db = pd.read_csv('{}/punk_db.csv'.format(DATABASE_PATH), index_col=0)
punk_db['attributes'] = punk_db['attributes'].apply(eval)

# read trader database
trader_db = pd.read_csv('{}/trader_db.csv'.format(DATABASE_PATH), index_col=0)

### 2. CryptoPunk

- **punk_id**
- img_url
- type (Alien, Ape, Zombie, Female, Male)
- skin_tone
- attr_count
- attributes
- current_owner: trader_id
- avg_price


In [8]:

def get_avg_price(punk_id):
    tx_contains_punk_id = tx_db[tx_db['punk_id'] == punk_id]
    if tx_contains_punk_id.shape[0] == 0:
        return 0.0
    price_list = tx_contains_punk_id['eth_price'].tolist()
    avg_price = np.mean(price_list)
    return avg_price


def create_punk_db(CSV_PATH=ORI_DATA_PATH):
    punk_db = pd.read_csv('{}/{}'.format(CSV_PATH, 'punk_info.csv'))

    # rename columns
    punk_db.columns = ['punk_id', 'type', 'gender',
                       'skin_tone', 'attr_count', 'attributes']

    # strip type, gender, skin_tone
    punk_db['type'] = punk_db['type'].apply(lambda x: x.strip())
    punk_db['gender'] = punk_db['gender'].apply(lambda x: x.strip())
    punk_db['skin_tone'] = punk_db['skin_tone'].apply(lambda x: x.strip())
    punk_db['skin_tone'] = punk_db['skin_tone'].apply(
        lambda x: 'Non-human' if x == '' else x)
    
    # set skin_tone color
    skin_tones = ['Medium', 'Dark', 'Light', 'Albino', 'Non-human']
    colors = ['#DB9065', '#A4031F', '#F2A359', '#F2DC5D', '#8DFFCD']
    color_by_skin_tone = dict(zip(skin_tones, colors))
    
    punk_db['skin_tone_color'] = punk_db['skin_tone'].apply(lambda x: color_by_skin_tone[x])

    # make attributes as list
    punk_db['attributes'] = punk_db['attributes'].apply(
        lambda x: [i.strip() for i in x.split('/')])

    # get punk avg price
    punk_db['avg_price'] = punk_db['punk_id'].apply(get_avg_price)

    # set punk image url
    punk_db['img_url'] = punk_db['punk_id'].apply(
        lambda x: 'https://www.larvalabs.com/cryptopunks/cryptopunk{}.png'.format(x))

    # set punk_id as index
    punk_db.set_index('punk_id', inplace=True)

    return punk_db


punk_db = create_punk_db(ORI_DATA_PATH)
punk_db.to_csv('{}/punk_db.csv'.format(DATABASE_PATH), index=True)
print('Cryptopunk database saved to {}/punk_db.csv'.format(DATABASE_PATH))
punk_db


Cryptopunk database saved to ./database/punk_db.csv


Unnamed: 0_level_0,type,gender,skin_tone,attr_count,attributes,skin_tone_color,avg_price,img_url
punk_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Human,Female,Medium,3,"[Green Eye Shadow, Earring, Blonde Bob]",#DB9065,0.000000,https://www.larvalabs.com/cryptopunks/cryptopu...
1,Human,Male,Dark,2,"[Smile, Mohawk]",#A4031F,31.000000,https://www.larvalabs.com/cryptopunks/cryptopu...
2,Human,Female,Light,1,[Wild Hair],#F2A359,0.000000,https://www.larvalabs.com/cryptopunks/cryptopu...
3,Human,Male,Dark,3,"[Wild Hair, Nerd Glasses, Pipe]",#A4031F,0.000000,https://www.larvalabs.com/cryptopunks/cryptopu...
4,Human,Male,Medium,4,"[Big Shades, Wild Hair, Earring, Goat]",#DB9065,0.000000,https://www.larvalabs.com/cryptopunks/cryptopu...
...,...,...,...,...,...,...,...,...
9995,Human,Female,Albino,2,"[Purple Eye Shadow, Straight Hair Dark]",#F2DC5D,0.000000,https://www.larvalabs.com/cryptopunks/cryptopu...
9996,Human,Male,Light,4,"[Cigarette, Earring, Crazy Hair, Smile]",#F2A359,0.000000,https://www.larvalabs.com/cryptopunks/cryptopu...
9997,Zombie,Male,Non-human,2,"[Front Beard, Cap Forward]",#8DFFCD,99.990000,https://www.larvalabs.com/cryptopunks/cryptopu...
9998,Human,Female,Medium,3,"[Wild White Hair, Black Lipstick, Clown Eyes G...",#DB9065,54.333333,https://www.larvalabs.com/cryptopunks/cryptopu...


### 1. Transaction Database

- **tx_id**
- date_time
- from: trader_id
- yo: trader_id
- eth_price
- punk_id: punk_id


In [9]:
def index_trader(buyer, seller):
    trader = list(set(list(buyer.unique()) + list(seller.unique())))

    return {trader[i]: i for i in range(len(trader))}


In [10]:
def create_tx_db(dune_bt_dict, dune_tf_dict):
    temp_idx = 0
    # punkBought data frame
    bt_data = dict()
    for year in dune_bt_dict:
        data_list = dune_bt_dict[year]['data']['get_result_by_result_id']
        bt_data.update({temp_idx+i: data_list[i]['data']
                       for i in range(len(data_list))})
        temp_idx += len(data_list)

    # punkTransfer data frame
    tf_data = dict()
    for year in dune_tf_dict:
        data_list = dune_tf_dict[year]['data']['get_result_by_result_id']
        tf_data.update({temp_idx+i: data_list[i]['data']
                       for i in range(len(data_list))})
        temp_idx += len(data_list)

    # concatenate data frames
    tx_db = pd.concat([pd.DataFrame.from_dict(bt_data, orient='index'),
                       pd.DataFrame.from_dict(tf_data, orient='index')])

    # remove the wrong tx
    tx_db = tx_db[tx_db['eth_price'] < 10000]

    # set date_time
    tx_db['date_time'] = pd.to_datetime(tx_db['_date'] + ' ' + tx_db['_time'])

    # remove txs whose buyer is \x0000000000000000000000000000000000000000
    tx_db = tx_db[tx_db['buyer'] !=
                  '\\x0000000000000000000000000000000000000000']

    # reindex seller and buyer using trader_id
    trader_index_dict = index_trader(tx_db['buyer'], tx_db['seller'])
    tx_db['from'] = tx_db['seller'].apply(lambda x: trader_index_dict[x])
    tx_db['to'] = tx_db['buyer'].apply(lambda x: trader_index_dict[x])

    # sort by date_time
    tx_db.sort_values(by='date_time', inplace=True)

    # index transaction
    tx_db['tx_id'] = range(len(tx_db))
    tx_db = tx_db.loc[:, ['tx_id', 'date_time',
                          'from', 'to', 'eth_price', 'punk_id']]
    tx_db.set_index('tx_id', inplace=True)

    tx_db['punk_skin_tone'] = tx_db['punk_id'].apply(lambda x: punk_db.loc[x, 'skin_tone'])
    tx_db['punk_skin_tone_color'] = tx_db['punk_id'].apply(lambda x: punk_db.loc[x, 'skin_tone_color'])
    
    print('Total {} transactions'.format(len(tx_db)))
    print('Total {} unique traders'.format(len(trader_index_dict)))
    print('Total {} unique punk_id'.format(len(tx_db['punk_id'].unique())))

    return tx_db, trader_index_dict


tx_db, trader_index_dict = create_tx_db(dune_bt_dict, dune_tf_dict)
tx_db.to_csv('{}/tx_db.csv'.format(DATABASE_PATH), index=True)
print('Transaction database saved to {}/tx_db.csv'.format(DATABASE_PATH))
tx_db


Total 16824 transactions
Total 5912 unique traders
Total 6157 unique punk_id
Transaction database saved to ./database/tx_db.csv


Unnamed: 0_level_0,date_time,from,to,eth_price,punk_id,punk_skin_tone,punk_skin_tone_color
tx_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2017-06-23 21:06:32,3702,4940,0.01,3134,Medium,#DB9065
1,2017-06-23 21:06:45,3702,4476,0.04,5719,Medium,#DB9065
2,2017-06-23 21:06:53,4940,3568,0.10,5056,Albino,#F2DC5D
3,2017-06-23 22:06:08,3702,4476,0.06,5624,Light,#F2A359
4,2017-06-23 22:06:12,3702,4476,0.03,6548,Albino,#F2DC5D
...,...,...,...,...,...,...,...
16819,2022-05-05 21:05:57,3032,3330,70.00,3314,Medium,#DB9065
16820,2022-05-06 14:05:57,2745,1449,0.00,3095,Medium,#DB9065
16821,2022-05-06 17:05:48,2049,3093,58.00,6995,Light,#F2A359
16822,2022-05-07 08:05:43,438,1852,65.95,3941,Light,#F2A359


#### TX database for each year

In [11]:
tx_db_per_year = {}

years = ['2017', '2018', '2019', '2020', '2021', '2022']

for year in years:
    this_year = '{}-01-01'.format(year)
    next_year = str(int(year) + 1)
    next_year = '{}-01-01'.format(next_year)
    tx_db_per_year[year] = tx_db[tx_db['date_time'].apply(lambda x: this_year<=str(x)<next_year)]

    tx_db_per_year[year].to_csv('{}/tx_db_{}.csv'.format(DATABASE_PATH, year), index=True)
    print('Transaction database for {} saved to {}/tx_db_{}.csv'.format(year, DATABASE_PATH, year))

Transaction database for 2017 saved to ./database/tx_db_2017.csv
Transaction database for 2018 saved to ./database/tx_db_2018.csv
Transaction database for 2019 saved to ./database/tx_db_2019.csv
Transaction database for 2020 saved to ./database/tx_db_2020.csv
Transaction database for 2021 saved to ./database/tx_db_2021.csv
Transaction database for 2022 saved to ./database/tx_db_2022.csv


### 3. Trader

- **trader_id**
- address
- tx_involved_count
- tx_involved: list of tx_id
- frequent_skin_tone
- frequent_gender


In [12]:

def find_most_frequent_skin_tone(tx_involved_list, tx_db):
    skin_tones = list()
    for tx_id in tx_involved_list:
        punk_id = tx_db.loc[tx_id, 'punk_id']
        skin_tones.append(punk_db.loc[punk_id, 'skin_tone'])
    return max(set(skin_tones), key=skin_tones.count)


def find_most_frequent_gender(tx_involved_list, tx_db):
    genders = list()
    for tx_id in tx_involved_list:
        punk_id = tx_db.loc[tx_id, 'punk_id']
        genders.append(punk_db.loc[punk_id, 'gender'])
    return max(set(genders), key=genders.count)


def create_trader_db(tx_db, trader_index_dict):
    trader_db = pd.DataFrame.from_dict(
        trader_index_dict, orient='index', columns=['trader_id'])

    # set address for each trader
    trader_db.reset_index(inplace=True)
    trader_db.rename(columns={'index': 'address'}, inplace=True)

    # set trader_id as index
    trader_db.sort_values(by='trader_id', inplace=True)

    # get the involved tx
    # trader_db['tx_involved_count'] = trader_db['trader_id'].apply(
    #     lambda x: len(tx_db[tx_db['from'] == x]) + len(tx_db[tx_db['to'] == x]))
    trader_db['tx_involved'] = trader_db['trader_id'].apply(lambda x: sorted(list(tx_db[tx_db['from'] == x].index) +
                                                            list(tx_db[tx_db['to'] == x].index)))

    # # get the skin tone that each trader most frequently trades with
    # trader_db['frequent_skin_tone'] = trader_db['tx_involved'].apply(
    #     lambda x: find_most_frequent_skin_tone(x, tx_db))

    # # get the gender that each trader most frequently trades with
    # trader_db['frequent_gender'] = trader_db['tx_involved'].apply(
    #     lambda x: find_most_frequent_gender(x, tx_db))

    # set trader_id as index
    trader_db.set_index('trader_id', inplace=True)

    return trader_db


In [13]:
trader_db = create_trader_db(tx_db, trader_index_dict)
trader_db.to_csv('{}/trader_db.csv'.format(DATABASE_PATH), index=True)
print('Trader database saved to {}/trader_db.csv'.format(DATABASE_PATH))
trader_db

Trader database saved to ./database/trader_db.csv


Unnamed: 0_level_0,address,tx_involved
trader_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,\xad35308a98a8e45a277308326f6f2152fb25193d,[14652]
1,\x7fccd576b808736f84eabeb03b49515251049435,[9630]
2,\xcc778603d10bdb4865ff6589b4f48ce8f9887744,[13810]
3,\x1e5fe56aa8f66909fc7f2239749897e171380b65,[14693]
4,\x77d550883410f4d1d88c2bf79132f375cfed31ef,[9882]
...,...,...
5907,\x0545043519f288dc0d8b547259872b166ca77165,[13484]
5908,\xeed4ab63daec420797fb407c4fb762be1a8ec580,"[7575, 7710, 12948, 14135]"
5909,\x69c488bcda156379b6661f08a35db627e5d467dd,[16040]
5910,\x19bafb19c71ace7dd16843f4c11f81faa6fbf62e,"[5184, 13132]"


In [14]:
def get_trader_db_by_year(trader_db, year, tx_db_per_year):
    tx_in_year = list(tx_db_per_year[year].index)
    trader_db_in_year = trader_db[trader_db['tx_involved'].apply(lambda x: len(set(x) & set(tx_in_year)) > 0)]
    trader_db_in_year['tx_involved'] = trader_db_in_year['tx_involved'].apply(lambda x: list(set(x) & set(tx_in_year)))
    return trader_db_in_year

In [15]:
for year in years:
    trader_db_year = get_trader_db_by_year(trader_db, year, tx_db_per_year)
    trader_db_year.to_csv('{}/trader_db_{}.csv'.format(DATABASE_PATH, year), index=True)
    print('Trader database for {} saved to {}/trader_db_{}.csv'.format(year, DATABASE_PATH, year))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trader_db_in_year['tx_involved'] = trader_db_in_year['tx_involved'].apply(lambda x: list(set(x) & set(tx_in_year)))


Trader database for 2017 saved to ./database/trader_db_2017.csv
Trader database for 2018 saved to ./database/trader_db_2018.csv
Trader database for 2019 saved to ./database/trader_db_2019.csv
Trader database for 2020 saved to ./database/trader_db_2020.csv
Trader database for 2021 saved to ./database/trader_db_2021.csv
Trader database for 2022 saved to ./database/trader_db_2022.csv


### Tweets

In [22]:
csv_list = list()
csv_counter = 0
for root, dirs, files in os.walk(TWEET_PATH):
    for file in files:
        if file.endswith(".csv"):
            temp_tweets_df = pd.read_csv(
                '{}/{}'.format(TWEET_PATH, file), lineterminator='\n')
            csv_list.append(temp_tweets_df)
            csv_counter += 1
            print('{} files read: {} ===== {} tweets'.format(
                csv_counter, file, len(temp_tweets_df)))

tweets_df = pd.concat(csv_list, axis=0, ignore_index=True)
tweets_df.drop(['Unnamed: 0'], axis=1, inplace=True)
tweets_df.sort_values(by=['date'], inplace=True)
tweets_df['id'] = tweets_df['id'].apply(lambda x: int(x))

tweets_df.drop(['retweetedTweet', 'mentionedUsers'], axis=1, inplace=True)

tweets_df = tweets_df.loc[:, ['id', 'date', 'content', 'url', 'username']]

tweets_df.to_csv('{}/tweets_db.csv'.format(DATABASE_PATH), index=False)

print('Tweets database saved to {}/tweets_db.csv'.format(DATABASE_PATH))
print('Total tweets: {}'.format(len(tweets_df)))

tweets_df.head()


1 files read: contain_nft_transparency.csv ===== 8794 tweets
2 files read: contain_cryptopunk_gender.csv ===== 34 tweets
3 files read: contain_cryptopunk_accountability.csv ===== 3 tweets
4 files read: contain_nft_skin_tone.csv ===== 198 tweets
5 files read: contain_cryptopunk_fairness.csv ===== 1 tweets
6 files read: contain_nft_gender.csv ===== 6456 tweets
7 files read: contain_cryptopunk_transparency.csv ===== 14 tweets
8 files read: contain_cryptopunk_skin_color.csv ===== 7 tweets
9 files read: contain_cryptopunk_ethnicity.csv ===== 0 tweets
10 files read: contain_nft_skin_color.csv ===== 389 tweets
11 files read: contain_nft_informed_consent.csv ===== 6 tweets
12 files read: contain_cryptopunk_skin_tone.csv ===== 5 tweets
13 files read: contain_nft_trust.csv ===== 54405 tweets
14 files read: contain_cryptopunk_ethic.csv ===== 3 tweets
15 files read: contain_nft_accountability.csv ===== 1561 tweets
16 files read: contain_nft_fairness.csv ===== 1706 tweets
17 files read: contain_nft

Unnamed: 0,id,date,content,url,username
70311,1254597572,2009-02-26 18:08:50+00:00,@wkriesel Thanks for #education RT--love you...,https://twitter.com/MayaFrost_NFT/status/12545...,MayaFrost_NFT
70310,1389182849,2009-03-25 17:31:52+00:00,A World Apart: Lessons Learned from Successfu...,https://twitter.com/MayaFrost_NFT/status/13891...,MayaFrost_NFT
81531,13964319067,2010-05-14 08:14:27+00:00,EU privacy watchdogs say Facebook changes 'una...,https://twitter.com/MichaelM_NFT/status/139643...,MichaelM_NFT
81530,14245622932,2010-05-18 19:28:31+00:00,New apps restore Facebook privacy settings - h...,https://twitter.com/MichaelM_NFT/status/142456...,MichaelM_NFT
81529,14245676874,2010-05-18 19:29:44+00:00,"MySpace's New Privacy Pitch: Too Little, Too L...",https://twitter.com/MichaelM_NFT/status/142456...,MichaelM_NFT


## Data Engineering for Visualization

---

Visualization 1: `vis1.ipynb`

Visualization 2: `vis2.ipynb`

Visualization 3: `vis3.ipynb`

Visualization 4: `vis4.ipynb`

Visualization 5: `vis5.ipynb`
