## Data Cleaning for Visualization 1
---

In [46]:
import time
import json
import pandas as pd
import numpy as np
from plotly.colors import hex_to_rgb

In [6]:
# Directory layout used by this notebook (all relative paths)

# Where the dune data is stored
ORI_DATA_PATH = '../ori_data'

# Where the three databases (tx_db, punk_db, trader_db) are stored
DATABASE_PATH = '../database'

# Where the generated data of each visualization is written
VIS_DATA_PATH = '../../data'

# Where the scraped tweets are stored (a sub-directory of the dune data directory)
TWEET_PATH = f'{ORI_DATA_PATH}/tweets'

In [7]:
# read transaction database (the first CSV column is used as the row index)
tx_db = pd.read_csv('{}/tx_db.csv'.format(DATABASE_PATH), index_col=0)

# read cryptopunk database
punk_db = pd.read_csv('{}/punk_db.csv'.format(DATABASE_PATH), index_col=0)
# Each 'attributes' cell is evaluated as a Python expression to rebuild the
# object that was serialized into the CSV (presumably a list literal -- confirm
# against the code that writes punk_db.csv).
# NOTE(review): eval executes arbitrary code found in the file. If the cells
# are plain literals, ast.literal_eval would be the safe replacement.
punk_db['attributes'] = punk_db['attributes'].apply(eval)

# read trader database
trader_db = pd.read_csv('{}/trader_db.csv'.format(DATABASE_PATH), index_col=0)


### 1. Sankey Diagram

<img src="../imgs/vis1_example.png" alt="drawing" width="500">

Data format

```
{
  "node": {
    "name": [
      "Human",
      "Zombie",
      "Ape",
      "Alien",
      "Female",
      "Male",
      "Medium",
      "Dark",
      "Light",
      "Albino",
      "Non-human",
      "0 attributes",
      "1 attributes",
      "2 attributes",
      "3 attributes",
      "4 attributes",
      "5 attributes",
      "6 attributes",
      "7 attributes"
    ],
    "color": [
      "#FFCF00",
      "#00916E",
      "#EE6123",
      "#FDDBC4",
      "#FF5C5C",
      "#5295CB",
      "#DB9065",
      "#A4031F",
      "#F2A359",
      "#F2DC5D",
      "#8DFFCD",
      "#EEF2FC",
      "#B7E5F2",
      "#98DAEC",
      "#41BBDC",
      "#239CBE",
      "#4455DA",
      "#2232AA",
      "#1B1367"
    ],
    "dict": {
      "Human": "#FFCF00",
      "Zombie": "#00916E",
      "Ape": "#EE6123",
      "Alien": "#FDDBC4",
      "Female": "#FF5C5C",
      "Male": "#5295CB",
      "Medium": "#DB9065",
      "Dark": "#A4031F",
      "Light": "#F2A359",
      "Albino": "#F2DC5D",
      "Non-human": "#8DFFCD",
      "0 attributes": "#EEF2FC",
      "1 attributes": "#B7E5F2",
      "2 attributes": "#98DAEC",
      "3 attributes": "#41BBDC",
      "4 attributes": "#239CBE",
      "5 attributes": "#4455DA",
      "6 attributes": "#2232AA",
      "7 attributes": "#1B1367"
    }
  },
  "link": {
    "source": [
      0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
      6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
      10, 10, 10, 10, 10, 10, 10, 10
    ],
    "target": [
      4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, 11, 12, 13, 14,
      15, 16, 17, 18, 11, 12, 13, 14, 15, 16, 17, 18, 11, 12, 13, 14, 15, 16,
      17, 18, 11, 12, 13, 14, 15, 16, 17, 18, 11, 12, 13, 14, 15, 16, 17, 18
    ],
    "value": [
      3840, 6039, 0, 88, 0, 24, 0, 9, 1174, 1101, 1145, 420, 0, 1857, 1723,
      1861, 598, 121, 2, 87, 991, 1433, 458, 55, 4, 1, 2, 79, 996, 1299, 402,
      41, 5, 0, 2, 80, 1064, 1357, 447, 55, 1, 0, 2, 61, 450, 382, 108, 14, 1,
      0, 0, 26, 59, 30, 5, 1, 0, 0
    ],
    "color": [
      "rgba(255,92,92,0.5)",
      ...
    ]
  }
}

```

In [76]:
def get_node_dict(punk_db):
    """Build the Sankey node lists (names and colors) from the punk database.

    Nodes are emitted level by level: type, gender, skin tone, attribute
    count. Within the first three levels the names keep the order in which
    they first appear in ``punk_db``; attribute-count labels are sorted.

    Every color is looked up by node name, so the result does not depend on
    the row order of ``punk_db`` and also works when only a subset of the
    known categories is present.

    Side effect: a string column ``attr_count_str`` (e.g. ``'3 attributes'``)
    is added to ``punk_db`` in place. ``get_link_dict`` reads that column, so
    it must be called with the same DataFrame after this function.

    Args:
        punk_db: DataFrame with the columns ``type``, ``gender``,
            ``skin_tone`` and ``attr_count``.

    Returns:
        A tuple ``(node, node_name_by_level)`` where ``node`` is
        ``{'name': [...], 'color': [...]}`` (two parallel lists) and
        ``node_name_by_level`` maps each level name (``type``, ``gender``,
        ``skin_tone``, ``attr_count_str``) to the node names of that level.

    Raises:
        ValueError: if ``punk_db`` contains a category that has no color
            assigned below.
    """
    # Color of every known node, keyed by node name.
    color_by_name = {
        # type
        'Human': '#FFCF00',
        'Zombie': '#00916E',
        'Ape': '#EE6123',
        'Alien': '#FDDBC4',
        # gender
        'Female': '#FF5C5C',
        'Male': '#5295CB',
        # skin tone
        'Medium': '#DB9065',
        'Dark': '#A4031F',
        'Light': '#F2A359',
        'Albino': '#F2DC5D',
        'Non-human': '#8DFFCD',
        # attribute count
        '0 attributes': '#EEF2FC',
        '1 attributes': '#B7E5F2',
        '2 attributes': '#98DAEC',
        '3 attributes': '#41BBDC',
        '4 attributes': '#239CBE',
        '5 attributes': '#4455DA',
        '6 attributes': '#2232AA',
        '7 attributes': '#1B1367',
    }

    node_name_by_level = dict()

    # ['Human', 'Zombie', 'Ape', 'Alien']
    node_name_by_level['type'] = list(punk_db['type'].unique())

    # ['Female', 'Male']
    node_name_by_level['gender'] = list(punk_db['gender'].unique())

    # ['Medium', 'Dark', 'Light', 'Albino', 'Non-human']
    node_name_by_level['skin_tone'] = list(punk_db['skin_tone'].unique())

    # ['0 attributes', '1 attributes', ..., '7 attributes']
    # The column is stored on punk_db because get_link_dict filters on it.
    punk_db['attr_count_str'] = punk_db['attr_count'].apply(
        lambda x: f'{x} attributes')
    # Sorting the labels as strings matches numeric order for the
    # single-digit counts covered by the palette.
    node_name_by_level['attr_count_str'] = sorted(
        punk_db['attr_count_str'].unique())

    # Flatten in a fixed level order; a node's position in this list is its id.
    level_order = ['type', 'gender', 'skin_tone', 'attr_count_str']
    node_name = [name
                 for level in level_order
                 for name in node_name_by_level[level]]

    unknown = [name for name in node_name if name not in color_by_name]
    if unknown:
        raise ValueError(
            'No color is defined for node(s): {}'.format(unknown))

    node_color = [color_by_name[name] for name in node_name]

    return {'name': node_name, 'color': node_color}, node_name_by_level

In [77]:
def get_link_dict(node_df, node_name_by_level, punk_db):
    """Build the Sankey links between each pair of neighbouring levels.

    For every (source node, target node) pair taken from two consecutive
    levels (type -> gender -> skin_tone -> attr_count_str) one link is
    emitted. Its value is the number of rows of ``punk_db`` that carry both
    labels; pairs that never co-occur are kept with a value of 0.

    Args:
        node_df: DataFrame indexed by node name, with an ``index`` column
            (the node id, i.e. its position in the node list) and a
            ``color`` column (hex color string).
        node_name_by_level: dict mapping each level name to the list of node
            names on that level.
        punk_db: DataFrame with one column per level name.

    Returns:
        dict with four parallel lists: ``source`` and ``target`` (integer
        node ids), ``value`` (row counts) and ``color`` (rgba string derived
        from the target node's color).
    """
    source_li = list()
    target_li = list()
    value_li = list()
    link_color_li = list()

    levels_list = ['type', 'gender', 'skin_tone', 'attr_count_str']

    # links between each pair of neighbor levels
    for source_level, target_level in zip(levels_list, levels_list[1:]):
        for source_node_name in node_name_by_level[source_level]:
            # Node ids are written as plain ints, as in the documented data
            # format. int() turns the value returned by .loc into something
            # json can serialize, and raises instead of silently
            # stringifying a Series if a node name is not unique in node_df.
            source_node_id = int(node_df.loc[source_node_name, 'index'])

            # The source-side mask does not depend on the target node.
            source_mask = punk_db[source_level] == source_node_name

            for target_node_name in node_name_by_level[target_level]:
                target_node_id = int(node_df.loc[target_node_name, 'index'])

                target_mask = punk_db[target_level] == target_node_name
                value = punk_db[source_mask & target_mask].shape[0]

                # A link is drawn in the (semi-transparent) color of its target.
                link_color = convert_to_rgba(
                    node_df.loc[target_node_name, 'color'])

                source_li.append(source_node_id)
                target_li.append(target_node_id)
                value_li.append(value)
                link_color_li.append(link_color)

    return {'source': source_li, 'target': target_li, 'value': value_li, 'color': link_color_li}

In [78]:
def between_skin_tone_and_attributes(item, source_skin_tone, target_attribute):
    """Tell whether ``item`` links a skin tone to an attribute.

    The result is truthy when ``item['skin_tone']`` equals
    ``source_skin_tone`` and ``target_attribute`` is contained in
    ``item['attributes']``. The attributes are only inspected when the skin
    tone matches.
    """
    tone_matches = item['skin_tone'] == source_skin_tone
    if not tone_matches:
        # Same value an ``and`` expression would yield for a falsy left side.
        return tone_matches
    return target_attribute in item['attributes']


def convert_to_rgba(hex_color, a=0.5):
    """Format a hex color as a CSS-style ``rgba(r,g,b,a)`` string.

    Args:
        hex_color: color in hex notation, e.g. ``'#FF5C5C'``.
        a: alpha (opacity) component placed last in the result.

    Returns:
        A string such as ``'rgba(255,92,92,0.5)'``.
    """
    channels = hex_to_rgb(hex_color)
    template = 'rgba({},{},{},{})'
    return template.format(*channels, a)


def get_dataset_vis1(tx_db, punk_db, trader_db):
    """Assemble the node and link data of visualization 1 (Sankey diagram).

    Only ``punk_db`` is read; ``tx_db`` and ``trader_db`` are accepted but
    not used here. The number of nodes and links is printed along the way.

    Returns:
        dict with the keys ``'node'`` (result of ``get_node_dict``) and
        ``'link'`` (result of ``get_link_dict``).
    """
    # Nodes
    node, node_name_by_level = get_node_dict(punk_db)
    print('{} nodes are saved.'.format(len(node['name'])))

    # Lookup table keyed by node name. reset_index() exposes each node's
    # position as an 'index' column before the name becomes the index.
    node_df = pd.DataFrame.from_dict(node).reset_index().set_index('name')

    # Links
    link = get_link_dict(node_df, node_name_by_level, punk_db)
    print('{} links are saved.'.format(len(link['source'])))

    return {'node': node, 'link': link}


# Build the dataset of visualization 1 and write it as JSON to
# '<VIS_DATA_PATH>/vis1_data.json' (an existing file is overwritten).
vis1_data = get_dataset_vis1(tx_db, punk_db, trader_db)
with open('{}/vis1_data.json'.format(VIS_DATA_PATH), 'w') as f:
    json.dump(vis1_data, f)


19 nodes are saved.
58 links are saved.
