#### Initailization

In [1]:
from feat_extract_log import *

In [2]:
train_design = ["RISCY-a", "RISCY-b", "RISCY-FPU-a", "RISCY-FPU-b"]
test_design_a = ["zero-riscy-a"]
test_design_b = ["zero-riscy-b"]

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
tile_size = 4
top_k = 5

#### Preprocessing

In [1]:
import numpy as np
from PIL import Image
import requests, base64
import json
import argparse
from io import BytesIO
import cv2
import heapq
import re


tile_size = 16
top_k = 20
image_size = 256


def get_label(label_path):
    with open(label_path, 'r') as f:
        logs = f.read()
    matches = re.findall(r"Total overcon =\s*([\d.]+)", logs)
    if matches:
        ans = float(matches[-1])
    else:
        ans = 0
        
    return ans

file_path = '/data2/NVIDIA/CircuitNet-N28/Dataset/congestion/feature/zero-riscy-a/7228-zero-riscy-a-1-c2-u0.9-m2-p4-f0.npy'
label_path = '/data2/NVIDIA/CircuitNet-N28/Dataset/logs/7228-zero-riscy-a-1-c2-u0.9-m2-p4-f0'
numpy_image = np.load(file_path)
batch_image = numpy_image.transpose(2,0,1)
image_features = []
image_inferences = []

for i, image in enumerate(batch_image):
    image_features.append(image)
    image_inferences.append(Image.fromarray(np.uint8(image * 255)))
    
get_label(label_path)

24.2

In [6]:
def get_all_features(logging_file_path):
    with open(logging_file_path, "r") as f:
        logging_file_string = f.read()
            
    final_features = {}
    
    for feat_func in feat_func_list:
        feat = feat_func(logging_file_string)
        final_features.update(feat)
        
    return final_features

#### Fitness Scoring

In [7]:
get_all_features(label_path)

{'total_wirelength': 2237883.0,
 'number_vias': 444319.0,
 'number_of_multi_cut_vias': 283.0,
 'number_of_single_cut_vias': 444036.0,
 'max_overcon': 37.0,
 'total_overcon': 24.2,
 'worst_layer_gcell_overcon_rate': 41.14,
 'hard_to_access_pins_ratio': 0.37574349442379185,
 'instance_blockages_count': 76180,
 'early_gr_overflow_percentage': 3.39,
 'horizontal_overflow_percentage': 0.08,
 'congestion_prediction_accuracy': -126.60149253731343,
 'initial_placement_efficiency': 0.015246226558926665,
 'area_based_congestion_density': 3.7037500000000003,
 'multi_layer_pin_access_variability': 38.06713524731668,
 'average_layer_congestion': 2.37875,
 'pin_density_variance_map': np.float64(146.6786921875),
 'non_default_routing_rule_usage': 403,
 'crosstalk_sensitive_zones': 20,
 'inter_macro_channel_congestion': 58.62}

In [2]:
import pandas as pd
from tqdm import tqdm
import os
import torch


def dataset_setting(designs):
    df_list = []
    for design in designs:
        feature_path = f"/data2/NVIDIA/CircuitNet-N28/Dataset/congestion/feature/{design}/" 
        label_path = f"/data2/NVIDIA/CircuitNet-N28/Dataset/logs/"

        labels = []
        ids = []

        for filename in tqdm(os.listdir(feature_path)):
            file_path = os.path.join(label_path, filename)
            log_file_path = file_path.replace(".npy", "")
            try:
                label = get_label(log_file_path)
            except:
                label = np.nan
                
            ids.append(filename)
            labels.append(label)
            
        df = pd.DataFrame({"id": ids,})

        for filename in tqdm(os.listdir(feature_path)):
            file_path = os.path.join(label_path, filename)
            log_file_path = file_path.replace(".npy", "")
            
            index = (df["id"] == filename)
            
            try:
                all_features = get_all_features(log_file_path)
            except:
                all_features = {}
            for key, value in all_features.items():
                df.loc[index, key] = value
                
        
        df['label'] = labels
        df_list.append(df)
        
    return pd.concat(df_list)

In [9]:
train_df = dataset_setting(train_design)
test_df_a = dataset_setting(test_design_a)
test_df_b = dataset_setting(test_design_b)

100%|██████████| 2003/2003 [00:00<00:00, 12223.06it/s]
100%|██████████| 2003/2003 [00:17<00:00, 114.46it/s]
100%|██████████| 1858/1858 [00:00<00:00, 6100.24it/s]
100%|██████████| 1858/1858 [00:20<00:00, 89.23it/s] 
100%|██████████| 1969/1969 [00:00<00:00, 5479.00it/s]
100%|██████████| 1969/1969 [00:22<00:00, 88.78it/s] 
100%|██████████| 1248/1248 [00:00<00:00, 4993.75it/s]
100%|██████████| 1248/1248 [00:21<00:00, 59.31it/s]
100%|██████████| 2042/2042 [00:00<00:00, 14284.30it/s]
100%|██████████| 2042/2042 [00:14<00:00, 139.26it/s]
100%|██████████| 1122/1122 [00:00<00:00, 11975.43it/s]
100%|██████████| 1122/1122 [00:10<00:00, 105.24it/s]


In [10]:
train_df = train_df[train_df['label'].notna()]
test_df_a = test_df_a[test_df_a['label'].notna()]
test_df_b = test_df_b[test_df_b['label'].notna()]

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_df = train_df[["id"] + list(feat_pool.keys()) + ["label"]]
test_df_a = test_df_a[["id"] + list(feat_pool.keys()) + ["label"]]
test_df_b = test_df_b[["id"] + list(feat_pool.keys()) + ["label"]]
train_df[list(feat_pool.keys())] = scaler.fit_transform(train_df[list(feat_pool.keys())])
test_df_a[list(feat_pool.keys())] = scaler.fit_transform(test_df_a[list(feat_pool.keys())]) 
test_df_b[list(feat_pool.keys())] = scaler.fit_transform(test_df_b[list(feat_pool.keys())])

In [12]:
train_df.reset_index(drop=True, inplace=True)
test_df_a.reset_index(drop=True, inplace=True)
test_df_b.reset_index(drop=True, inplace=True)

In [13]:
train_df.shape, test_df_a.shape, test_df_b.shape

((5597, 22), (1337, 22), (1122, 22))

In [14]:
train_df.to_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/train_df_log.csv", index=False)
test_df_a.to_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_log_a.csv", index=False)
test_df_b.to_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_log_b.csv", index=False)

In [15]:
train_df = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/train_df_log.csv")
test_df_a = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_log_a.csv")
test_df_b = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_log_b.csv")

In [16]:
train_df.reset_index(drop=True, inplace=True)
test_df_a.reset_index(drop=True, inplace=True)
test_df_b.reset_index(drop=True, inplace=True)

In [17]:
def id_to_design(name):
    for d in train_design:
        if d in name:
            return d
    return None

In [18]:
train_df["design"] = train_df["id"].apply(id_to_design)

In [19]:
preference_df_list = []
num_pairs = 50000

for design, group in train_df.groupby("design"):
    preference_df = pd.DataFrame(columns=["design", "chosen", "rejected", "chosen_score", "rejected_score"])
    group = group.reset_index(drop=True)
    num_samples = len(group)
    for i in tqdm(range(0, num_samples)):
        for j in range(i+1, num_samples):
            sample_a = group.iloc[i]
            sample_b = group.iloc[j]
            if sample_a["label"] > sample_b["label"]:
                chosen = sample_a["id"]
                rejected = sample_b["id"]
                chosen_score = sample_a["label"]
                rejected_score = sample_b["label"]
            else:
                chosen = sample_b["id"]
                rejected = sample_a["id"]
                chosen_score = sample_b["label"]
                rejected_score = sample_a["label"]
                
            preference_df = preference_df._append({"design": design, "chosen": chosen, "rejected": rejected, "chosen_score": chosen_score, "rejected_score": rejected_score}, ignore_index=True)
            
    preference_df = preference_df.sample(frac=1).reset_index(drop=True)
    preference_df = preference_df.sample(n=num_pairs)
    preference_df_list.append(preference_df)
            

  0%|          | 0/1169 [00:00<?, ?it/s]

  0%|          | 5/1169 [00:05<20:11,  1.04s/it]


KeyboardInterrupt: 

In [None]:
preference_df = pd.concat(preference_df_list)
preference_df.reset_index(drop=True, inplace=True)
preference_df.to_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/preference_df_log.csv", index=False)

### Mixed part

In [7]:
train_df = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/train_df.csv")
test_df_a = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_a.csv")
test_df_b = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_b.csv")

train_df_log = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/train_df_log.csv")
test_df_log_a = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_log_a.csv")
test_df_log_b = pd.read_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_log_b.csv")

In [8]:
train_df = pd.merge(train_df_log, train_df, on=['id','label'], how='inner').reset_index(drop=True)
test_df_a = pd.merge(test_df_log_a, test_df_a, on=['id','label'], how='inner').reset_index(drop=True)
test_df_b = pd.merge(test_df_log_b, test_df_b, on=['id','label'], how='inner').reset_index(drop=True)

In [9]:
train_df.shape, test_df_a.shape, test_df_b.shape

((5597, 47), (1337, 47), (1122, 47))

In [12]:
train_df.to_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/train_df_mixed.csv", index=False)
test_df_a.to_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_mixed_a.csv", index=False)
test_df_b.to_csv("/home/felixchaotw/mllm-physical-design/armo/dataset/test_df_mixed_b.csv", index=False)