# Machine Learning Model for Performance Prediction

## Introduction
This project focuses on developing a machine learning model to predict the performance of configurations based on various features extracted from data files. We use LightGBM's ranking model to assess the impact of configurations on performance.

## Data Preparation
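The data used in this project resides in the `./data/tpugraphs/npz/tile/xla` directory, split into `train`, `valid`, and `test` sub-directories. Each `.npz` file stores node features, node opcodes, edge indices, configuration features, configuration runtimes, and runtime normalizers. For training and validation, at most 5,000 configurations are sampled per file; the test set keeps every configuration in its original order.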

## Model Training
* We employ a `LightGBM Ranker` to train our model. The ranker uses features such as node characteristics and configuration parameters to predict the performance ranks within different groups.
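* The model's predictions are evaluated on the validation set: for each file, the best runtime among the five top-ranked configurations is compared with the true best runtime, and the score `2 - predicted_best / true_best` is averaged over all files (1.0 means the true best configuration was always among the top five).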

## Directory Structure
- `data/`: Contains the datasets in `.npz` format.
- `scripts/`: Contains the source code.

## Output
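The predictions are saved in `result_xla.csv`, which has one row per test file with two columns: `ID` (`tile:xla:<file name>`) and `TopConfigs` (the indices of the five configurations predicted to perform best, separated by `;`).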


In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from lightgbm import LGBMRanker

from lightgbm import LGBMRegressor

In [2]:
# Root directory of the tile/XLA .npz files; get_df() joins the split name onto it.
tile_path = './data/tpugraphs/npz/tile/xla'

In [3]:
# Function to load and process data from files
def get_df(mode, sample_config=None):
    """Load every ``.npz`` file of one split into a per-configuration DataFrame.

    Files are read from ``os.path.join(tile_path, mode)`` in a deterministic
    shuffled order; ``fid`` is a file's position in that shuffled order.

    Parameters
    ----------
    mode : str
        Name of the sub-directory of ``tile_path`` to read
        (the notebook uses 'train', 'valid' and 'test').
    sample_config : bool, optional
        If truthy, keep at most 5000 configurations per file, drawn with a
        fixed random seed. If falsy, keep every configuration in its
        original order.

    Returns
    -------
    pandas.DataFrame
        One row per kept configuration with the columns ``fid``, ``file``,
        ``n`` (first dimension of ``node_feat``), ``m`` (first dimension of
        ``edge_index``), ``c0..c{k-1}`` (the configuration feature vector),
        ``runtime`` and ``runtime_normalizers``. An empty split yields an
        empty frame with only the six fixed columns.

    Raises
    ------
    ValueError
        If a file does not contain exactly the expected arrays, in the
        expected order.
    """
    expected_keys = ['node_feat', 'node_opcode', 'edge_index', 'config_feat',
                     'config_runtime', 'config_runtime_normalizers']
    max_sampled_configs = 5000

    split_dir = os.path.join(tile_path, mode)
    # Deterministic shuffle of the file order (fixed seed).
    shuffled_files = list(
        pd.Series(os.listdir(split_dir)).sample(frac=1, random_state=42)
    )

    rows = []
    # tqdm wraps the list (not enumerate) so the progress bar knows the total.
    for fid, fname in enumerate(tqdm(shuffled_files)):
        # The context manager closes the underlying archive once the arrays
        # have been read, instead of leaving one open handle per file.
        with np.load(os.path.join(split_dir, fname)) as npz:
            if list(npz.files) != expected_keys:
                raise ValueError(
                    'unexpected arrays in %s: %s' % (fname, list(npz.files))
                )
            d = {key: npz[key] for key in expected_keys}

        node_feat_r = d['node_feat'].shape[0]
        edge_index_r = d['edge_index'].shape[0]
        config_feat = d['config_feat']
        runtimes = d['config_runtime']
        normalizers = d['config_runtime_normalizers']

        cfg_num = len(config_feat)
        if sample_config:
            # Cap the number of configurations taken from any single file.
            sample_num = min(cfg_num, max_sampled_configs)
            cfg_idxs = list(
                pd.Series(range(cfg_num)).sample(n=sample_num, random_state=42)
            )
        else:
            cfg_idxs = range(cfg_num)  # Use original order if not sampling

        for cfg_idx in cfg_idxs:
            rows.append(
                [fid, fname, node_feat_r, edge_index_r]
                + list(config_feat[cfg_idx])
                + [runtimes[cfg_idx], normalizers[cfg_idx]]
            )

    # Every row carries six non-configuration fields; the rest are c0..c{k-1}.
    n_cfg_feats = len(rows[-1]) - 6 if rows else 0
    cols = (
        ['fid', 'file', 'n', 'm']
        + ['c' + str(i) for i in range(n_cfg_feats)]
        + ['runtime', 'runtime_normalizers']
    )
    return pd.DataFrame(rows, columns=cols)

In [6]:
# Build the training frame; sampling caps each file at 5000 configurations.
train = get_df('train', sample_config=True)

5709it [00:34, 166.71it/s]


In [8]:
# Validation uses the same per-file sampling as training.
valid = get_df('valid', sample_config=True)
# The test split is not sampled: every configuration is kept in its original order.
test = get_df('test', sample_config=False)

676it [00:02, 234.69it/s]
844it [00:04, 178.20it/s]
676it [00:02, 236.58it/s]
844it [00:04, 194.22it/s]


In [9]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,fid,file,n,m,c0,c1,c2,c3,c4,c5,...,c16,c17,c18,c19,c20,c21,c22,c23,runtime,runtime_normalizers
0,0,mlperf_nmt_batch_64_-2d518446041f67ae.npz,31,32,3.0,4.0,16.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,563141,374882
1,0,mlperf_nmt_batch_64_-2d518446041f67ae.npz,31,32,1.0,4.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1787211,375704
2,0,mlperf_nmt_batch_64_-2d518446041f67ae.npz,31,32,1.0,8.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1590034,375034
3,0,mlperf_nmt_batch_64_-2d518446041f67ae.npz,31,32,5.0,1.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1347065,375378
4,0,mlperf_nmt_batch_64_-2d518446041f67ae.npz,31,32,5.0,3.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1622638,375704


In [10]:
# Scale factor for the coarse label below; split_bin() also uses it as the
# number of quantile buckets.
bin_num = 10

# Coarse integer label: runtime relative to its normalizer, scaled by bin_num
# and truncated towards zero.
train['label'] = (
    train['runtime'].div(train['runtime_normalizers']).mul(bin_num).astype(int)
)
valid['label'] = (
    valid['runtime'].div(valid['runtime_normalizers']).mul(bin_num).astype(int)
)

# NOTE: disabled variants of this cell rescaled the label per 'fid' by the
# group's maximum label (optionally inverted as bin_num - bucket); they are
# not executed.

In [11]:
# Preview the validation frame, now including the coarse 'label' column.
valid.head()

Unnamed: 0,fid,file,n,m,c0,c1,c2,c3,c4,c5,...,c17,c18,c19,c20,c21,c22,c23,runtime,runtime_normalizers,label
0,0,tf2_bert_pretrain_dynamic_batch_size_257ca1e9f...,54,54,1.0,2.0,8.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,267084,56291,47
1,0,tf2_bert_pretrain_dynamic_batch_size_257ca1e9f...,54,54,1.0,2.0,1.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1064317,56378,188
2,0,tf2_bert_pretrain_dynamic_batch_size_257ca1e9f...,54,54,1.0,2.0,2.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,385250,56291,68
3,0,tf2_bert_pretrain_dynamic_batch_size_257ca1e9f...,54,54,1.0,4.0,2.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,349327,56358,61
4,0,tf2_bert_pretrain_dynamic_batch_size_257ca1e9f...,54,54,1.0,2.0,8.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125958,56378,22


In [0]:
# Define function for splitting data into bins
def split_bin(x, n_bins=None):
    """Turn one group's values into quantile-based ranking labels.

    Bucket edges are the ``n_bins`` quantiles of ``x``. Duplicate edges are
    dropped, so fewer than ``n_bins`` buckets may be produced. The bucket
    index is then flipped so that the smallest values receive the largest
    label (``n_bins``).

    Parameters
    ----------
    x : pandas.Series
        Values of a single group.
    n_bins : int, optional
        Number of quantile buckets requested. Defaults to the module-level
        ``bin_num``.

    Returns
    -------
    pandas.Series
        ``n_bins - bucket`` for every element, indexed like ``x``. Elements
        that fall into no bucket (e.g. NaN) are treated as bucket 0.
    """
    if n_bins is None:
        n_bins = bin_num

    edges = pd.qcut(x, q=n_bins, retbins=True, duplicates='drop')[1]
    # labels=False returns the integer bucket positions directly.
    buckets = pd.cut(x, bins=edges, include_lowest=True, labels=False)
    buckets = pd.Series(np.asarray(buckets), index=x.index).fillna(0)
    return n_bins - buckets

In [7]:
# Replace the coarse labels with per-file quantile labels from split_bin().
# group_keys=False keeps the original row index so the result aligns on assignment.
train['label'] = (
    train.groupby('fid', group_keys=False)['label'].apply(split_bin)
)
train['label'] = train['label'].astype(int)

valid['label'] = (
    valid.groupby('fid', group_keys=False)['label'].apply(split_bin)
)
valid['label'] = valid['label'].astype(int)

In [8]:
print('start train')

# Target column and model inputs: the two size columns plus c0..c23.
label = 'label'
feats = ['n', 'm', *('c' + str(i) for i in range(24))]

tr_x = train[feats]
tr_y = train[label]
val_x = valid[feats]
val_y = valid[label]

# Rows per 'fid', in sorted 'fid' order; passed to the ranker as group sizes.
g_train = train.groupby(['fid'])['label'].count().values
g_val = valid.groupby(['fid'])['label'].count().values

start train


In [None]:
# Configure the ranker: up to 300 trees, stopping early once the validation
# metric has not improved for 10 rounds.
# NOTE(review): n_jobs=100 is hard-coded; confirm it suits the target machine.
model = LGBMRanker(
    n_estimators=300,
    n_jobs=100,
    random_state=64,
    early_stopping_rounds=10,
)

# Fit on the training groups and monitor NDCG@5 on the validation groups.
model.fit(
    tr_x,
    tr_y,
    group=g_train,
    eval_set=[(val_x, val_y)],
    eval_group=[g_val],
    eval_at=[5],
)



[1]	valid_0's ndcg@5: 0.745303
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's ndcg@5: 0.77097
[3]	valid_0's ndcg@5: 0.799257
[4]	valid_0's ndcg@5: 0.815569
[5]	valid_0's ndcg@5: 0.820369
[6]	valid_0's ndcg@5: 0.83667
[7]	valid_0's ndcg@5: 0.851308
[8]	valid_0's ndcg@5: 0.871638
[9]	valid_0's ndcg@5: 0.874978
[10]	valid_0's ndcg@5: 0.877763
[11]	valid_0's ndcg@5: 0.872856
[12]	valid_0's ndcg@5: 0.87111
[13]	valid_0's ndcg@5: 0.873603
[14]	valid_0's ndcg@5: 0.871797
[15]	valid_0's ndcg@5: 0.875599
[16]	valid_0's ndcg@5: 0.877075
[17]	valid_0's ndcg@5: 0.877016
[18]	valid_0's ndcg@5: 0.878004
[19]	valid_0's ndcg@5: 0.879114
[20]	valid_0's ndcg@5: 0.882184
[21]	valid_0's ndcg@5: 0.884394
[22]	valid_0's ndcg@5: 0.885451
[23]	valid_0's ndcg@5: 0.886954
[24]	valid_0's ndcg@5: 0.89008
[25]	valid_0's ndcg@5: 0.896522
[26]	valid_0's ndcg@5: 0.895201
[27]	valid_0's ndcg@5: 0.895939
[28]	valid_0's ndcg@5: 0.894633
[29]	valid_0's ndcg@5: 0.896722
[30]	valid_0's ndcg@5: 

In [None]:
# Score the validation rows; negating the prediction makes rank 1 the
# highest-scored configuration of each file.
val_pred = model.predict(val_x)
valid['pred'] = -val_pred
valid['rank'] = valid.groupby('fid')['pred'].rank(method='first')

# Compare the best runtime among the k top-ranked rows with the true best.
k = 5
pred_best = valid.loc[valid['rank'] <= k].groupby('fid')['runtime'].min()
true_best = valid.groupby('fid')['runtime'].min()
scores = 2 - pred_best.div(true_best)
score = scores.mean()
score

In [None]:
# Rank the validation configurations of every file by predicted score
# (the sign flip makes the highest score rank first).
val_pred = model.predict(val_x)
valid['pred'] = -val_pred
valid['rank'] = (
    valid.groupby('fid')['pred'].rank(method='first')
)

# Per-file score: 2 - (best runtime within the top k) / (true best runtime),
# averaged over all validation files.
k = 5
pred_best = (
    valid[valid['rank'].le(k)].groupby('fid')['runtime'].min()
)
true_best = valid.groupby('fid')['runtime'].min()
scores = 2 - pred_best / true_best
score = scores.mean()
score

In [None]:
def get_idx(x):
    """Return the positions 0..len(x)-1 as a Series indexed like ``x``."""
    positions = range(len(x))
    return pd.Series(positions, index=x.index)

# Number the configurations of each test file by their row position within the file.
test['config_id'] = (
    test.groupby('fid', group_keys=False)['file'].apply(get_idx)
)

# Score the test rows and rank them per file; negating the prediction makes
# rank 1 the highest-scored configuration.
test_x = test[feats]
test_pred = model.predict(test_x)
test['pred'] = -test_pred
test['rank'] = (
    test.groupby('fid', group_keys=False)['pred'].rank(method='first')
)

# Keep the five top-ranked configurations of every file.
test_top = test.loc[test['rank'] <= 5]

In [None]:
# Collapse the selected rows of each file into one ';'-separated string of
# configuration ids (in row order).
test_top = test_top.groupby('file', as_index=False)['config_id'].apply(
    lambda ids: ';'.join(ids.astype(str))
)
# Build the submission ID from the file name without its extension.
# os.path.splitext removes only the final extension, so a name containing
# additional dots is not truncated.
test_top['ID'] = 'tile:xla:' + test_top['file'].apply(
    lambda name: os.path.splitext(name)[0]
)
test_top['TopConfigs'] = test_top['config_id']
test_tile_df = test_top[['ID', 'TopConfigs']]

In [None]:
# Write the submission (ID, TopConfigs columns) without the DataFrame index.
test_tile_df.to_csv('result_xla.csv', index=False)