In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

In [2]:
gpa_only_datasets = "SEG_preprocess_pipeline_retrain"

In [3]:
gpa_rip_datasets = [    
    'STREAM',
    'GEMM_STREAM'
]

In [4]:
dataset_types = ["train", "val", "test"]

In [5]:
result = []
result_header = []
for dataset_type in dataset_types:
    result_header.extend([
        f"{dataset_type}_set_num_oov_gpa", 
        f"{dataset_type}_oov_ratio_gpa", 
        f"{dataset_type}_num_oov_rip", 
        f"{dataset_type}_oov_ratio_rip"
        ])

In [6]:
vocab = np.genfromtxt("{}/static/vocabulary.csv".format(gpa_only_datasets), delimiter="\n", dtype=np.int64)

tmp = []
for dataset_type in dataset_types:
    dataset = np.genfromtxt(f"{gpa_only_datasets}/data/SEG_retrain_{dataset_type}_set.csv", delimiter="\n", dtype=np.int64)

    num_oov = pd.Series(dataset).value_counts()[np.where(vocab == -1)[0][0]]
    oov_ratio = num_oov / dataset.size

    tmp.extend([num_oov, oov_ratio, None, None])

result.append(tmp)

In [7]:
result

[[37973,
  0.23543161118723302,
  None,
  None,
  7825,
  0.1940579817969893,
  None,
  None,
  7743,
  0.2176283762893842,
  None,
  None]]

In [8]:
for model_name in gpa_rip_datasets:
    vocabulary_gpa = np.genfromtxt(f"{model_name}/static/vocabulary_gpa.csv", delimiter="\n", dtype=np.int64)
    vocabulary_rip = np.genfromtxt(f"{model_name}/static/vocabulary_rip.csv", delimiter="\n", dtype=np.uint64)
    
    tmp = []
    for dataset_type in dataset_types:
        dataset = pd.read_csv(f"{model_name}/data/{model_name}_{dataset_type}_set.csv")        

        num_oov_gpa = pd.Series(dataset["gpa"]).value_counts()[np.where(vocabulary_gpa == -1)[0][0]]
        num_oov_rip = pd.Series(dataset["rip"]).value_counts()[np.where(vocabulary_rip == 0)[0][0]]

        oov_ratio_gpa = num_oov_gpa / dataset.size
        oov_ratio_rip = num_oov_rip / dataset.size

        tmp.extend([num_oov_gpa, oov_ratio_gpa, num_oov_rip, oov_ratio_rip])
    result.append(tmp)

In [9]:
result

[[37973,
  0.23543161118723302,
  None,
  None,
  7825,
  0.1940579817969893,
  None,
  None,
  7743,
  0.2176283762893842,
  None,
  None],
 [1417701,
  0.25076518970549216,
  2030041,
  0.359076855045547,
  273517,
  0.1935203371218982,
  561748,
  0.39745120902010506,
  268756,
  0.21550546228999212,
  452780,
  0.36306747836573927],
 [1589345,
  0.1279771375675418,
  4299960,
  0.3462411071573051,
  510516,
  0.16443084232977512,
  1124786,
  0.36227955523575844,
  361264,
  0.13187320221355878,
  671602,
  0.2451567450757078]]

In [15]:
df_oov_ratio = pd.DataFrame(result, columns=result_header, index=["SEG", 'STREAM', 'GEMM_STREAM'])
df_oov_ratio

Unnamed: 0,train_set_num_oov_gpa,train_oov_ratio_gpa,train_num_oov_rip,train_oov_ratio_rip,val_set_num_oov_gpa,val_oov_ratio_gpa,val_num_oov_rip,val_oov_ratio_rip,test_set_num_oov_gpa,test_oov_ratio_gpa,test_num_oov_rip,test_oov_ratio_rip
SEG,37973,0.235432,,,7825,0.194058,,,7743,0.217628,,
STREAM,1417701,0.250765,2030041.0,0.359077,273517,0.19352,561748.0,0.397451,268756,0.215505,452780.0,0.363067
GEMM_STREAM,1589345,0.127977,4299960.0,0.346241,510516,0.164431,1124786.0,0.36228,361264,0.131873,671602.0,0.245157


In [16]:
df_oov_ratio.to_csv("oov_ratio.csv")