In [1]:
import pandas as pd
import re
import os
import matplotlib.pyplot as plt
from IPython.display import display
import warnings
warnings.simplefilter('always')
import importlib

from scripts.a_data_loading_cleaning import run_load_clean_diagnose_data

# Lift pandas' display limits so rendered DataFrames are never truncated,
# neither in the number of columns nor in the number of rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Data Loading & Cleaning

In [2]:
# Location of the scenario results CSV, relative to the notebook's working
# directory. A plain string literal: there is nothing to interpolate, so no
# f-string prefix is needed.
csv_path = "results/scenarios_results.csv"

# Project helper from scripts.a_data_loading_cleaning. Whatever it prints is
# shown as this cell's output; the object it returns is the `df` that every
# later cell works on.
df = run_load_clean_diagnose_data(csv_path)

✅ Total generated tokens value is constant: 16384
Original distribution:
total_generated_tokens
16384    18
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------
Round 1: Verfifying FLOPs on raw df
NB: FLOPs values are NOT constant: [52638582308864 20248623316992]
Original distribution:
flops
20248623316992     7
52638582308864    11
Name: count, dtype: int64

Dominant FLOPs value: 52638582308864
- Affected rows count: 7
- Affected row indices: [1, 2, 3, 4, 5, 6, 7]
- Affected configs: ['A1_Single_GPU', 'R5_Tail_SingleGPU_Greedy', 'R6_Tail_DualGPU_TopP', 'R1_Ultralow_SingleGPU_Greedy', 'R2_Ultralow_DualGPU_TopP', 'R7_anti_platonic_ideal', 'A4_Parallel_Exploit']
----------------------------------------------------------------------------------------------------
FLOP Differentiators:
Round 2: Verfifying FLOPs on corrected df
NB: FLOPs values are NOT constant: [1.69499710e+13 5.26385823e+13]
Original distribution:


  if not verify_flops(df):
  verify_flops(df)


# Basic understanding of the contents

In [3]:
# Size check on the loaded data: total rows, number of distinct scenarios,
# and how many rows each scenario contributes.
print(
    f"number of observation: {len(df.index)}",
    f"number of scenarios: {len(df['config_name'].drop_duplicates())}",
    f"distribution of scenarios:{df['config_name'].value_counts()}",
    sep="\n",
)

number of observation: 18
number of scenarios: 9
distribution of scenarios:config_name
A3_Quantisation_Focus           2
A1_Single_GPU                   2
R5_Tail_SingleGPU_Greedy        2
R6_Tail_DualGPU_TopP            2
R1_Ultralow_SingleGPU_Greedy    2
R2_Ultralow_DualGPU_TopP        2
R7_anti_platonic_ideal          2
A4_Parallel_Exploit             2
A2_2_GPU                        2
Name: count, dtype: int64


In [4]:
# Columns to include in the summary-statistics table below.
cols_to_describe = [
    "total_energy_kwh",
    "total_inference_time_sec",
    "average_latency_ms_per_batch",
    "throughput_queries_per_sec",
    "throughput_tokens_per_sec",
    "cpu_energy_total",
    "gpu_energy_total",
    "flops_per_token",
    "energy_per_token_kwh",
    "divergence_energy_flops",
]

# Last expression of the cell, so the describe() table is rendered as output.
df.loc[:, cols_to_describe].describe()

Unnamed: 0,total_energy_kwh,total_inference_time_sec,average_latency_ms_per_batch,throughput_queries_per_sec,throughput_tokens_per_sec,cpu_energy_total,gpu_energy_total,flops_per_token,energy_per_token_kwh,divergence_energy_flops
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,0.040852,154.481158,4186.705095,8.867068,1134.984738,0.01115,0.029638,2123674000.0,2.493386e-06,1.345997e-15
std,0.087315,194.658768,1632.169858,11.182359,1431.341998,0.01894,0.068873,1120706000.0,5.329292e-06,2.870266e-15
min,0.00053,3.187716,2185.761334,0.229485,29.374021,0.000154,0.000345,1034544000.0,3.234939e-08,2.046611e-17
25%,0.002098,8.425436,3265.133095,0.455988,58.366507,0.000424,0.001216,1034544000.0,1.280304e-07,7.974139000000001e-17
50%,0.006432,56.742517,3714.91123,2.256235,288.798103,0.002136,0.003966,2123674000.0,3.92602e-07,1.94144e-16
75%,0.023395,280.709971,4919.139026,15.207897,1946.610809,0.012999,0.011159,3212804000.0,1.427945e-06,7.367583e-16
max,0.339962,557.771778,8265.675273,40.154142,5139.730233,0.068603,0.270992,3212804000.0,2.074961e-05,1.106886e-14


In [5]:
# Bind the project's EDA module and force a reload so that edits made to the
# module file since the kernel started are picked up.
from scripts import b_exploratory_data_analysis as eda
eda = importlib.reload(eda)

# NOTE(review): the output saved with this cell ends in
# "TypeError: 'NoneType' object is not subscriptable" and an empty figure.
# The cause is inside the EDA module and is not visible from this notebook.
eda.plot_all_diagnostics(df)

📊 Plotting histogram...


  right=ast.Str(s=sentinel),
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


TypeError: 'NoneType' object is not subscriptable

<Figure size 800x600 with 0 Axes>

To do:
- [X] get the outliers
- [ ] unpack the divergence plot more
- [ ] work on the correlation matrix

# Basic stats

In [None]:
# Pull the descriptive-statistics helper from the project's scenario-analysis
# module and apply it to the DataFrame loaded earlier in the notebook.
# NOTE(review): what the helper computes or returns is defined in
# scripts/f_scenario_analysis and cannot be confirmed from this notebook.
from scripts.f_scenario_analysis import get_descriptive_stats
# Last expression of the cell: any return value is rendered as the output.
get_descriptive_stats(df)

In [None]:
# Histogram of energy-per-token after dividing every value by the column mean,
# so a value of 1.0 on the x-axis corresponds to the average observation.
(
    df['energy_per_token_kwh']
    .div(df['energy_per_token_kwh'].mean())
    .plot.hist(
        bins=50,
        title='Distribution of Normalised Energy per Token',
        xlabel='Energy per Token (normalised by mean) \n"How many times bigger (or smaller) is this value than the average?"',
        ylabel='Frequency',
    )
)

# Absolute Numbers

In [None]:
# Pull the appliance-comparison helper from the project's scenario-analysis
# module and apply it to the DataFrame loaded earlier in the notebook.
# NOTE(review): the comparison logic lives in scripts/f_scenario_analysis and
# is not visible here; presumably it relates the measured energy figures to
# household appliances. Confirm against the module.
from scripts.f_scenario_analysis import compare_energy_to_appliances
# Last expression of the cell: any return value is rendered as the output.
compare_energy_to_appliances(df)

# Artificial vs realistic

In [None]:
# Pull the artificial-vs-realistic comparison helper from the project's
# scenario-analysis module and apply it to the DataFrame loaded earlier.
# NOTE(review): how scenarios are split into "artificial" and "realistic" is
# decided inside scripts/f_scenario_analysis and is not visible here.
from scripts.f_scenario_analysis import artificial_v_realistic
# Last expression of the cell: any return value is rendered as the output.
artificial_v_realistic(df)

# Within-realistic

In [None]:
# Pull the within-realistic analysis helper from the project's
# scenario-analysis module and apply it to the DataFrame loaded earlier.
# NOTE(review): which rows count as "realistic" and what is compared is
# defined in scripts/f_scenario_analysis and is not visible here.
from scripts.f_scenario_analysis import within_realistic
# Last expression of the cell: any return value is rendered as the output.
within_realistic(df)