In [None]:
import pandas as pd
import csv
from system import *
from analye_model import *
from plot_rooflines import *
from operators import SoftMax, layer_norm, GEMM, attention
# Format every float shown by pandas with exactly two decimal places.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

## Lab Layout 
Part 1 : Understanding various operators 
-  Writing the number of operations for SoftMax, Layer Normalization, Q/K/V Multiplication, Attention - 3 points
-  Writing the data movement for SoftMax, Layer Normalization, Q/K/V Multiplication, Attention - 1 point

Part 2 : Runtime Computations - 1 point
- Compute time 
- Memory time 
- Roofline time 


Part 3 : Building Neural Networks  - 1 point
- Llama
- gpt3


Part 4 : Comparing the performance of neural networks on different hardware - 2 points

## A.1 Various operators

In [None]:
## Only run this once you have completed code in operators.py
# Two test instances per operator; each dimension list is handed directly to
# the operator constructor imported from operators.py.
softmax1 = SoftMax([2, 256])
softmax2 = SoftMax([254, 5])

ln1 = layer_norm([2, 8, 512])
ln2 = layer_norm([8, 128, 1024])

gemm1 = GEMM([32, 16, 8, 32])
gemm2 = GEMM([4, 128, 256, 128])

attn1 = attention([3, 256, 96, 128])
attn2 = attention([1, 256, 384, 12])

# (row label, operator) pairs, in the order the rows appear in the CSV.
a1_cases = [
    ('softmax1', softmax1),
    ('softmax2', softmax2),
    ('ln1', ln1),
    ('ln2', ln2),
    ('gemm1', gemm1),
    ('gemm2', gemm2),
    ('attn1', attn1),
    ('attn2', attn2),
]

# One CSV row per operator: label, list of get_tensors() results, get_num_ops().
with open('output_a1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for case_name, case_op in a1_cases:
        writer.writerow([case_name, list(case_op.get_tensors()), case_op.get_num_ops()])

## A.2 Runtime Computations.

In [None]:
# The eight operators built in section A.1, grouped by operator type.
example_network = [
    softmax1, softmax2,
    ln1, ln2,
    gemm1, gemm2,
    attn1, attn2,
]

In [None]:
## A100 https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
# NOTE(review): units are not stated in this notebook (presumably GB/s for
# offchip_mem_bw, TFLOPS for flops and MHz for frequency) — confirm in system.py.
A100_GPU = System(
    offchip_mem_bw=1935,
    flops=312,
    frequency=1095,
    compute_efficiency=0.75,
    memory_efficiency=0.7,
)
## https://developer.nvidia.com/embedded/jetson-modules
# NOTE(review): these figures look like an Orin Nano-class module rather than
# the original Jetson Nano — confirm against the linked table.
jetson_nano = System(
    offchip_mem_bw=34,
    flops=20,
    frequency=625,
    compute_efficiency=0.85,
    memory_efficiency=0.75,
)

In [None]:
# Analyse the example network on the A100 system description.
model_df = analysis_model(example_network, A100_GPU)

# Save the analysis result without the index column.
model_df.to_csv('output_a2.csv', index=False)

## A.3 Building Neural Networks

### TODO A.3.i : Llama 7B prefill

In [None]:
## For reference only.
# Shows the declaration format: a network is a plain list of operator instances.
# NOTE: this rebinds `example_network` from section A.2 — re-run the A.2 cells
# before regenerating 'output_a2.csv' if this cell has been executed.
batch_size = 2000
example_network = [
    layer_norm([batch_size, 5, 128]),
    GEMM([batch_size, 8, 64, 512]),
    GEMM([batch_size, 8, 16, 32]),
]

In [None]:
def llama_7B_prefill(batch_size):
    """Return the operator list describing a Llama 7B-like prefill pass.

    TODO A.3.i: fill in the operators, following the figure in the PDF
    document and the declaration format shown by ``example_network``.

    Args:
        batch_size: batch size to use when declaring the operators (the
            reference ``example_network`` uses it as the leading dimension).

    Returns:
        A new list of operators; empty until the TODO is completed.
    """
    model_arch = [
        # TODO A.3.i: add the Llama 7B prefill operators here.
    ]
    return model_arch

### TODO A.3.ii : gpt3 175B decode

In [None]:
def gpt3_175B_decode(batch_size):
    """Return the operator list describing a GPT-3 175B-like decode pass.

    TODO A.3.ii: fill in the operators, following the figure in the PDF
    document and the declaration format shown by ``example_network``.

    Args:
        batch_size: batch size to use when declaring the operators (the
            reference ``example_network`` uses it as the leading dimension).

    Returns:
        A new list of operators; empty until the TODO is completed.
    """
    model_arch = [
        # TODO A.3.ii: add the GPT-3 175B decode operators here.
    ]
    return model_arch

### Running the networks

In [None]:

# Analyse the Llama prefill network at batch size 64 on the A100 system and show the result.
llama_on_a100_df = analysis_model(llama_7B_prefill(64), A100_GPU)
display(llama_on_a100_df)

# dot_roofline comes from plot_rooflines (presumably draws the roofline plot — confirm).
dot_roofline(llama_on_a100_df, A100_GPU)
# Totals are summed over the "Cycles" and "Total Data (MB)" columns of the result.
print(f'Total Cycles:{sum(llama_on_a100_df.loc[:, "Cycles"]):0.2f}, Total data (MB): {sum(llama_on_a100_df.loc[:, "Total Data (MB)"]):0.2f}')

In [None]:
# Analyse the GPT-3 decode network at batch size 64 on the A100 system and show the result.
gpt3_on_a100_df = analysis_model(gpt3_175B_decode(64), A100_GPU)
display(gpt3_on_a100_df)

# dot_roofline comes from plot_rooflines (presumably draws the roofline plot — confirm).
dot_roofline(gpt3_on_a100_df, A100_GPU)
# Totals are summed over the "Cycles" and "Total Data (MB)" columns of the result.
print(f'Total Cycles:{sum(gpt3_on_a100_df.loc[:, "Cycles"]):0.2f}, Total data (MB): {sum(gpt3_on_a100_df.loc[:, "Total Data (MB)"]):0.2f}')

In [None]:
# Save both A100 analysis results (index column omitted): A.3.i is Llama, A.3.ii is GPT-3.
llama_on_a100_df.to_csv('output_a3i.csv', index=False)
gpt3_on_a100_df.to_csv('output_a3ii.csv', index=False)

## A.4 Compare on Different Hardware

### TODO A.4.i
Generate CSV files for Llama and GPT-3 on the `jetson_nano` system, with batch size 4. <br>
Make sure to name the CSV files 'output_a4i.csv' and 'output_a4ii.csv', respectively.

### TODO A.4.ii

Comment on how operator behavior changes between the two systems. Does it change, and if so, why?

### TODO A.4.iii

For running GPT-3 175B decode, what changes to the hardware specs would you suggest to help optimize performance?