### Rewriting EKE code to run more effectively on NERSC

#### Fill in the project name and existing code location.

In [1]:
# Name of the project; set_up() uses it as the path of the processing directory.
project_name = "EKE"
# Directory with the existing code; set_up() copies it into the workspace.
existing_code_location = "EKE-example-original"

#### Set up the AI

In [2]:
import json
import shutil
import glob
import os
from pathlib import Path

import gpt_engineer.steps as steps
from gpt_engineer.ai import AI, fallback_model
from gpt_engineer.db import DB, DBs

# Problem description; set_up() writes this text to the project's "prompt" file.
problem_statement = '''I have this scientific computation that I wrote. 
I would like to optimize it such that it runs faster (utilizing parallel computation) and is more interactive.
I will be running this on an HPC system that has support for dask and slurm.
I will want to interact with the computation via a Jupyter notebook.
'''

def set_up(project_name, existing_code_location):
    """Prepare the project directory and build the AI client and databases.

    Writes the module-level ``problem_statement`` to ``<project_name>/prompt``,
    copies the existing code into ``<project_name>/workspace`` and stores a
    concatenated listing of that code under the ``all_output.txt`` key of the
    workspace database.

    Args:
        project_name: Path of the directory used for processing; it is
            created when missing and reused when it already exists.
        existing_code_location: Directory containing the code to start from.

    Returns:
        A ``(ai, dbs)`` tuple: the ``AI`` instance and the ``DBs`` bundle.
    """
    input_path = Path(project_name)
    input_path.mkdir(parents=True, exist_ok=True)

    prompt_file = input_path / "prompt"

    with open(prompt_file, "w") as file:
        file.write(problem_statement)

    input_path = input_path.absolute()
    print("The following location will be used for processing\nThe code will be output to the workspace directory of that location")
    print(input_path)

    model = "gpt-4"
    temperature = 0.1
    # fallback_model may substitute another model when the requested one is
    # unavailable, so the name is re-bound before the AI is built.
    model = fallback_model(model)
    ai = AI(
        model_name=model,
        temperature=temperature,
    )

    memory_path = input_path / "memory"
    workspace_path = input_path / "workspace"
    archive_path = input_path / "archive"

    # The project directory is reused across runs (mkdir above tolerates an
    # existing directory), so the copy has to tolerate an existing workspace
    # as well; without dirs_exist_ok a second call raises FileExistsError.
    shutil.copytree(existing_code_location, workspace_path, dirs_exist_ok=True)

    dbs = DBs(
        memory=DB(memory_path),
        logs=DB(memory_path / "logs"),
        input=DB(input_path),
        workspace=DB(workspace_path),
        preprompts=DB(Path(steps.__file__).parent / "preprompts"),
        archive=DB(archive_path),
    )

    dbs.workspace["all_output.txt"] = all_code_from_files(existing_code_location)

    return ai, dbs

def all_code_from_files(path):
    """Return one text blob listing every readable text file under *path*.

    The blob starts with a fixed header line. Each file then contributes its
    path relative to *path* wrapped in ``**``, followed by its content inside
    a fenced block. Files are listed in sorted path order.

    Every non-hidden regular file is considered, including files without an
    extension. Directories are ignored. Files that cannot be opened or are
    not valid UTF-8 text are reported on stdout and left out of the listing.

    Args:
        path: Root directory to scan recursively.

    Returns:
        The assembled listing as a single string.
    """
    pattern = os.path.join(path, "**", "*")
    # Sorted so the listing is identical on every run; glob order is arbitrary.
    candidates = sorted(glob.glob(pattern, recursive=True))

    sections = ["These are the files implementing the code\n"]
    for file_path in candidates:
        # The pattern also yields directories; only regular files can be read.
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                file_content = file.read()
        except (OSError, UnicodeDecodeError) as err:
            # One unreadable or binary file must not abort the whole listing.
            print(f"Skipping {file_path}: {err}")
            continue
        file_name = os.path.relpath(file_path, start=path)
        # The newline after the opening fence keeps the file's first line out
        # of the fence line itself.
        sections.append(f"**{file_name}**\n```\n{file_content}\n```\n\n")
    return "".join(sections)

def read_file_to_string(file_path):
    """Return the text content of *file_path*, or ``None`` if it is missing.

    A missing file is reported on stdout rather than raised.
    """
    try:
        handle = open(file_path, 'r')
    except FileNotFoundError:
        print(f"{file_path} not found.")
        return None
    with handle:
        return handle.read()
        
def do_step(step):
    """Run *step* with the global ``ai`` and ``dbs`` and log its messages.

    The serialized messages are stored in ``dbs.logs`` under the step's
    ``__name__``.
    """
    step_messages = step(ai, dbs)
    serialized = AI.serialize_messages(step_messages)
    dbs.logs[step.__name__] = serialized
    
def fix_code(how, add_default_end=True):
    """Run the ``fix_code`` step with *how* as the fix prompt.

    The final prompt is stored in ``dbs.input['fix_prompt']`` before the step
    is executed through ``do_step``.

    Args:
        how: Instructions describing the change to make.
        add_default_end: When true, append the standard closing instructions
            (fill in placeholders, return the full code in the same format).
    """
    if add_default_end:
        default_end = '''There might be placeholders in the code you have to fill in.
You provide fully functioning, well formatted code with few comments, that works and has no bugs.
Please return the full new code in the same format.
'''
        # Start the closing instructions on their own line; plain
        # concatenation glued them onto the last sentence of *how*.
        separator = "" if not how or how.endswith("\n") else "\n"
        how = how + separator + default_end
    dbs.input['fix_prompt'] = how
    do_step(steps.fix_code)


In [3]:
# Build the AI client and the databases; do_step() and fix_code() read both as globals.
ai, dbs = set_up(project_name, existing_code_location)

The following location will be used for processing
The code will be output to the workspace directory of that location
/Users/kberket/src/scalesci-demo/from_existing_code/EKE
Model gpt-4 not available for provided API key. Reverting to gpt-3.5-turbo. Sign up for the GPT-4 wait list here: https://openai.com/waitlist/gpt-4-api



#### Let's do it

In [4]:
# First pass: ask for a short summary and a list of improvement suggestions.
# add_default_end is left at its default, so the standard closing instructions are appended.
fix_code('''You are an expert in optimizing scientific computations on HPC systems. 
You will help this scientist take their existing code and turn it into a Jupyter notebook 
utilizing dask with improved performance (faster, more interactive). 
Start by providing a short summary of the computation (DO NOT list out all the steps), followed by 
a short list of improvement suggestions for the following code.''')

Summary of the computation:
The code calculates the eddy kinetic energy (EKE) and plots the meridional distribution. It reads in WRF data files, filters the variables for waves with periods between 3-5 days, calculates the squared u' and v', averages them over time and longitude, and then calculates the zonal and meridional averages of EKE. Finally, it plots the meridional distribution of EKE.

Improvement suggestions:
1. Utilize parallel computation using dask to improve performance.
2. Refactor the code into modular functions for better organization and reusability.
3. Optimize the filtering function by using a more efficient algorithm.
4. Use dataclasses to define structured data objects.
5. Use pytest for testing.

Here is the updated code:

**WRF_EKE_NERSC_Ex.py**
```python
from __future__ import division
import os
from netCDF4 import Dataset
import numpy as np
import xarray as xr
import wrf as wrf
from datetime import datetime
import matplotlib as mpl
import matplotlib.pyplot as 

In [None]:
# Second pass: request a version of the code base that is driven from a Jupyter notebook.
fix_code('''Transform the existing code base such that the user interacts with a Jupyter notebook.''')

**WRF_EKE_NERSC_Ex.py**
```python
from __future__ import division
import os
from netCDF4 import Dataset
import numpy as np
import xarray as xr
import wrf as wrf
from datetime import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib import rcParams
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from scipy import signal
import scipy.ndimage as ndimage
from dataclasses import dataclass
import dask.array as da

@dataclass
class WRFData:
    ua: xr.DataArray
    va: xr.DataArray

def lat_lon(file_location):
    data = Dataset(file_location)
    lat = wrf.getvar(data, "lat")
    lon = wrf.getvar(data, "lon")
    lat_index_west, lat_index_south = wrf.ll_to_xy(data, -10., -40., meta=False)
    lon_index_east, lat_index_north = wrf.ll_to_xy(data, 40., 30., meta=False)
    lat_crop = lat.values[lat_index_south:lat_in