In [1]:
%load_ext blackcellmagic

In [2]:
import pandas as pd
import numpy as np
import json
import openai
import base64
import os
from langchain import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import datetime

In [3]:
# getting keys
# TODO: remove unnecessary aspects
with open('../config.json') as f:
    keys = json.load(f)
PATH = keys['path']
openai_organization = keys['openai_organization']
openai.organization = openai_organization
openai_api_key = keys['openai_api_key']
openai.api_key = openai_api_key

In [4]:
os.environ["OPENAI_API_KEY"] = openai_api_key

# Variables 

In [5]:
# Get the current date and time
now = datetime.datetime.now()

In [6]:
# Format the date and time as a string in the desired format
timestamp = now.strftime("%Y%m%d%H%M%S")

# Functions

In [7]:
def save_to_txt(name, input_string):
    # Define the file name with the timestamp
    # TODO: add the PATH to the filename
    filename = f"../data/{timestamp}_{name}.txt"
    # Write the string to the file
    with open(filename, "w") as file:
        file.write(input_string)

In [8]:
def create_feedback(code_input):
    # defining the prompt template for a standardized input
    # TO EXPLORE: refine prompt
    feedback_prompt = PromptTemplate(
        input_variables=["code"],
        template="Please review the following code and give five recommendations with detailed explanations how to improve the programming: {code}?",
    )
    
    # initializing the LM
    # TO EXPLORE: adjust temperature
    # TO EXPLORE: test other LMs
    feedback_llm = OpenAI(temperature=0.3)
    
    # a simple chain taking user input, formatting the prompt and sending it to the LM
    feedback_chain = LLMChain(llm=feedback_llm, prompt=feedback_prompt)
    
    # Run the chain only specifying the input variable.
    feedback = feedback_chain.run(code_input)
    
    # saving the input to txt via the prepared function
    save_to_txt(name='feedback', input_string=feedback)
    
    return feedback

In [9]:
def shorten_feedback(feedback):
    # defining the prompt template for a standardized input
    short_feedback_prompt = PromptTemplate(
        input_variables=["feedback"],
        template="Please shorten the aspects of the following feeback: {feedback}?",
    )
    
    # initializing the LM
    # TO OPTIMIZE
    short_feedback_llm = OpenAI(temperature=0)
    
    # a simple chain taking user input, formatting the prompt and sending it to the LM
    short_feedback_chain = LLMChain(llm=short_feedback_llm, prompt=short_feedback_prompt)
    
    # Run the chain only specifying the input variable.
    short_feedback = short_feedback_chain.run(feedback)
    
    # saving the input to txt via the prepared function
    save_to_txt(name='shortfeedback', input_string=short_feedback)
    
    return short_feedback

In [10]:
# TODO: separate into a reading and a merging function and reuse reading function
def pick_shortfeedbacks(directory, n):
    # TODO: use PATH here for directory
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    # Filter the list to only include files with the correct format
    files = [f for f in files if f.endswith("_shortfeedback.txt") and len(f) == 32]
    
    # Sort the list of files by date, with the most recent file first
    files.sort(reverse=True)
    
    # Get the three most recent files
    latest_files = files[:n]
    
    # Read the contents of the two files into string variables
    file_contents = []
    for file in latest_files:
        with open(os.path.join(directory, file), "r") as f:
            file_contents.append(f.read())
            
    # Combine the two file contents into a single string variable
    latest_short_feedbacks = "\n".join(file_contents)
    
    return latest_short_feedbacks

In [15]:
# TODO: improve the template without making it break
def define_goal(latest_short_feedbacks):
    
    # defining the prompt template for a standardized input
    learning_goal_prompt = PromptTemplate(
        input_variables=["short_feedback"],
        template="Summarize the following points: {short_feedback}",
    )
    
    # initializing the LM
    # TODO: check for optimal LLM
    learning_goal_llm = OpenAI(temperature=0.5)
    
    # a simple chain taking user input, formatting the prompt and sending it to the LM
    learning_goal_chain = LLMChain(llm=learning_goal_llm, prompt=learning_goal_prompt)
    
    # Run the chain only specifying the input variable.
    learning_goals = learning_goal_chain.run(latest_short_feedbacks)
    
    # saving the input to txt via the prepared function
    save_to_txt(name='learninggoals', input_string=learning_goals)
    
    return learning_goals

In [22]:
def evaluate_code(directory, code_input):
    # Get a list of all files in the directory
    files = os.listdir(directory)

    # Filter the list to only include files with the correct format
    files = [f for f in files if f.endswith("_learninggoals.txt") and len(f) == 32]

    # Sort the list of files by date, with the most recent file first
    files.sort(reverse=True)

    # Get the most recent file
    latest_file = files[:1]

    with open(os.path.join(directory, latest_file[0]), "r") as f:
        learning_goals = f.read()
    
    # defining the promp template to get both the latest code input and learning_goals
    evaluation_prompt = PromptTemplate(
        input_variables=["code_input", "learning_goals"],
        template="Please compare this code: {code_input} with these learning goals: {learning_goals}. If the programmer considered the learning goals when writing the provided code, say something motivating. If the programmer didn't consider the learning goals, gently remind the person of their learning goals.",
    )
    
    # application of the template
    evaluation_prompt = evaluation_prompt.format(code_input=code_input, learning_goals=learning_goals)
    
    # llm definition
    evaluation_llm = OpenAI(temperature=0.3)
    
    evaluation = evaluation_llm(evaluation_prompt)
    
    return evaluation

# Code review

## Code input

In [8]:
# feature to be added: read from Github

In [18]:
# add code input as text here
code_input = str('''
    # Libraries
    import streamlit as st
    import pandas as pd
    import numpy as np
    import json
    import base64

    # getting variables from config.json
    with open('config/config.json') as f:
        keys = json.load(f)
    PATH = keys['path']


    # Functions
    # better read functions from utils, but not yet working
    def add_bg():
        st.markdown(
        f"""
        <style>
        .stApp {{
            background-image: url(https://gist.githubusercontent.com/kiralenz/8fa216a5ab87e92944129da83d84dd5b/raw/806c89b90ee9c6eaf75f833eb9482c9cbca7dec1/bread_loaf.svg);
            background-size: cover
        }}
        </style>
        """,
        unsafe_allow_html=True
        )

    def add_logo(height):
        st.markdown(
            f"""
            <style>
                [data-testid="stSidebarNav"] {{
                    background-image: url(https://gist.githubusercontent.com/kiralenz/16203a45856cfb596741f24f85e82fbe/raw/c9d93e3336730e77132d40df4eb8d758471bcfd8/keeprising_logo.svg);
                    background-repeat: no-repeat;
                    padding-top: {height - 40}px;
                    background-position: 20px 20px;
                }}
            </style>
            """,
            unsafe_allow_html=True,
        )

    # merging historical activities (df_hist) with latest activity data (df_new) 
    # on the target or shared date column (date_column)
    def add_latest_activity(df_hist, df_new, date_column):
        # Fixing dtypes
        df_hist[date_column] = df_hist[date_column].astype(str)
        df_new[date_column] = df_new[date_column].astype(str)

        # Df merging of historical feedings and latest feeding
        df = pd.concat([df_hist, df_new], ignore_index=True)
        # Fixing dtypes and formatting
        df[date_column] = pd.to_datetime(df[date_column])
        df[date_column] = df[date_column].dt.strftime('%Y-%m-%d')

        return df

    # adding a column with the microbial composition based on the feeding temperature
    def bacteria_column(df, bac_compos):
        df['bacteria_composition'] = np.where(
            df["temperature"] <= 20,
            bac_compos.loc[
                bac_compos["temperature"] == 20, "dominant_microbes"
            ],
            np.where(
                ((df["temperature"] > 20) & (df["temperature"] <= 25)),
                bac_compos.loc[
                    bac_compos["temperature"] == 25, "dominant_microbes"
                ],
                np.where(
                    ((df["temperature"] > 25) & (df["temperature"] <= 30)),
                    bac_compos.loc[
                        bac_compos["temperature"] == 30, "dominant_microbes"
                    ],
                    bac_compos.loc[
                        bac_compos["temperature"] == 35, "dominant_microbes"
                    ],
                ),
            ),
        )
        return df

    # adding two columns for growth rates to a dataframe, one is time normalized
    def growth_rate_cols(df):
        df['growth_rate'] = (
            df['end_height'] / df['initial_height']
        )

        df['growth_rate_per_hour'] = (
            df['end_height'] 
            / df['initial_height'] 
            / df['feeding_time']
        )

        return df


    # Loading data
    feedings = pd.read_parquet(PATH + 'feedings.parquet')
    bacteria_composition = pd.read_parquet(PATH + 'bacteria_composition.parquet')


    # streamlit page
    st.set_page_config(page_title="Keeprising")
    add_bg()  
    add_logo(height=160)
    st.title('How was your last feeding?') 


    # Adding new feeding data
    # user input for feeding
    date_today = st.date_input('Feeding date')
    temperature_today = st.number_input('Temperature')
    feeding_time_today = st.number_input('Feeding duration')
    initial_height_today = st.number_input('Intial height')
    end_height_today = st.number_input('End height')
    bubble_size_today = st.number_input('Bubble size')

    # error handling for invalid input
    if temperature_today < 0 or feeding_time_today < 0 or initial_height_today < 0 or end_height_today < 0 or end_height_today < initial_height_today:
        st.error('Invalid input! Please enter valid values for all feeding data. IF these had been your actual values consider immediately repeating the feeding to save your starter!')
    else:
        # storing latest information in a df
        latest_feeding = pd.DataFrame(data={
            'feeding_date':date_today, 
            'temperature':temperature_today,
            'feeding_time':feeding_time_today,
            'initial_height':initial_height_today,
            'end_height':end_height_today,
            'bubble_size':bubble_size_today
        }, index=[0])

        # merging new feeding to history of feedings
        feedings = add_latest_activity(df_hist=feedings, df_new=latest_feeding, date_column='feeding_date')

        # saving df to local file
        feedings.to_parquet(PATH + 'feedings.parquet')

        # application display of latest feedings
        st.dataframe(feedings.tail())
        st.write("Nice job! Well done!")


        # Data processing
        feedings_processed = feedings.copy()
        # Bacteria composition depending on temperature
        feedings_processed = bacteria_column(df=feedings_processed, bac_compos=bacteria_composition)
        # Growth rate composition
        feedings_processed = growth_rate_cols(df=feedings_processed)


        # Storing data
        feedings_processed.to_parquet(PATH + 'feedings_processed.parquet')" 
        '''
)

In [19]:
# saving the input to txt via the prepared function
save_to_txt(name='codeinput', input_string=code_input)

## Feedback

In [20]:
%%time
feedback = create_feedback(code_input=code_input)

CPU times: user 31.8 ms, sys: 5.06 ms, total: 36.8 ms
Wall time: 14.3 s


In [25]:
feedback

'\n\n1. To improve readability, the code should be broken down into smaller, more manageable functions. For example, the “add_latest_activity” function can be broken down into two separate functions, one to fix the data types and one to merge the data frames. \n\n2. To improve readability, the code should be organized into sections with clear headings. For example, the code can be organized into sections for “Libraries”, “Functions”, “Loading Data”, “Streamlit Page”, “Adding New Feeding Data”, and “Data Processing”. \n\n3. To improve readability, the code should use more descriptive variable names. For example, instead of using “df” as a variable name, a more descriptive name such as “data_frame” should be used. \n\n4. To improve readability, the code should use more consistent indentation and spacing. For example, the lines of code within the “add_latest_activity” function should all be indented by the same amount. \n\n5. To improve readability, the code should use more'

In [None]:
# stop the program if the created output is empty
assert len(feedback) != 0, 'The created feedback was empty'

# Learning goals 

## Shorten review for learning target

In [19]:
%%time
short_feedback = shorten_feedback(feedback=feedback)

CPU times: user 11.3 ms, sys: 3.53 ms, total: 14.8 ms
Wall time: 3.5 s


In [None]:
# stop the program if the created output is empty
assert len(short_feedback) != 0, 'The created shortfeedback was empty'

## Set learning goals

### prep: get latest shortfeedbacks

In [11]:
# This has to be done once and then only if the user checks a button like "generate learning targets".

In [12]:
button = True

In [13]:
%%time
if button:
    # get latest short feedbacks
    latest_short_feedbacks = pick_shortfeedbacks(directory='../data', n=3)
    # define goal from latest short feedbacks
    learning_goals = define_goal(latest_short_feedbacks=latest_short_feedbacks)

In [17]:
learning_goals

'\n6. Use descriptive variable names.\n7. Use consistent indentation.\n8. Use linter to check for errors.\n9. Use comments to explain code.\n\nOrganize code into separate functions, use descriptive variable names and consistent indentation, and use comments and a linter to check for errors.'

In [None]:
# stop the program if the created output is empty
assert len(learning_goals) != 0, 'The created learning goal was empty'

## Compare learning goals and latest submitted code

In [23]:
%%time
evaluation = evaluate_code(directory='../data', code_input=code_input)

CPU times: user 11 ms, sys: 3.05 ms, total: 14 ms
Wall time: 2.91 s


In [24]:
evaluation

'\n\nThe programmer has considered the learning goals when writing the provided code. The code is organized into separate functions, descriptive variable names are used, consistent indentation is used, and comments are used to explain the code. Great job!'

# TO INCLUDE

### one extensive function

In [16]:
%%time
# somehow doesn't work
learning_goals = define_goal(latest_short_feedbacks=latest_short_feedbacks)

CPU times: user 53.1 ms, sys: 11.6 ms, total: 64.7 ms
Wall time: 693 ms


In [17]:
learning_goals

''

### extensive step by step

### short input step by step