# Part 1: Pandas code generation helper.

In [None]:
import pandas as pd
import numpy as np
import requests

In [None]:
# This calls a model finetuned on pandas. All data is encrypted and nothing is saved.
class Framer:
    """Callable client for a remote pandas code-generation endpoint.

    Calling an instance posts a textual summary of a DataFrame (its dtypes,
    its column index and its first rows) together with a natural-language
    objective, and returns the ``"result"`` field of the JSON reply.

    NOTE(review): ``str(df.head())`` contains real cell values, so the first
    rows of the DataFrame are sent to the remote service -- confirm this is
    acceptable before using it on sensitive data.
    """

    DEFAULT_URL = "https://gptapi-production.up.railway.app/api/v1/finetuned/pandas"
    # Seconds to wait for the service; without a timeout ``requests`` can
    # block forever on an unresponsive server.
    DEFAULT_TIMEOUT = 60

    def __init__(self, url=DEFAULT_URL, timeout=DEFAULT_TIMEOUT):
        """Store the endpoint URL and the request timeout (in seconds)."""
        self.url = url
        self.timeout = timeout

    def __call__(self, df, objective):
        """Send a summary of *df* plus *objective* and return the reply.

        Args:
            df: DataFrame to describe; only ``dtypes``, ``columns`` and
                ``head()`` are read, each converted with ``str``.
            objective: Natural-language description of what is wanted.

        Returns:
            The value stored under ``"result"`` in the JSON response.

        Raises:
            requests.HTTPError: If the service answers with an error status.
            requests.Timeout: If no answer arrives within ``self.timeout``.
        """
        payload = {
            "dtypes": str(df.dtypes),
            "columns": str(df.columns),
            "head": str(df.head()),
            "objective": objective,
        }
        res = requests.post(self.url, json=payload, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        res.raise_for_status()
        return res.json()["result"]

In [None]:
# Create the client instance that the following cells call.
framer = Framer()

In [None]:
# Build a demo dataframe: one row per calendar day, each with an integer key.
start_date = '2022-01-01'
end_date = '2022-12-31'

# Daily timestamps from start_date through end_date
dates = pd.date_range(start_date, end_date, freq='D')

# One key per date: draw from a normal distribution (mean 50, std 15),
# clamp the values to the range 0-100, round to the nearest integer and
# convert to int.
ids = (
    np.random.normal(50, 15, len(dates))
    .clip(0, 100)
    .round()
    .astype(int)
)

# Combine both columns into the frame used by the rest of the notebook
df = pd.DataFrame({'date': dates, 'key': ids})


In [None]:
# Preview the first rows of the generated dataframe.
df.head()

In [None]:
# Send the dataframe summary and a counting question to the remote endpoint.
framer(df, "how many times does each key appear?")

In [None]:
# Local pandas answer to the same question: number of rows for each key.
df.groupby("key").size()

In [None]:
# Ask the remote endpoint for the first and last date of one specific key.
# NOTE(review): the keys are random, so key 28 may be absent from a given run.
framer(df, "What is the earliest and latest date key 28 appears?")

In [None]:
# Select the dates on which key 28 occurs, then print the earliest and the
# latest of them.
key_28_dates = df.loc[df["key"] == 28, "date"]
print(key_28_dates.min())
print(key_28_dates.max())

In [None]:
# Ask the remote endpoint for code that builds a per-key summary frame
# (first date, last date and number of occurrences).
framer(df, "Create a dataframe called existence where the columns are key, min, max, count. Min is the first date which the key appeared and max is the last date the key appeared. Count is the amount of times that key appeared in the entire dataframe.")

In [None]:
# Per-key earliest date, latest date and row count.
# Note: grouping makes `key` the index of `existence`, not a column; append
# .reset_index() if `key` is needed as a regular column.
existence = df.groupby(['key'])['date'].agg(['min', 'max', 'count'])

In [None]:
# Preview the first rows of the per-key summary.
existence.head()

# Part 2: Running the base code generation model locally.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
# Load the causal language model and its tokenizer from local directories.
# Ensure you follow the steps in the readme, otherwise this step will not work.
# NOTE(review): the checkpoint path suggests a 16B-parameter model, which
# presumably needs a large amount of memory -- confirm hardware requirements.
model = AutoModelForCausalLM.from_pretrained("./checkpoints/codegen-16B-mono")
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

In [None]:
def predict(objective, max_length=128):
    """Generate text for *objective* with the locally loaded model.

    Args:
        objective: Prompt text passed to the tokenizer.
        max_length: Value forwarded to ``model.generate``. It bounds the
            total sequence length (prompt tokens plus generated tokens), so
            longer prompts need a larger value. Defaults to 128, the value
            that was previously hard-coded.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    encoded = tokenizer(objective, return_tensors="pt")
    generated_ids = model.generate(
        encoded.input_ids,
        # Pass the tokenizer's mask explicitly (None if it did not return
        # one) so generate() does not have to guess which tokens are padding.
        attention_mask=encoded.get("attention_mask"),
        max_length=max_length,
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
# Try the local model on a sample prompt and print the generated text.
print(predict("Get the unique lines in this pandas dataframe"))