In [1]:
import os
import time
import datetime
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as pyo
import datetime
import time
import subprocess
from credentials import API_KEY

### First, we define the path of the dataset file and the batch size
### The batch size can be tuned to match how often new rows are appended to the world_bank_preprocessed.csv file.
### With a batch size of 1000, the pipeline function is called each time 1000 new entries have been added to the dataset

In [2]:
# CSV file whose row count is polled by the watcher started in server().
filepath = "world_bank_preprocessed.csv"
# Number of newly added rows required before the pipeline is triggered.
batch_size = 1000

### Defining a function that preprocesses the data

In [3]:
def preprocess():
    """Execute ``preprocess.ipynb`` through ``jupyter nbconvert``.

    Raises:
        subprocess.CalledProcessError: if the nbconvert command exits with a
            non-zero status.
    """
    nbconvert_cmd = [
        'jupyter', 'nbconvert',
        '--to', 'notebook',
        '--execute', 'preprocess.ipynb',
    ]
    subprocess.check_call(nbconvert_cmd)
    print("Preprocessing stage completed!")

### Defining the function that updates the prompt-completion pairs

In [4]:
def prompt_completion():
    """Regenerate the prompt-completion pairs by running ``generator.py``.

    Raises:
        subprocess.CalledProcessError: if ``generator.py`` exits with a
            non-zero status.
    """
    # check=True so a failing generator aborts the pipeline instead of being
    # silently ignored and reported as a completed stage.
    subprocess.run(['python', 'generator.py'], check=True)
    print("Prompt Completion stage completed!")

### Defining a function that retrains the model

In [5]:
# Files handed to retrain(): the first goes to `fine_tunes.prepare_data -f`,
# the second to `fine_tunes.create -t`.
input_file_path, output_file_path = (
    "prompt_completion_pairs.json",
    "prompt_completion_pairs_prepared.jsonl",
)

In [6]:
# Accuracy values recorded by retrain(), one entry per completed call.
accuracy = []
# Time at which the matching entry in `accuracy` was recorded.
timestamp = []


def retrain(api_key, input_file_path, output_file_path):
    """Prepare the training data, launch a fine-tune job and record its accuracy.

    Runs ``openai tools fine_tunes.prepare_data`` on ``input_file_path`` and
    then ``openai api fine_tunes.create`` on ``output_file_path`` with the
    ``curie`` base model. The value parsed from the second command's output
    is appended to the module-level ``accuracy`` list, and the current time
    to ``timestamp``.

    Args:
        api_key: Value exported as ``OPENAI_API_KEY`` for the CLI calls. It is
            written to ``os.environ`` and stays set after the call.
        input_file_path: File passed to ``fine_tunes.prepare_data -f``.
        output_file_path: File passed to ``fine_tunes.create -t``. It is not
            derived from ``input_file_path``; the caller supplies both.

    Raises:
        subprocess.CalledProcessError: if either CLI command exits non-zero.
        IndexError, ValueError: if the captured output does not have the
            layout the parser expects (see the note in the body).
    """
    os.environ['OPENAI_API_KEY'] = api_key

    # Build argv lists directly rather than splitting a formatted string, so
    # a path containing whitespace is still passed as a single argument.
    prepare_cmd = [
        'openai', 'tools', 'fine_tunes.prepare_data',
        '-f', str(input_file_path),
    ]
    # NOTE(review): prepare_data may prompt for confirmation on stdin; confirm
    # whether a non-interactive flag is needed when run from this pipeline.
    subprocess.run(prepare_cmd, check=True)

    create_cmd = [
        'openai', 'api', 'fine_tunes.create',
        '-t', str(output_file_path),
        '-m', 'curie',
    ]
    result = subprocess.run(create_cmd, check=True, capture_output=True)

    # NOTE(review): this assumes the third stdout line is "<label>: <float>".
    # Verify against the real CLI output; any other layout raises here and
    # nothing is recorded for this run.
    model_accuracy = float(result.stdout.decode().split('\n')[2].split(': ')[1])

    accuracy.append(model_accuracy)
    timestamp.append(datetime.datetime.now())

    print("Retraining stage completed!")


### Defining the function to deploy

In [7]:
def deploy():
    """Placeholder deployment stage: it only reports completion."""
    print("Deployment stage completed!")

### Defining the pipeline: a function that is triggered each time 1000 new entries are added to the dataset

In [8]:
def pipeline():
    """Run one full pass of the model pipeline.

    Stages, in order:
      1. ``preprocess()``: preprocess the data.
      2. A 10-second pause.
      3. ``prompt_completion()``: update the prompt-completion pairs.
      4. ``retrain(...)``: retrain the model using the module-level
         ``API_KEY``, ``input_file_path`` and ``output_file_path``.
      5. ``deploy()``: deployment stage (not necessary as of now).
    """
    print("Pipeline function triggered!")

    preprocess()

    # Pause so the real-time data coming in is easy to keep track of.
    time.sleep(10)

    prompt_completion()
    retrain(
        api_key=API_KEY,
        input_file_path=input_file_path,
        output_file_path=output_file_path,
    )
    deploy()

In [9]:
def _count_rows(filepath):
    """Return the number of data rows currently in the CSV at ``filepath``."""
    return pd.read_csv(filepath).shape[0]


def _write_accuracy_report(filename="Real-Time-Model-Analysis.html"):
    """Write the accuracy-over-time chart to ``filename`` as an HTML file."""
    trace = go.Scatter(
        # Plot what retrain() recorded; copies keep the figure independent of
        # later appends to the module-level lists.
        x=list(timestamp),
        y=list(accuracy),
        # Markers keep a single recorded point visible; a lone point has no
        # line segment to draw.
        mode='lines+markers',
        name='Model Analysis',
    )
    layout = go.Layout(
        title='Model Analysis',
        xaxis=dict(title='Timestamp'),
        yaxis=dict(title='Model Accuracy'),
    )
    fig = go.Figure(data=[trace], layout=layout)
    # One write to the named report only; a second plot() call without a
    # filename would also emit plotly's default temp-plot.html.
    pyo.plot(fig, filename=filename, auto_open=False)


def watch_csv_file(filepath, batch_size, poll_interval=1):
    """Poll a CSV file and run the pipeline whenever enough rows were added.

    The row count is sampled once per ``poll_interval`` seconds. When it has
    changed and is at least ``batch_size`` above the last baseline,
    ``pipeline()`` is run, the accuracy report is rewritten, and the baseline
    moves to the row count that triggered the run.

    This function never returns normally; it loops until an exception (for
    example ``KeyboardInterrupt``) propagates out.

    Args:
        filepath: Path of the CSV file to watch.
        batch_size: Number of new rows required to trigger the pipeline.
        poll_interval: Seconds to sleep between polls (default 1).
    """
    baseline_rows = _count_rows(filepath)

    while True:
        # Always sleep between polls. Without this, a file that has grown by
        # fewer than batch_size rows would be re-parsed in a tight loop.
        time.sleep(poll_interval)

        # Read the file once per poll and reuse the count for every check.
        current_rows = _count_rows(filepath)

        # Nothing to do until the count has changed and reached the threshold.
        # The equality check keeps a non-positive batch_size from firing on an
        # unchanged file.
        if current_rows == baseline_rows or current_rows < baseline_rows + batch_size:
            continue

        pipeline()
        _write_accuracy_report()

        # Rows appended while the pipeline was running count towards the next
        # batch, because the baseline is the count sampled before the run.
        baseline_rows = current_rows

In [10]:
def server():
    """Run the CSV watcher on the configured file until interrupted.

    Uses the module-level ``filepath`` and ``batch_size``. A
    ``KeyboardInterrupt`` is caught and reported instead of propagating.
    """
    try:
        watch_csv_file(filepath=filepath, batch_size=batch_size)
    except KeyboardInterrupt:
        print("\nServer stopped!")

In [11]:
# Start the watcher. watch_csv_file() loops forever, so this call only returns
# once a KeyboardInterrupt is raised and caught inside server().
server()

Pipeline function triggered!
Preprocessing stage completed!
Prompt Completion stage completed!
Deployment stage completed!

Server stopped!
