In [None]:
#| default_exp ai_ass_label_1

In [None]:
#| export
import os
import pandas as pd
import openai
import random
import re

In [None]:
#| export 
def apply_color(val):
    """Return the CSS ``background-color`` rule for a confidence value.

    Args:
        val: Anything ``float()`` accepts (number or numeric string).

    Returns:
        ``'background-color: green'`` when the value is above 0.7,
        ``'background-color: red'`` when it is below 0.3, and
        ``'background-color: yellow'`` otherwise (the boundaries 0.3 and 0.7
        themselves fall in the yellow band).
    """
    score = float(val)

    # Start from the middle band and override only for the two extremes.
    shade = "yellow"
    if score > 0.7:
        shade = 'green'
    elif score < 0.3:
        shade = 'red'

    return f'background-color: {shade}'

In [None]:
# #| export

# def generate_prompt(df):
#     start_prompt = "Given a list of sentences, label each sentence as either 'OTR', 'PRS', 'REP', or 'NEU'. 'OTR' stands for 'An opportunity to respond' (e.g., 'What color is this candy?'), 'PRS' stands for 'Praise' (e.g., 'Great job, Morgan!'), 'REP' stands for 'Reprimand' (e.g., 'You need to keep quiet while someone else is reading.) and 'NEU' stands for 'None of the above': "
#     sentences = df['Text'].tolist()
#     # list each prompt with a number
#     test_prompt = '\n'.join(f"{i+1}.{sentence}" for i, sentence in enumerate(sentences))
#     prompt = f"{start_prompt}\n\n{test_prompt}"
#     return prompt


In [None]:
# #| export

# def inference_by_openai(prompt, engine='gpt-4-1106-preview', max_tokens=12000):
#     openai.api_key = os.getenv['OPENAI_API_KEY']
#     response = openai.Completion.create(
#         engine=engine,
#         prompt=prompt,
#         max_tokens=max_tokens,
#     )
#     return response

### Function classify_unlabeled_texts is designed to classify texts in a pandas DataFrame using OpenAI's GPT-3.5 model via the chat API

1. Function Definition:
* df: The pandas DataFrame containing the text data.
* text_column: The column in df that contains the texts to be classified.
* label_column: The column where the classification labels will be stored.
2. Preparing Labeled Examples:
* The function first identifies rows in df that already have labels (non-null values in label_column). These rows are used as examples to guide the AI model in classifying unlabeled texts.
3. Setting Up OpenAI API:
* The API key for accessing OpenAI's service is set up using openai.api_key = os.getenv('OPENAI_API_KEY'). This assumes that the API key is stored in an environment variable.
4. Classification Process:
* The function iterates over rows in df that don't have a label (label_column is null). For each of these rows, it builds a set of messages to send to the OpenAI API.
* A standard message explaining the classification task and the possible labels ('OTR', 'PRS', 'REP', 'NEU') is added first.
* It then adds the labeled examples previously identified to provide context to the AI model. This is a form of few-shot learning, helping the model understand the classification task.
* The text to be classified is added as the final message.
5. API Request:
* For each unlabeled text, a request is made to OpenAI's ChatCompletion API using the constructed messages. The API is expected to return a classification label based on the context provided.
6. Label Assignment:
* The response from the API, presumably a classification label, is extracted and assigned to the corresponding row in the DataFrame.
7. Returning Updated DataFrame:
* After processing all rows needing classification, the function returns the updated DataFrame with newly assigned labels.

In [None]:
#| export
def classify_unlabeled_texts(df, text_column, label_column):
    """Label every unlabeled row of ``df`` using one chat request per row.

    Rows whose ``label_column`` is already filled are sent as few-shot
    examples; each row whose ``label_column`` is null is then classified
    individually and the result is written back into ``df`` in place.

    Args:
        df: DataFrame holding the texts and (partially filled) labels.
        text_column: Name of the column containing the text to classify.
        label_column: Name of the column that receives the labels.

    Returns:
        The same ``df`` object, with the previously null labels filled in.
    """
    unlabeled = df[df[label_column].isna()]
    if unlabeled.empty:
        # Nothing to classify: skip the API key lookup and any network call.
        return df

    labeled_examples = df[df[label_column].notna()]
    openai.api_key = os.getenv('OPENAI_API_KEY')

    # The task description and few-shot examples are identical for every
    # request, so build them once instead of once per unlabeled row.
    prompt_prefix = [{"role": "system", "content": "The following are examples of texts and their classifications, the label is either 'OTR', 'PRS', 'REP', or 'NEU'. 'OTR' stands for 'An opportunity to respond' (e.g., 'What color is this candy?'), 'PRS' stands for 'Praise' (e.g., 'Great job, Morgan!'), 'REP' stands for 'Reprimand' (e.g., 'You need to keep quiet while someone else is reading.) and 'NEU' stands for 'None of the above':"}]
    prompt_prefix.extend(
        {"role": "user", "content": f"Text: {example_text}\nLabel: {example_label}"}
        for example_text, example_label in zip(
            labeled_examples[text_column], labeled_examples[label_column]
        )
    )

    # Same label pattern as classify_unlabeled_texts_all, so a reply such as
    # "Label: PRS" is stored as plain "PRS".
    label_pattern = re.compile(r"(OTR|PRS|REP|NEU)")

    for index, text_to_classify in unlabeled[text_column].items():
        messages = prompt_prefix + [
            {"role": "user", "content": f"Text: {text_to_classify}\nLabel:"}
        ]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )

        reply = response.choices[0].message['content'].strip()
        found = label_pattern.search(reply)
        # Fall back to the raw reply when no known label is present, so the
        # unexpected answer stays visible to whoever reviews the output.
        df.at[index, label_column] = found.group(1) if found else reply

    return df

### The function classify_unlabeled_texts_all is designed to automatically classify all unlabeled texts in a pandas DataFrame with a single request to OpenAI's GPT-3.5 model. 

1. Function Definition: 
* The function classify_unlabeled_texts_all takes a DataFrame df, and two strings text_column and label_column as inputs. text_column is the name of the DataFrame column containing the text to be classified, and label_column is the name of the column where the classification labels will be stored.
2. Filter Labeled Examples: 
* The function first filters the DataFrame to get examples that already have labels. These labeled examples are used for few-shot learning, providing context to the model about the nature of the classification task.
3. API Key: 
* The line openai.api_key = os.getenv('OPENAI_API_KEY') sets the API key for OpenAI, which is assumed to be stored in an environment variable.
4. Classification Loop: 
* The function then iterates over the rows of the DataFrame that need classification (those with no label in label_column) and collects their texts into one combined message.
5. Few-Shot Learning Preparation: 
* The function constructs one series of messages for the whole batch. These include a description of the task and the labeled examples from the dataset. This is an implementation of few-shot learning, where the model is provided with examples of the task to improve its performance.
6. Text Classification Request: 
* A single request is made to the OpenAI ChatCompletion API for all texts needing classification. The request includes the series of messages as context, followed by the texts to be classified.
7. Label Assignment: 
* The labels ('OTR', 'PRS', 'REP', 'NEU') are extracted from the API response with a regular expression and assigned, in order, to the rows of the DataFrame that had no label.
8. Return Updated DataFrame: 
* Once all necessary rows are classified, the updated DataFrame is returned.

In [None]:
#| export
def classify_unlabeled_texts_all(df, text_column, label_column):
    """Label all unlabeled rows of ``df`` with a single chat request.

    Already-labeled rows are sent as few-shot examples, followed by every
    unlabeled text in one message. The labels found in the reply are written
    back, in order, to the rows whose ``label_column`` was null.

    Args:
        df: DataFrame holding the texts and (partially filled) labels.
        text_column: Name of the column containing the text to classify.
        label_column: Name of the column that receives the labels.

    Returns:
        The same ``df`` object, updated in place. If the reply contains fewer
        labels than there are unlabeled rows, the remaining rows are left
        unlabeled; surplus labels are ignored.
    """
    unlabeled = df[df[label_column].isna()]
    if unlabeled.empty:
        # Nothing to classify: skip the API key lookup and the network call.
        return df

    labeled_examples = df[df[label_column].notna()]
    openai.api_key = os.getenv('OPENAI_API_KEY')

    messages = [{"role": "user", "content": "The following are examples of texts and their classifications, the label is either 'OTR', 'PRS', 'REP', or 'NEU'. 'OTR' stands for 'An opportunity to respond' (e.g., 'What color is this candy?'), 'PRS' stands for 'Praise' (e.g., 'Great job, Morgan!'), 'REP' stands for 'Reprimand' (e.g., 'You need to keep quiet while someone else is reading.) and 'NEU' stands for 'None of the above':"}]

    # Few-shot examples: one text per line, then the matching labels one per
    # line in the same order. Each header appears exactly once.
    if not labeled_examples.empty:
        example_texts = "\n".join(str(text) for text in labeled_examples[text_column])
        example_labels = "\n".join(str(label) for label in labeled_examples[label_column])
        messages.append({
            "role": "user",
            "content": f"Texts:\n{example_texts}\n\nLabels:\n{example_labels}",
        })

    # The texts to classify use the same layout, ending on an open "Labels:".
    pending_texts = "\n".join(str(text) for text in unlabeled[text_column])
    messages.append({"role": "user", "content": f"Texts:\n{pending_texts}\n\nLabels:"})

    # Make the request to OpenAI's chat API
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    contents = response.choices[0].message['content'].strip()
    # Pull every 'OTR', 'PRS', 'REP' or 'NEU' out of the reply, in order.
    classified_labels = re.findall(r"(OTR|PRS|REP|NEU)", contents)

    # zip stops at the shorter sequence, so a short reply cannot raise
    # part-way through the update.
    for index, label in zip(unlabeled.index, classified_labels):
        df.at[index, label_column] = label

    return df

### The function ai_assisted_labeling is designed to process a CSV file for AI-assisted text classification. 

1. Function Definition:
* uploaded_file: The function takes an uploaded file (presumably a CSV file) as input.
2. Reading the CSV File:
* The CSV file is read into a pandas DataFrame (df) using pd.read_csv. It is assumed that the file has a column named 'Text' and another named 'Label'.
3. Data Preprocessing:
* The DataFrame is filtered to retain only the 'Text' and 'Label' columns.
4. Random Confidence Scores:
* A new column 'Confidence' is added to df. Initially, each row is assigned a random confidence score between 0 and 1. This is a placeholder for an actual confidence score that would typically be provided by an AI model.
5. Setting Confidence to 1 for Valid Labels:
* The confidence score is set to 1 for rows where the 'Label' is one of the predefined valid labels ('PRS', 'OTR', 'REP', 'NEU'). This implies full confidence in the labels that are already present and are valid.
6. Styling the DataFrame:
* The 'Confidence' column is colored with the module-level apply_color function, applied through the DataFrame's Styler, so that rows can be visually differentiated by confidence score.
7. Classifying Unlabeled Texts:
* The classify_unlabeled_texts function (defined earlier) is called to classify any unlabeled texts in the DataFrame. This function uses OpenAI's model to assign labels to texts without existing labels.
8. Saving the Processed DataFrame:
* The processed DataFrame is saved to a new CSV file. The file name is derived from the original file's name with '_AI_assisted' appended before the file extension.
9. Return Values:
* style_df.to_html(): The HTML representation of the styled DataFrame, in which the 'Confidence' column is colored by the apply_color function (green above 0.7, red below 0.3, yellow otherwise).
* new_filename: The name of the new CSV file containing the processed data.

In [None]:
#| export
def ai_assisted_labeling(uploaded_file):
    """Run AI-assisted labeling on an uploaded CSV and save the result.

    The CSV must contain 'Text' and 'Label' columns. Rows without a label are
    classified with ``classify_unlabeled_texts``; the labeled data is written
    next to the input file with ``_AI_assisted`` inserted before the
    extension.

    Args:
        uploaded_file: Object whose ``name`` attribute is the path of the CSV
            file to process.

    Returns:
        A tuple ``(html, new_filename)``: the styled DataFrame rendered as
        HTML, and the path of the CSV file that was written.
    """
    source_path = uploaded_file.name
    df = pd.read_csv(source_path, encoding='utf-8')

    # Work on an independent copy of the two relevant columns so the column
    # assignments below never write into a view of the frame read from disk.
    df = df[['Text', 'Label']].copy()

    # Placeholder confidence: a random score in [0, 1) for every row ...
    df['Confidence'] = [round(random.random(), 2) for _ in range(len(df))]

    # ... overridden with 1 where the file already carries a valid label.
    df.loc[df['Label'].isin(['PRS', 'OTR', 'REP', 'NEU']), 'Confidence'] = 1

    df = classify_unlabeled_texts(df, 'Text', 'Label')

    # Build the styler after classification so the rendered HTML reflects the
    # final labels without relying on the styler sharing the mutated frame.
    styler = df.style
    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever
    # the installed version provides.
    colorize = getattr(styler, 'map', None) or styler.applymap
    style_df = colorize(apply_color, subset=['Confidence'])

    # splitext strips only the final extension, so dots elsewhere in the path
    # (e.g. './data.csv' or '/tmp/run.1/data.csv') no longer truncate it.
    path_root, _ = os.path.splitext(source_path)
    new_filename = path_root + '_AI_assisted.csv'
    df.to_csv(new_filename, index=False)

    return style_df.to_html(), new_filename