In [1]:
import os
import pandas as pd
import data_processor as dp
import importlib

In [8]:
# Re-import data_processor so that edits made to the module while working in
# this notebook take effect without restarting the kernel.
# The trailing semicolon suppresses the echoed module repr, which would
# otherwise store an absolute local filesystem path in the saved cell output.
importlib.reload(dp);

<module 'data_processor' from '/home/haskelt/openai-data-tools/data_processor.py'>

## Bare dates

In [2]:
# Single worked example for the bare-date task: a raw date string ('item')
# paired with its expected YYYY-MM-DD conversion ('target').
examples = [
    dict(item='May 5, 1985', target='1985-05-05'),
]

In [3]:
# Load the bare-date test cases. Every column is read as a string, and the
# default NaN markers are disabled so cells such as 'NA' or '' stay as literal
# strings instead of being parsed as missing values.
dates = pd.read_csv(
    'dates.csv',
    dtype=str,
    keep_default_na=False,
)

In [4]:
# Display the loaded test cases: the raw 'item' strings next to their expected 'target' values.
dates

Unnamed: 0,item,target
0,"Dec 12, 1995",1995-12-12
1,23 March 1974,1974-03-23
2,February 14th 2010,2010-02-14
3,7/9/80,1980-07-09
4,14/19/93,
5,2/15,
6,"13th of April, 2016",2016-04-13
7,"Twenty-fifth of August, 1955",1955-08-25
8,"October 12, nineteen eighty-six",1986-10-12


In [9]:
# Processor for the bare-date task. The API key is read from the
# OPENAI_API_KEY environment variable (os.getenv yields None when it is unset).
# NOTE(review): the unit of `timeout` is not visible here; presumably seconds.
bare_date_instructions = "You will be provided with dates in various formats. For each date, convert it to the format YYYY-MM-DD. If it is not a valid date, return 'NA'."

processor1 = dp.DataProcessor(
    api_key=os.getenv("OPENAI_API_KEY"),
    model='gpt-3.5-turbo',
    timeout=10,
    instructions=bare_date_instructions,
    examples=examples,
)

In [None]:
# Run processor1 over the raw date strings in the 'item' column.
# NOTE(review): mode='live' presumably sends real API requests; confirm in data_processor.
processor1.process(dates['item'], mode='live')

Progress: 44%

In [None]:
# Score processor1 against the expected values in the 'target' column.
# NOTE(review): presumably compares against the outputs of the preceding process() call; confirm in data_processor.
processor1.score(dates['target'])

## Dates embedded in text

In [None]:
# Load the sentences-with-dates test cases. Every column is read as a string,
# and the default NaN markers are disabled so cells such as 'NA' stay as
# literal strings instead of being parsed as missing values.
dates_in_text = pd.read_csv(
    'dates_in_text.csv',
    dtype=str,
    keep_default_na=False,
)

In [None]:
# Display the loaded sentences for inspection.
dates_in_text

In [None]:
# Few-shot examples for the dates-in-text task. Each entry pairs an input
# sentence ('item') with the expected response ('target'): the sentence with
# its date rewritten as YYYY-MM-DD, or 'NA' when the date is not valid.
# The third entry previously had a duplicate 'item' key and had its input and
# expected output swapped; the raw sentence is the item, the converted one the
# target.
examples = [
    {'item': 'The date on the contract is 13/45/1991',
     'target': 'NA'},
    {'item': 'On June 15th I have a dentist appointment',
     'target': 'NA'},
    {'item': 'Be it known that on this fifteenth day of March, 2021 this resolution was duly passed',
     'target': 'Be it known that on 2021-03-15 this resolution was duly passed'}
]

In [None]:
# Processor for the dates-in-text task: the model should return each sentence
# with its date rewritten as YYYY-MM-DD, or 'NA' when the date is invalid.
# NOTE(review): this cell used to reference dp.GPTDataProcessor, while the
# constructor cell executed after the module reload uses dp.DataProcessor.
# The old name looks stale, so it is aligned here; confirm against
# data_processor.py.
processor2 = dp.DataProcessor(
    api_key=os.getenv("OPENAI_API_KEY"),
    model='gpt-3.5-turbo',
    instructions="You will be provided with sentences that contain dates. For each date, convert it to the format YYYY-MM-DD, and return the sentence with the converted date. If the date is not a valid date, return 'NA' instead of the sentence.",
    examples=examples
)

In [None]:
# Run processor2 over the sentences in the 'item' column.
# NOTE(review): mode='live' presumably sends real API requests; confirm in data_processor.
processor2.process(dates_in_text['item'], mode='live')

In [None]:
# Score processor2 against the expected sentences in the 'target' column.
# NOTE(review): presumably compares against the outputs of the preceding process() call; confirm in data_processor.
processor2.score(dates_in_text['target'])

In [None]:
# Side-by-side view of the expected sentences and what processor2 produced.
# NOTE(review): this reads the processor's private `_data` attribute; switch to
# a public accessor if data_processor offers one.
comparison_columns = {
    'target': dates_in_text['target'],
    'response': processor2._data['output'],
}
pd.DataFrame(comparison_columns)

## Self-described gender

In [None]:
# Load the survey export. Every column is read as a string, and the default
# NaN markers are disabled so free-text answers such as 'NA' or '' stay as
# literal strings instead of being parsed as missing values.
raw_gender_data = pd.read_csv(
    'Combined-Calc-Data.csv',
    dtype=str,
    keep_default_na=False,
)

In [None]:
# Preview only the first rows: this frame holds survey responses, so avoid
# dumping the whole table into the saved notebook output.
raw_gender_data.head()

In [None]:
# Processor for classifying self-described gender phrases into 'female',
# 'male', 'nonbinary', or 'other'. No few-shot examples are supplied.
# NOTE(review): this cell used to reference dp.GPTDataProcessor, while the
# constructor cell executed after the module reload uses dp.DataProcessor.
# The old name looks stale, so it is aligned here; confirm against
# data_processor.py.
processor3 = dp.DataProcessor(
    api_key=os.getenv("OPENAI_API_KEY"),
    model='gpt-3.5-turbo',
    instructions="You will be provided with phrases describing someone's gender. For each phrase, classify it as 'female', 'male', 'nonbinary', or 'other'."
)

In [None]:
# Classify every response in the 'gender' column with processor3.
# NOTE(review): no mode= argument is passed here, unlike the process(..., mode='live') calls in the date sections; confirm the default mode is the intended one.
processor3.process(raw_gender_data['gender'])

In [None]:
# Persist processor3 through its dump() method under 'objects/genderclass'.
# NOTE(review): what gets written, and whether the 'objects' directory must already exist, is not visible here; confirm in data_processor.
processor3.dump('objects/genderclass')

In [None]:
# Alternative prompt for the gender-classification task: it frames the request
# as a survey-coding job and uses the label 'non-binary' (hyphenated). No
# few-shot examples are supplied.
# NOTE(review): this cell used to reference dp.GPTDataProcessor, while the
# constructor cell executed after the module reload uses dp.DataProcessor.
# The old name looks stale, so it is aligned here; confirm against
# data_processor.py.
processor4 = dp.DataProcessor(
    api_key=os.getenv("OPENAI_API_KEY"),
    model='gpt-3.5-turbo',
    instructions="I conducted a survey where people self-reported their gender. For data analysis, I need to classify their responses into 4 categories: 'female', 'male', 'non-binary', and 'other'. You will be provided with the response from a survey. Please decide what category it belongs in, and respond with that category."
)

In [None]:
# Try the alternative prompt on just the first 20 survey responses.
gender_sample = raw_gender_data['gender'].iloc[:20]
processor4.process(gender_sample)

In [None]:
# Persist processor4 through its dump() method under 'objects/genderclass4'.
# NOTE(review): what gets written, and whether the 'objects' directory must already exist, is not visible here; confirm in data_processor.
processor4.dump('objects/genderclass4')