In [1]:
import os
import pandas as pd
import openai_data_processor as dp
import importlib

In [6]:
# Re-import the local module so edits made to openai_data_processor.py while
# working on this notebook take effect without restarting the kernel.
# Note: objects created before the reload keep using the old class definition,
# so processors must be re-created after reloading.
importlib.reload(dp)

<module 'openai_data_processor' from '/home/haskelt/openai-data-tools/openai_data_processor.py'>

## Bare dates

In [2]:
# One worked input/output pair; handed to the processor below via `examples=`.
examples = [
    dict(item='May 5, 1985', target='1985-05-05'),
]

In [3]:
# Load the bare-date test set. dtype=str keeps every column as text, and
# keep_default_na=False leaves empty cells as '' instead of converting them to NaN.
dates = pd.read_csv('dates.csv', dtype=str, keep_default_na=False)

In [9]:
# Show the loaded items next to their expected targets (blank target = no valid date).
dates

Unnamed: 0,item,target
0,"Dec 12, 1995",1995-12-12
1,23 March 1974,1974-03-23
2,February 14th 2010,2010-02-14
3,7/9/80,1980-07-09
4,14/19/93,
5,2/15,
6,"13th of April, 2016",2016-04-13
7,"Twenty-fifth of August, 1955",1955-08-25
8,"October 12, nineteen eighty-six",1986-10-12


In [7]:
# Processor for the bare-date task: the API key comes from the environment,
# and the single example defined above is passed along with the instructions.
processor1 = dp.OpenAIDataProcessor(
    api_key=os.getenv("OPENAI_API_KEY"),
    timeout=10,
    model='gpt-3.5-turbo',
    instructions=(
        "You will be provided with dates in various formats. "
        "For each date, convert it to the format YYYY-MM-DD. "
        "If it is not a valid date, return 'NA'."
    ),
    examples=examples,
)

In [8]:
# Run every raw date string through the processor; the returned list of
# responses is displayed as the cell output.
# NOTE(review): the meaning of mode='live' is defined in openai_data_processor — confirm there.
processor1.process(dates['item'], mode='live')

Progress: 100%


['1995-12-12',
 '1974-03-23',
 '2010-02-14',
 '1980-07-09',
 'NA',
 'NA',
 '2016-04-13',
 '1955-08-25',
 '1986-10-12']

In [41]:
# Compare the processor's responses with the expected targets; the result has
# one entry per item (presumably 1 = match, 0 = mismatch — verify in the module).
processor1.score(dates['target'])

array([1, 1, 1, 1, 1, 1, 1, 1, 1])

## Dates embedded in text

In [62]:
# Load the sentences-with-dates test set. dtype=str keeps every column as text,
# and keep_default_na=False leaves empty cells as '' instead of converting them to NaN.
dates_in_text = pd.read_csv('dates_in_text.csv', dtype=str, keep_default_na=False)

In [63]:
# Show the input sentences next to their expected rewritten forms.
dates_in_text

Unnamed: 0,item,target
0,"The ceremony will be on Dec 12, 1995 at 2 o'cl...",The ceremony will be on 1995-12-12 at 2 o'clock.
1,The article was first published on 23 March 1974.,The article was first published on 1974-03-23.
2,"On February 14th, 2010 the amended bill became...",On 2010-02-14 the amended bill became law.
3,The date of birth is listed as 7/9/80.,The date of birth is listed as 1980-07-09.
4,Please verify all records prior to 14/19/93.,
5,I've been really busy and I don't think I coul...,
6,The first meeting of the council will take pla...,The first meeting of the council will take pla...
7,"Recorded on this twenty-fifth day of August, 1...",Recorded on 1955-08-25.
8,"Please join us on October 12, nineteen eighty-...",Please join us on 1986-10-12.


In [55]:
# Worked examples for the dates-in-text task. Each entry maps an input
# sentence ('item') to the expected response ('target'):
#   * an impossible date           -> 'NA'
#   * a date with no year          -> 'NA'
#   * a spelled-out, complete date -> the sentence with the date as YYYY-MM-DD
# Every dict must contain exactly one 'item' and one 'target' key; a repeated
# key in a dict literal is silently overwritten by its last occurrence.
examples = [
    {'item': 'The date on the contract is 13/45/1991',
     'target': 'NA'},
    {'item': 'On June 15th I have a dentist appointment',
     'target': 'NA'},
    {'item': 'Be it known that on this fifteenth day of March, 2021 this resolution was duly passed',
     'target': 'Be it known that on 2021-03-15 this resolution was duly passed'},
]

In [64]:
# Processor for the dates-in-text task: rewrite each sentence with its date in
# YYYY-MM-DD form, or answer 'NA' when the date is not valid.
# Uses OpenAIDataProcessor, the same class the bare-dates processor is built
# from, so the notebook relies on a single class name from the module.
processor2 = dp.OpenAIDataProcessor(
    api_key=os.getenv("OPENAI_API_KEY"),
    model='gpt-3.5-turbo',
    instructions="You will be provided with sentences that contain dates. For each date, convert it to the format YYYY-MM-DD, and return the sentence with the converted date. If the date is not a valid date, return 'NA' instead of the sentence.",
    examples=examples,
)

In [30]:
# Run every sentence through the dates-in-text processor.
# NOTE(review): the saved output of this cell is a NameError from an earlier
# execution (processor2 had not been created yet); re-run after the cell above.
processor2.process(dates_in_text['item'], mode='live')

NameError: name 'processor2' is not defined

In [66]:
# Compare the responses with the expected sentences; the result has one entry
# per item (presumably 1 = match, 0 = mismatch — verify in the module).
processor2.score(dates_in_text['target'])

array([1, 1, 0, 1, 1, 0, 0, 0, 1])

In [54]:
# Side-by-side table of expected targets and model responses, to inspect why
# individual items were scored as mismatches.
# NOTE(review): this reads the processor's private `_data` attribute; prefer a
# public accessor if the module provides one.
pd.DataFrame({'target': dates_in_text['target'], 'response': processor2._data['output']})

Unnamed: 0,target,response
0,The ceremony will be on 1995-12-12 at 2 o'clock,The ceremony will be on 1995-12-12 at 2 o'clock
1,The article was first published on 1974-03-23,The article was first published on 1974-03-23
2,On 2010-02-14 the amended bill became law,"On 2010-02-14, the amended bill became law."
3,The date of birth is listed as 1980-07-09,The date of birth is listed as 1980-07-09.
4,,Please verify all records prior to 1993-14-19.
5,,I've been really busy and I don't think I coul...
6,The first meeting of the council will take pla...,The first meeting of the council will take pla...
7,Recorded on 1955-08-25,Recorded on this 1955-08-25 day of August.
8,Please join us on the day of 1986-10-12,Please join us on the day of 1986-10-12.


## Self-described gender

In [3]:
# Load the combined survey data. dtype=str keeps every column as text, and
# keep_default_na=False leaves empty cells as '' instead of converting them to NaN.
raw_gender_data = pd.read_csv('Combined-Calc-Data.csv', dtype=str, keep_default_na=False)

In [4]:
# Preview the survey table; the free-text `gender` column is what gets classified below.
# NOTE(review): the rendered output includes instructor names and per-student
# records — consider clearing it or showing only the needed columns before sharing.
raw_gender_data

Unnamed: 0,anonID,instructor,institution,course,condition,modality,term,first_time,calc1_grade,age,parent_degree,gender,ethnicity,psvt_pre,psvt_post,course_grade
0,0498329c2800fe0146d9a69c94dd8a415b24ff3266d471...,Jon Armel,Tacoma Community College,Calc II,Control,On-line,Winter 2021,Yes,B+ (3.2-3.4),23-29,Yes,Male,White or Caucasian,19,18,B+
1,05e931dd22e2cf4ea80380b0610975d78e8fc9284e3926...,Jon Armel,Tacoma Community College,Calc II,Control,On-line,Winter 2021,Yes,B+ (3.2-3.4),20-22,Yes,male,Asian or Pacific Islander,24,24,B
2,0c600d02dede2d6656650ed99e560da939d00110798e85...,Jon Armel,Tacoma Community College,Calc II,Control,On-line,Winter 2021,Yes,B- (2.5-2.8),23-29,Yes,female. she/her,Asian or Pacific Islander,22,20,E
3,0d1eb8f4718c039badf9bdcea21428cd27660463fa20cc...,Jon Armel,Tacoma Community College,Calc II,Control,On-line,Winter 2021,Yes,A- (3.5-3.8),17 or younger,Yes,Male,Hispanic or Latinx,19,22,A
4,1d34c61eb8288a35e98642b2d4e7c0b7916a246719fb84...,Jon Armel,Tacoma Community College,Calc II,Control,On-line,Winter 2021,Yes,B (2.9-3.1),20-22,No,She/her/hers pronouns,A race/ethnicity not listed here,16,6,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,78c2168d3cebb366af09e45cd7518bc306ceecf2a329bf...,Lee Singleton,WCC,Calc II,Control,On-line,Fall 2021,Yes,A- (3.5-3.8),20-22,No,he,Prefer not to answer,12,16,A
235,7ddcbd3237a978736a83a1f7541f62893211f9ac5f5e97...,Lee Singleton,WCC,Calc II,Control,On-line,Fall 2021,Yes,A- (3.5-3.8),20-22,No,Female,Multiracial or Biracial,6,5,A-
236,8c609206272481ebcb411c0884e9bce0c8bb6994a9e9fb...,Lee Singleton,WCC,Calc II,Control,On-line,Fall 2021,Yes,A (3.9-4.0),17 or younger,Yes,Female,White or Caucasian,15,17,A
237,a949a7af783b9f53b1c29475f612cecf35c5f8f682221b...,Lee Singleton,WCC,Calc II,Control,On-line,Fall 2021,Yes,A (3.9-4.0),20-22,Yes,Male,Multiracial or Biracial,26,26,A


In [5]:
# First attempt at the gender-classification prompt: terse instructions and no
# examples.
# Uses OpenAIDataProcessor, the same class the bare-dates processor is built
# from, so the notebook relies on a single class name from the module.
processor3 = dp.OpenAIDataProcessor(
    api_key=os.getenv("OPENAI_API_KEY"),
    model='gpt-3.5-turbo',
    instructions="You will be provided with phrases describing someone's gender. For each phrase, classify it as 'female', 'male', 'nonbinary', or 'other'.",
)

In [6]:
# Classify every self-described gender response in the survey.
# NOTE(review): unlike the date cells, no `mode` argument is passed here —
# confirm the module's default mode is the intended one.
processor3.process(raw_gender_data['gender'])

Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Request timed out, retrying...
Progress: 100%


['Classified as: Male',
 'The phrase "male" is classified as \'male\'.',
 'female',
 'Phrases:\n1. Wears suits and ties to work every day.\n2. Enjoys fixing cars and working on engines.\n3. Has a deep voice and a beard.\n4. Loves playing football and watching sports.\n5. Prefers to wear jeans and t-shirts rather than dresses.\n6. Enjoys playing video games and collecting action figures.\n7. Is a professional ballet dancer.\n8. Has long hair and frequently wears makeup.\n9. Has a high-pitched voice and wears floral dresses.\n10. Enjoys cooking and baking.',
 'female',
 'male',
 'Correct answer: male',
 'Female',
 "Please provide a phrase describing someone's gender.",
 'female',
 'female',
 'Thank you for the input. Please provide the next phrase.',
 "Correct! 'Male' is the correct classification for the phrase. Well done!",
 'Female',
 'Gender: Female\nPreferred Pronouns: She/Her',
 "Sure, here are some phrases for you to classify:\n\n1. He is a father and takes care of his children.\n

In [7]:
# Call the processor's dump method with the path 'objects/genderclass'
# (presumably saves the processor's state/results there — verify in the module).
processor3.dump('objects/genderclass')

In [8]:
# Second attempt at the gender-classification prompt: explains the survey
# context and asks for exactly one of four categories.
# Uses OpenAIDataProcessor, the same class the bare-dates processor is built
# from, so the notebook relies on a single class name from the module.
processor4 = dp.OpenAIDataProcessor(
    api_key=os.getenv("OPENAI_API_KEY"),
    model='gpt-3.5-turbo',
    instructions="I conducted a survey where people self-reported their gender. For data analysis, I need to classify their responses into 4 categories: 'female', 'male', 'non-binary', and 'other'. You will be provided with the response from a survey. Please decide what category it belongs in, and respond with that category.",
)

In [9]:
# Try the revised prompt on only the first 20 responses.
# NOTE(review): no `mode` argument is passed here, unlike the date cells —
# confirm the module's default mode is the intended one.
processor4.process(raw_gender_data['gender'][0:20])

Error communicating with OpenAI, retrying...
Request timed out, retrying...
Request timed out, retrying...
Progress: 100%


["Category: 'male'",
 "Category: 'male'",
 'Category: female',
 'Category: male',
 "Based on the response provided, the category would be 'female'.",
 "Based on the response provided, the category would be 'male'.",
 'male',
 'Category: Female',
 "Category: 'male'",
 "Category: 'female'",
 'Category: female',
 'Category: male',
 'Category: Male',
 "Category: 'male'",
 'Category: Female',
 'Category: Male',
 'female',
 "Based on the response provided, the category is 'female'.",
 "Category: 'male'",
 'Category: Male']

In [10]:
# Call the processor's dump method with the path 'objects/genderclass4'
# (presumably saves the processor's state/results there — verify in the module).
processor4.dump('objects/genderclass4')