- The system message helps set the behavior of the assistant. For example, you can modify the personality of the assistant or provide specific instructions about how it should behave throughout the conversation. However note that the system message is optional and the model’s behavior without a system message is likely to be similar to using a generic message such as "You are a helpful assistant."

- The user messages provide requests or comments for the assistant to respond to. 
- Assistant messages store previous assistant responses, but can also be written by you to give examples of desired behavior.

In [0]:
import pandas as pd
import os
import openai
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
openai.api_key = "OPEN_AI_API_KEY"
import warnings
warnings.simplefilter("ignore")
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
np.random.seed(123)
ROOT_DIR = '../data'
pred_field = 'context'  # this can be one of : 'sentence' or 'context'

## Data Reading + Formatting

In [0]:
# final_data_df = pd.read_csv('/Workspace/Users/aistrate@chanzuckerberg.com/software_intent_data - final_data.csv')
final_data_df = pd.read_csv('/Workspace/Users/k.moraw@epcc.ed.ac.uk/data/software_citation_intent_merged.csv')
LABEL2TEXT = {0: 'creation', 1 : 'usage', 2 : 'mention', 3 : 'none'}
final_data_df['label_descriptive'] = final_data_df['label'].apply(lambda x: LABEL2TEXT[x])

In [0]:
final_data_df.head()

Unnamed: 0.1,Unnamed: 0,id,sentence,used,created,mention,context,label,text,label_descriptive
0,0,PMC5189946,All of this analysis was implemented using Mat...,False,True,False,,0,All of this analysis was implemented using Mat...,creation
1,1,PMC4511233,"Code for calculating partition similarity, obt...",False,True,False,Since the probability of getting a given MI is...,0,"Code for calculating partition similarity, obt...",creation
2,2,PMC4186879,All behavioral statistical analyses were perfo...,False,False,True,All behavioral statistical analyses were perfo...,2,All behavioral statistical analyses were perfo...,mention
3,3,PMC5026371,"M-Track was written using Python 2.7, OpenCV 3...",True,False,False,,1,"M-Track was written using Python 2.7, OpenCV 3...",usage
4,4,PMC1283974,"Mindboggle is a freely downloadable, open sour...",False,True,False,"Mindboggle is a freely downloadable, open sour...",0,"Mindboggle is a freely downloadable, open sour...",creation


In [0]:
def update_context(df):
    df['context'] = df.apply(lambda x: x['context'] if x['context'] == x['context'] else x['sentence'], axis = 1)

In [0]:
final_data_df.head()

Unnamed: 0.1,Unnamed: 0,id,sentence,used,created,mention,context,label,text,label_descriptive
0,0,PMC5189946,All of this analysis was implemented using Mat...,False,True,False,,0,All of this analysis was implemented using Mat...,creation
1,1,PMC4511233,"Code for calculating partition similarity, obt...",False,True,False,Since the probability of getting a given MI is...,0,"Code for calculating partition similarity, obt...",creation
2,2,PMC4186879,All behavioral statistical analyses were perfo...,False,False,True,All behavioral statistical analyses were perfo...,2,All behavioral statistical analyses were perfo...,mention
3,3,PMC5026371,"M-Track was written using Python 2.7, OpenCV 3...",True,False,False,,1,"M-Track was written using Python 2.7, OpenCV 3...",usage
4,4,PMC1283974,"Mindboggle is a freely downloadable, open sour...",False,True,False,"Mindboggle is a freely downloadable, open sour...",0,"Mindboggle is a freely downloadable, open sour...",creation


In [0]:
def get_labels(df):
    final_labels = []
    used_labels = df['used'].to_list()
    created_labels = df['created'].to_list()
    mention_labels = df['mention'].to_list()
    sentences = df['sentence'].to_list()

    for used_label, created_label, mention_label, sentence in zip(used_labels, created_labels, mention_labels, sentences):
        final_label = 'no_label'
        if used_label:
            final_label = 'used'
        elif created_label:
            final_label = 'created'
        elif mention_label:
            final_label = 'mentioned'
        num_true = int(used_label) + int(created_label) + int(mention_label)
        if (num_true > 1):
            print(sentence, num_true)
        final_labels.append(final_label)
    return final_labels

In [0]:
X_train_df = pd.read_csv(ROOT_DIR + 'software_citation_intent_train.csv')
X_test_df = pd.read_csv(ROOT_DIR + 'software_citation_intent_test.csv')
update_context(X_train_df)
update_context(X_test_df)

labels = get_labels(X_train_df)
X_train_df, X_val_df = train_test_split(X_train_df, test_size=0.2, stratify=labels, random_state=42)

In [0]:
train_sentences = X_train_df[pred_field].to_list()
val_sentences = X_val_df[pred_field].to_list()
test_sentences = X_test_df[pred_field].to_list()

y_train = X_train_df['label_descriptive'].to_list()
y_val = X_val_df['label_descriptive'].to_list()
y_test = X_test_df['label_descriptive'].to_list()

In [0]:
print(Counter(y_train))
print(Counter(y_val))
print(Counter(y_test))
print(len(X_train_df), len(X_val_df), len(X_test_df))

Counter({'usage': 1436, 'none': 640, 'mention': 304, 'creation': 300})
Counter({'usage': 359, 'none': 160, 'mention': 76, 'creation': 75})
Counter({'usage': 449, 'none': 200, 'mention': 95, 'creation': 94})
2680 670 838


Convert to OpenAI fine-tuned format:
```
{"messages": 
  [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, 
  {"role": "user", "content": "What's the capital of France?"}, 
  {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}

{"messages": 
  [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, 
  {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, 
  {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}

{"messages": 
  [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, 
  {"role": "user", "content": "How far is the Moon from Earth?"}, 
  {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
```

In [0]:
import json
INITIAL_MESSAGE = {"role": "system", 
             "content": "You are a scientist trying to figure out the citation intent behind software mentioned in sentences coming from research articles. Your four categories are: usage, creation, mention, or none. The definitions of the classes are: \
             - usage: software was used in the paper \
             - creation: software was created by the authors of the paper \
             - mention: software was mentioned in the paper, but not used, nor created \
             - none: none of the previous 3 categories apply \
             You need to output one category only."}

jsonl_train = ROOT_DIR + 'gpt3.5-software-citation-intent-train-data.jsonl'
jsonl_val = ROOT_DIR + 'gpt3.5-software-citation-intent-val-data.jsonl'
jsonl_test = ROOT_DIR + 'gpt3.5-software-citation-intent-test-data.jsonl'

def generate_jsonl_file(jsonl_filename, sentences, labels):
        with open(jsonl_filename, mode='w') as f:
                for sentence, label in zip(sentences, labels):
                        message = {"messages" : [INITIAL_MESSAGE, 
                                {'role' : 'user', 'content' : sentence},
                                {'role' : 'assistant', 'content' : label}]}
                        f.write(json.dumps(message) + "\n")

In [0]:
generate_jsonl_file(jsonl_train, train_sentences, y_train)
generate_jsonl_file(jsonl_val, val_sentences, y_val)
generate_jsonl_file(jsonl_test, test_sentences, y_test)

## Fine-tuning model

In [0]:
train_file_id = openai.File.create(
  file=open(ROOT_DIR + 'gpt3.5-software-citation-intent-train-data.jsonl', "rb"),
  purpose='fine-tune'
)['id']

In [0]:
val_file_id = openai.File.create(
  file=open(ROOT_DIR + 'gpt3.5-software-citation-intent-val-data.jsonl', "rb"),
  purpose='fine-tune'
)['id']

In [0]:
openai.FineTuningJob.create(training_file=train_file_id, validation_file = val_file_id, model="gpt-3.5-turbo", hyperparameters = {'n_epochs' : 5})

<FineTuningJob fine_tuning.job id=ftjob-E97IKtm2bRHI50K4HkID1tFA at 0x7f38cdc8d4e0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-E97IKtm2bRHI50K4HkID1tFA",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1698418067,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-MLQFahmMO0SVF8xGt92L8edn",
  "result_files": [],
  "status": "validating_files",
  "validation_file": "file-0aJuTQmZJkeQm1K0HGQEpIqQ",
  "training_file": "file-Jf2bBSXoUH7gR1IYL0OupQHL",
  "hyperparameters": {
    "n_epochs": 5
  },
  "trained_tokens": null,
  "error": null
}

## Use in inference mode

In [0]:
def get_gpt_predicted_labels(test_sentences, has_true_labels, true_labels = []):
  predicted_labels = []
  true_labels_completed = []
  for i, x in enumerate(test_sentences):
    print('Sentence', i)
    try:
      completion = openai.ChatCompletion.create(
        model="ft:gpt-3.5-turbo-0613:personal::8EJLzjK6",
        messages=[
          INITIAL_MESSAGE,
          {"role": "user", "content": x} 
        ],
        request_timeout = 60
      )
      predicted_label = completion.choices[0].message['content']
      predicted_labels.append(predicted_label)
      if has_true_labels:
        true_labels_completed.append(true_labels[i])
    except:
      print('oops got an error')
      i -= 1
      continue
  return predicted_labels, true_labels_completed

## Evaluation

#### Test dataset

In [0]:
def evaluate(true_labels, predicted_labels):
    p, r, f1, support = precision_recall_fscore_support(true_labels, predicted_labels, average='macro')
    accuracy = round(accuracy_score(true_labels, predicted_labels), 3)
    print('Precision: ', round(p, 3), 'Recall: ', round(r, 3), 'F1:', round(f1, 3), 'Accuracy:', accuracy)
    print(classification_report(true_labels, predicted_labels))

In [0]:
len(test_sentences), len(test_sentences[:-1])

(838, 837)

In [0]:
n = -1
y_pred, y_true_completed = get_gpt_predicted_labels(test_sentences[:n], True, y_test[:n])
evaluate(y_pred, y_true_completed)

Sentence 0
oops got an error
Sentence 1
oops got an error
Sentence 2
oops got an error
Sentence 3
oops got an error
Sentence 4
oops got an error
Sentence 5
oops got an error
Sentence 6
oops got an error
Sentence 7
oops got an error
Sentence 8
oops got an error
Sentence 9
oops got an error
Sentence 10
oops got an error
Sentence 11
oops got an error
Sentence 12
oops got an error
Sentence 13
oops got an error
Sentence 14
oops got an error
Sentence 15
oops got an error
Sentence 16
oops got an error
Sentence 17
oops got an error
Sentence 18
oops got an error
Sentence 19
oops got an error
Sentence 20
oops got an error
Sentence 21
oops got an error
Sentence 22
oops got an error
Sentence 23
oops got an error
Sentence 24
oops got an error
Sentence 25
oops got an error
Sentence 26
oops got an error
Sentence 27
oops got an error
Sentence 28
oops got an error
Sentence 29
oops got an error
Sentence 30
oops got an error
Sentence 31
oops got an error
Sentence 32
oops got an error
Sentence 33
oops got

[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
File [0;32m<command-3494665154906555>, line 3[0m
[1;32m      1[0m n [38;5;241m=[39m [38;5;241m-[39m[38;5;241m1[39m
[1;32m      2[0m y_pred, y_true_completed [38;5;241m=[39m get_gpt_predicted_labels(test_sentences[:n], [38;5;28;01mTrue[39;00m, y_test[:n])
[0;32m----> 3[0m evaluate(y_pred, y_true_completed)

File [0;32m<command-2884248819784355>, line 5[0m, in [0;36mevaluate[0;34m(true_labels, predicted_labels)[0m
[1;32m      3[0m accuracy [38;5;241m=[39m [38;5;28mround[39m(accuracy_score(true_labels, predicted_labels), [38;5;241m3[39m)
[1;32m      4[0m [38;5;28mprint[39m([38;5;124m'[39m[38;5;124mPrecision: [39m[38;5;124m'[39m, [38;5;28mround[39m(p, [38;5;241m3[39m), [38;5;124m'[39m[38;5;124mRecall: [39m[38;5;124m'[39m, [38;5;28mround[39m(r, [38;5;241m3[39m), [3

#### CZI validation dataset

In [0]:
czi_combined = pd.read_csv('/Workspace/Users/aistrate@chanzuckerberg.com/czi_val.csv')
test_sentences_czi = czi_combined['text'].to_list()
y_test_czi = czi_combined['label'].to_list()
y_pred_czi, y_true_czi_completed = get_gpt_predicted_labels(test_sentences_czi, True, y_test_czi)

In [0]:
evaluate(y_pred_czi, y_true_czi_completed)

In [0]:
import plotly.express as px
labels_grouped = czi_combined.groupby('label').count().reset_index()
labels_grouped
fig = px.bar(labels_grouped, x = 'label', y = 'text', color = 'label')
fig.update_layout(width=700,height=500, title = 'Label Distribution in CZI Validation Dataset')

In [0]:
ct = 0
for x, x_pred in zip(y_pred_czi, y_true_czi_completed):
    if x == 'none':
        print(x, x_pred)
        ct += 1

none usage
none usage
none mention
none usage
none mention
none usage
none usage
none usage
none mention
none usage
none mention


In [0]:
ct

11