### A. Import necessary packages

In [3]:
import pandas as pd
import numpy as np
import json, os
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from openai import OpenAI

### B. Data loading and preprocessing

#### 1. Load data from excel file

In [80]:
# Randomly read 'n' samples from the excel file
n = 2000

file_path = '../data/medicine_description.xlsx'
data = pd.read_excel(file_path, sheet_name='Sheet1', header=0)

rdn_data = data.sample(n=n, random_state=42) 
rdn_data = rdn_data.reset_index(drop=True)

In [81]:
rdn_data.head()

Unnamed: 0,Drug_Name,Reason,Description
0,Seriwok 10mg Tablet 10'S,Pain,"fever, operative pain, dental pain, musculosk..."
1,EPI O2 Capsule 10'S,Supplement,antioxidant and memory enhancement property
2,Xymex Drops 30mlXymex Drops 15mlXymex Syrup 200ml,Digestion,"it is used to treat bloating, flatulence and ..."
3,Montgold Kid Tablet 10'S,Allergies,releief from all type of allergies
4,Tiam 100mg Capsule 10'STiam 75mg Tablet 10'STI...,Supplement,"protects your cells from oxidation, and neutra..."


In [82]:
np.unique(rdn_data['Reason'])

array(['Acne', 'Adhd', 'Allergies', 'Alzheimer', 'Amoebiasis', 'Anaemia',
       'Angina', 'Anxiety', 'Appetite', 'Arrhythmiasis', 'Arthritis',
       'Cleanser', 'Constipation', 'Contraception', 'Dandruff',
       'Depression', 'Diabetes', 'Diarrhoea', 'Digestion', 'Fever',
       'Fungal', 'General', 'Glaucoma', 'Gout', 'Haematopoiesis',
       'Haemorrhoid', 'Hyperpigmentation', 'Hypertension',
       'Hyperthyroidism', 'Hypnosis', 'Hypotension', 'Hypothyroidism',
       'Infection', 'Malarial', 'Migraine', 'Mydriasis', 'Osteoporosis',
       'Pain', 'Parkinson', 'Psychosis', 'Pyrexia', 'Scabies',
       'Schizophrenia', 'Smoking', 'Supplement', 'Thrombolysis',
       'Vaccines', 'Vertigo', 'Viral', 'Wound'], dtype=object)

#### 2. Map unique 'Reason' values to numerical indices

In [47]:
# Create a dictionary to assign a unique integer to each 'Reason'
med_data = rdn_data
reasons = med_data['Reason'].unique()
reasons_hash = {reason: idx for idx, reason in enumerate(reasons)}

#### 3. Format the 'Drug_Name' column

In [48]:
# Add 'Drug:' and 'Malady:' prefixes for structured formatting
med_data['Drug_Name'] = 'Drug: ' + med_data['Drug_Name'] + '\n' + 'Malady:'
med_data['Drug_Name']

0                 Drug: Seriwok 10mg Tablet 10'S\nMalady:
1                      Drug: EPI O2 Capsule 10'S\nMalady:
2       Drug: Xymex Drops 30mlXymex Drops 15mlXymex Sy...
3                 Drug: Montgold Kid Tablet 10'S\nMalady:
4       Drug: Tiam 100mg Capsule 10'STiam 75mg Tablet ...
                              ...                        
1995           Drug: Newbona Strong Capsule 10'S\nMalady:
1996    Drug: Glimestar M 0.5mg Tablet 10'SGlimestar M...
1997    Drug: Levorid 5mg Tablet 50'SLevorid Tablet 10...
1998    Drug: Itrason 200mg Capsule 4'SItrason 100mg C...
1999    Drug: Tabtret 5mg Tablet 10'STabtret 10mg Tabl...
Name: Drug_Name, Length: 2000, dtype: object

#### 4. Replace 'Reason' values with numerical indices

In [49]:
# Use the dictionary created earlier to replace textual reasons with their corresponding indices
med_data['Reason'] = med_data['Reason'].apply(lambda x: str(reasons_hash[x]))
med_data['Reason']

0       0
1       1
2       2
3       3
4       1
       ..
1995    1
1996    5
1997    3
1998    7
1999    9
Name: Reason, Length: 2000, dtype: object

#### 5. Drop unnecessary columns

In [50]:
# Remove the 'Description' column
med_data.drop(['Description'], axis=1, inplace=True)

In [51]:
med_data.head()

Unnamed: 0,Drug_Name,Reason
0,Drug: Seriwok 10mg Tablet 10'S\nMalady:,0
1,Drug: EPI O2 Capsule 10'S\nMalady:,1
2,Drug: Xymex Drops 30mlXymex Drops 15mlXymex Sy...,2
3,Drug: Montgold Kid Tablet 10'S\nMalady:,3
4,Drug: Tiam 100mg Capsule 10'STiam 75mg Tablet ...,1


### C. Split Data into Training and Validation Sets

#### 6. Perform an 80-20 split

In [52]:
# Split the dataset into training and validation datasets
train_data, val_data = train_test_split(med_data, train_size=0.8, random_state=100)

In [53]:
val_data.head()

Unnamed: 0,Drug_Name,Reason
1025,Drug: Hepcdac 60mg Tablet 28'S\nMalady:,22
1208,Drug: Para Nc Tablet 10's\nMalady:,0
1055,Drug: Amchek L Tablet 10'S\nMalady:,11
367,Drug: Qupin 50mg Tablet 10'S\nMalady:,8
815,Drug: Rapidon Eye Drops 5ml\nMalady:,3


In [54]:
train_data.shape, val_data.shape

((1600, 2), (400, 2))

### D. Prepare Data for Fine-Tuning

#### 7. Define a function to convert DataFrame to JSONL format

In [55]:
# Convert rows into JSONL format for fine-tuning
def convert_to_jsonl(df, output):
    result = []
    system_message = {'role': 'system', 'content': 'you are a drug classification assistant!'}

    for _, row in df.iterrows():
        user_message = {'role': 'user', 'content': row['Drug_Name']}
        assistant_message = {'role': 'assistant', 'content': row['Reason']}
        result.append({'messages': [system_message, user_message, assistant_message]})

    with open(output, 'w') as f:
        for entry in result:
            f.write(json.dumps(entry) + '\n')

#### 8. Convert training and validation datasets

In [56]:
# Save the training and validation datasets as JSONL files
convert_to_jsonl(train_data, 'train_data.jsonl')
convert_to_jsonl(val_data, 'val_data.jsonl')

### E. Upload Data for Fine-Tuning

#### 9. Set up OpenAI API

In [4]:
# Load API credentials and initialize OpenAI client
load_dotenv()
api_key = os.getenv('openai_api_key')
client = OpenAI(api_key=api_key)

#### 10. Upload JSONL files to OpenAI

In [58]:
# Upload training and validation datasets for fine-tuning
train_data = client.files.create(
    file=open('train_data.jsonl', 'rb'),
    purpose='fine-tune'
)

val_data = client.files.create(
    file=open('val_data.jsonl', 'rb'),
    purpose='fine-tune'
)

### F. Fine-Tune the Model

#### 11: Create a fine-tuning job

In [None]:
suffix = 'drug-classifier'

fine_tune_job = client.fine_tuning.jobs.create(
    training_file=train_data.id,
    model='gpt-3.5-turbo-0125',
    validation_file=val_data.id,
    suffix=suffix
)
print(f"Fine-tuning job '{suffix}' created with ID: {fine_tune_job.id}")

updated_job = client.fine_tuning.jobs.retrieve(fine_tune_job.id)
print(f"Fine-tuned Model ID: {updated_job.fine_tuned_model}")

In [60]:
jobs = client.fine_tuning.jobs.list()

for job in jobs.data:
    if job.fine_tuned_model:
        fine_tuned_model = job.fine_tuned_model
        print(f"Found model: {fine_tuned_model}")
        

Found model: ft:gpt-3.5-turbo-0125:personalapis:drug-classifier:AWSwjiou
Found model: ft:gpt-3.5-turbo-0125:personalapis:drug-classifier:AWD9hDg8


In [65]:
drugs = [
    "A CN Gel(Topical) 20gmA CN Soap 75gm",
    "Addnok Tablet 20'S",                   
    "ABICET M Tablet 10's",                 
]

In [66]:
model = 'ft:gpt-3.5-turbo-0125:personalapis:drug-classifier:AWD9hDg8'

for drug_name in drugs:
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": f"Drug: {drug_name}\nMalady:"}
        ],
        max_tokens=10,
        temperature=0
    )
    predicted_malady = response.choices[0].message.content.strip()
    print(f"Drug: {drug_name}\nPredicted Malady: {predicted_malady}\n")


Drug: A CN Gel(Topical) 20gmA CN Soap 75gm
Predicted Malady: 7

Drug: Addnok Tablet 20'S
Predicted Malady: 0

Drug: ABICET M Tablet 10's
Predicted Malady: 3



In [76]:
class_map = {
    0: "Acne",
    1: "ADHD",
    2: "Allergies",
}

for drug in drugs:
    drug_name = drug.split("'")[1]
    prompt = "Drug: {}\nMalady:".format(drug)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    response = completion.choices[0].message.content

    try:
        print(f"{drug_name} is used for {class_map[int(response)]}.")
    except:
        print(f"I don't know what {drug_name} is used for.")


I don't know what A CN Gel(Topical) 20gmA CN Soap 75gm is used for.
Addnok Tablet 20 is used for Acne.
ABICET M Tablet 10 is used for Acne.


## Fine-tuning based on 3 examples

In [16]:
train_data = client.files.create(
    file=open('three_examples.jsonl', 'rb'),
    purpose='fine-tune'
)

In [17]:
sq_job = client.fine_tuning.jobs.create(
    training_file=train_data.id,
    model='gpt-3.5-turbo-0125',
    suffix='simple-questions'
)

print(f"Fine-tuning job created with ID: {sq_job.id}")

Fine-tuning job created with ID: ftjob-w5o8kR6tzOIQ82DOlayxjuUN


In [18]:
# Monitor the fine-tuning job
fine_tune_job_id = sq_job.id

updated_job = client.fine_tuning.jobs.retrieve(fine_tune_job_id)
print(f"Fine-tuned Model ID: {updated_job.fine_tuned_model}")


Fine-tuned Model ID: None
