# Load Libraries

In [1]:
%load_ext autoreload
%autoreload 2
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import openai
import pandas as pd
import bamboolib as bam
import json

In [3]:
from apikey import apikey
os.environ['OPENAI_API_KEY']=apikey
openai.api_key  = os.getenv('OPENAI_API_KEY')

# OpenAI

In [4]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{'role':'system', 'content':'You are an expert in classifying products into UNSPSC codes'},
                {"role": "user", "content": prompt}]
    
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]

In [5]:
def classify_UNSPSC(tabular_data):
    prompt=f"""
    You must classify the products delimited by triple backticks:
    1.Provide the output with the following keys only
    {{
        "index": ,
        "Product Description": ,
        "UNSPSC coding":,
        "Segment": ,
        "Family": ,
        "Class": ,
        "Commodity": 
    }}
    2.No AI introduction, No AI analysis, Return generated Json data only without backticks, Not human-readable, No backticks in output
    ```{tabular_data}```"""
    display(prompt)
    response=get_completion(prompt)
    print(f'response:\t{response}')
    response_dict=json.loads(response)
    print(f'response_dict:\t{response_dict}')
    
    return pd.DataFrame.from_dict(response_dict)  

In [6]:
def get_tabular_UNSPSC(df,sample_size=5):
    sample_df=df.sample(sample_size)
    print(sample_df.index)
    sample_df=sample_df.reset_index(drop=True)
    sample_df.rename(columns={'MaterialDescription': 'Product Description', 'Genericname ': 'Generic Product Description', 'SupplierName': 'Vendor Name'},inplace=True)
    X_df=sample_df[sample_df.columns[0:3]].reset_index()
    Y_df=sample_df[sample_df.columns[3:]].reset_index()

    X=X_df.to_dict(orient='records')
    X=str(X).replace('{','{{',).replace('}','}}')
    
    return (X,X_df,Y_df)
    

In [18]:
# tt=f"""
# Classify the products delimited by triple backticks and Provide the output with the following keys :
# index,Product Description,UNSPSC coding,Segment,Family,Class,Commodity.
# The output should only contain a JSON string
# ```
# [{{'index': 0, 'Product Description': 'BAKFLEX-A 8 MG tablet', 'Generic Product Description': 'thiocolchicoside-8 mg+aceclofenac-100 mg', 'Vendor Name': 'DROGARIA ANANTA'}}, {{'index': 1, 'Product Description': 'DOMIN 200 MG/5 ML ampoule solution for injection, 40 mg/mL', 'Generic Product Description': 'dopamine hydrochloride-200 mg', 'Vendor Name': 'VIMAL SALES CORPORATION'}}]
# ```
# """
# response=get_completion(tt)

# response_dict=json.loads(response)
# pd.DataFrame.from_dict(response_dict)

# UNSPSC Data

## Raw 

In [9]:
df = pd.read_csv('UNSPSCdataset.csv',encoding='mac_roman',low_memory=False)

In [10]:
df = df.loc[df['SupplierName'].notna()]
df = df.loc[df['Genericname '].notna()]
df = df.loc[df['Commodity'].notna()]
df = df[['MaterialDescription', 'Genericname ', 'SupplierName', 'Segment', 'Family', 'Class', 'Commodity', 'UNSPSC_Final']]
print(df.shape)
df = df.drop_duplicates(subset=['MaterialDescription', 'Genericname '], keep='first')
print(df.shape)

(5808, 8)
(744, 8)


In [11]:
df.to_csv('UNSPSCdataset_final.csv',index=False)

## Preprocessed

In [10]:
df=pd.read_csv('UNSPSCdataset_final.csv')
df

         MaterialDescription  \
0              6-MP 50MG TAB   
1     A B PHYLLINE SYP 100ML   
2                     A TO Z   
3         A to Z Gold tablet   
4                 A TO Z SYR   
..                       ...   
739               DYTOR 20MG   
740               DYTOR 20MG   
741  DYTOR PLUS 10 MG tablet   
742    DYTOR PLUS-10MG,TAB.    
743      DYTORPLUS-20MG,TAB.   

                                          Genericname   \
0                                     5 MERCAPTOPURINE   
1                            acebrophylline-50 mg/5 ml   
2                   VITAMINS WITH AMINO ACIDS&MINERALS   
3    carbohydrate-0.015 g+protein-0.016 g+eicosapen...   
4                                         MULTIVITAMIN   
..                                                 ...   
739                                    torsemide-20 mg   
740                             TORSEMIDE TABLETS 10mg   
741               torsemide-10 mg+spironolactone-50 mg   
742                         , Torse

In [37]:
df = df.filter(items = [735, 250, 325, 214, 484], axis=0)
df.to_csv('UNSPSC_test.csv',index=False)

# Predict 

In [38]:
tt=get_tabular_UNSPSC(df)

Int64Index([484, 735, 325, 250, 214], dtype='int64')


In [39]:
tt[0]

"[{{'index': 0, 'Product Description': 'CLOPILET A 75', 'Generic Product Description': 'CLOPIDOGREL BISULFATE+ASPRIN CAP', 'Vendor Name': 'SRI BALAJI PHARMA DISTRIBUTORS'}}, {{'index': 1, 'Product Description': 'DYNAPAR EC 50', 'Generic Product Description': 'DICLOFENAC', 'Vendor Name': 'GEETHA PHARMA'}}, {{'index': 2, 'Product Description': 'BROPHYLE CAPS', 'Generic Product Description': 'acebrophylline-100 mg', 'Vendor Name': 'NOVA PHARMACEUTICALS & GENERAL DIST'}}, {{'index': 3, 'Product Description': 'BACIGYL-N oral suspension 30 ML', 'Generic Product Description': 'metronidazole-100 mg/5 ml+norfloxacin-100 mg/5 ml', 'Vendor Name': 'DROGARIA COLVALKAR'}}, {{'index': 4, 'Product Description': 'ATRAPURE 50 MG/ 5 ML AMP SOLN FOR INJECTION', 'Generic Product Description': 'ATRACURIUM BESYLATE INJ', 'Vendor Name': 'XCEL HEALTHCARE'}}]"

In [40]:
# json.dumps(tt[0])

In [46]:
result=classify_UNSPSC(tt[0])

'\n    You must classify the products delimited by triple backticks:\n    1.Provide the output with the following keys only\n    {\n        "index": ,\n        "Product Description": ,\n        "UNSPSC coding":,\n        "Segment": ,\n        "Family": ,\n        "Class": ,\n        "Commodity": \n    }\n    2.No AI introduction, No AI analysis, Return generated Json data only without backticks, Not human-readable, No backticks in output\n    ```[{{\'index\': 0, \'Product Description\': \'CLOPILET A 75\', \'Generic Product Description\': \'CLOPIDOGREL BISULFATE+ASPRIN CAP\', \'Vendor Name\': \'SRI BALAJI PHARMA DISTRIBUTORS\'}}, {{\'index\': 1, \'Product Description\': \'DYNAPAR EC 50\', \'Generic Product Description\': \'DICLOFENAC\', \'Vendor Name\': \'GEETHA PHARMA\'}}, {{\'index\': 2, \'Product Description\': \'BROPHYLE CAPS\', \'Generic Product Description\': \'acebrophylline-100 mg\', \'Vendor Name\': \'NOVA PHARMACEUTICALS & GENERAL DIST\'}}, {{\'index\': 3, \'Product Descriptio

response:	[{"index": 0, "Product Description": "CLOPILET A 75", "UNSPSC coding": null, "Segment": null, "Family": null, "Class": null, "Commodity": null}, {"index": 1, "Product Description": "DYNAPAR EC 50", "UNSPSC coding": "42201800", "Segment": "Healthcare Services", "Family": "Therapeutic and rehabilitative products", "Class": "Musculoskeletal therapy products", "Commodity": "Topical pain relief products"}, {"index": 2, "Product Description": "BROPHYLE CAPS", "UNSPSC coding": "51191910", "Segment": "Healthcare Services", "Family": "Pharmaceuticals", "Class": "Respiratory and anesthesia and resuscitation products", "Commodity": "Respiratory therapy products"}, {"index": 3, "Product Description": "BACIGYL-N oral suspension 30 ML", "UNSPSC coding": "51101500", "Segment": "Healthcare Services", "Family": "Pharmaceuticals", "Class": "Antiinfective agents", "Commodity": "Antibacterial agents"}, {"index": 4, "Product Description": "ATRAPURE 50 MG/ 5 ML AMP SOLN FOR INJECTION", "UNSPSC cod

In [24]:
result

   index         Product Description UNSPSC coding  \
0      0               AMITRIL  PLUS          None   
1      1           BOOSTEM 150MG INJ      51191600   
2      2        AVESSA 125MG INHALER          None   
3      3                 ALBUMIN 20%      51191600   
4      4  ARTHROSCOPY DRAPE ECO E518      42131700   

                                          Segment  \
0                                            None   
1   Pharmaceuticals and Biochemicals and Healt...   
2                                            None   
3   Pharmaceuticals and Biochemicals and Healt...   
4  Medical Equipment and Accessories and Supplies   

                              Family                            Class  \
0                               None                             None   
1  Drugs and Pharmaceutical Products  Parenteral and Enteral Supplies   
2                               None                             None   
3  Drugs and Pharmaceutical Products  Parenteral and Enteral Sup

In [42]:
result

   index                          Product Description UNSPSC coding  \
0      0                                CLOPILET A 75          None   
1      1                                DYNAPAR EC 50      42271700   
2      2                                BROPHYLE CAPS      51191910   
3      3              BACIGYL-N oral suspension 30 ML      51101500   
4      4  ATRAPURE 50 MG/ 5 ML AMP SOLN FOR INJECTION      51201600   

               Segment                                          Family  \
0                 None                                            None   
1  Healthcare Services  Medical Equipment and Accessories and Supplies   
2  Healthcare Services                                 Pharmaceuticals   
3  Healthcare Services                                 Pharmaceuticals   
4  Healthcare Services                                 Pharmaceuticals   

                               Class                           Commodity  
0                               None                 

In [45]:
pd.merge(tt[1],result,on=['index','Product Description'])

   index                          Product Description  \
0      0                                CLOPILET A 75   
1      1                                DYNAPAR EC 50   
2      2                                BROPHYLE CAPS   
3      3              BACIGYL-N oral suspension 30 ML   
4      4  ATRAPURE 50 MG/ 5 ML AMP SOLN FOR INJECTION   

                         Generic Product Description  \
0                   CLOPIDOGREL BISULFATE+ASPRIN CAP   
1                                         DICLOFENAC   
2                              acebrophylline-100 mg   
3  metronidazole-100 mg/5 ml+norfloxacin-100 mg/5 ml   
4                            ATRACURIUM BESYLATE INJ   

                           Vendor Name UNSPSC coding              Segment  \
0       SRI BALAJI PHARMA DISTRIBUTORS          None                 None   
1                        GEETHA PHARMA      42271700  Healthcare Services   
2  NOVA PHARMACEUTICALS & GENERAL DIST      51191910  Healthcare Services   
3           