In [1]:
 #!kaggle datasets download -d sohier/large-purchases-by-the-state-of-ca

In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


# Prep Dataset (no need to run; only for data prep)

In [2]:
import pandas as pd
import bamboolib as bam
import pickle

## Drop and Cast cols

In [3]:
# Read the raw purchase-order extract and immediately discard the columns
# listed below, in a single chained expression.
df = pd.read_csv(
    r'../../data/PURCHASE ORDER DATA EXTRACT 2012-2015_0.csv',
    sep=',',
    decimal='.',
).drop(
    columns=[
        'Purchase Date',
        'LPA Number',
        'Purchase Order Number',
        'Requisition Number',
        'Sub-Acquisition Type',
        'Acquisition Method',
        'Sub-Acquisition Method',
        'Supplier Code',
        'Supplier Qualifications',
        'CalCard',
        'Location',
    ]
)

In [4]:
# Keep only the rows that have no missing value in any column.
mask = df.notna().all(axis=1)
# Take an explicit copy: a boolean-mask selection is flagged by pandas as a
# potential copy, so later column assignments on it (e.g. the numeric casts)
# would raise SettingWithCopyWarning.
df = df.loc[mask].copy()
df.shape

(273036, 20)

In [5]:
# Convert the code columns to the smallest integer dtype that fits;
# values that cannot be parsed become NaN (errors='coerce').
cols_cast = ['Normalized UNSPSC', 'Class', 'Family', 'Segment']
df = df.assign(
    **{
        col: pd.to_numeric(df[col], downcast='integer', errors='coerce')
        for col in cols_cast
    }
)

<img title="UNSPSC tree" alt="UNSPSC tree" src="UNSPSC_tree.png" width="500" height="600" style="float: left;">

## Create Labels Dictionary

In [6]:
# One row-count table per label column, keyed by the column name ...
cols = ['Segment Title', 'Family Title', 'Class Title', 'Normalized UNSPSC']
labels_dic = {
    col: df.groupby([col]).agg(cnt=('Creation Date', 'size')).reset_index()
    for col in cols
}
# ... plus one table counting rows per combination of all four columns.
labels_dic['Collated'] = (
    df.groupby(['Segment Title', 'Family Title', 'Class Title', 'Normalized UNSPSC'])
    .agg(cnt=('Creation Date', 'size'))
    .reset_index()
)

In [7]:
# Report the (rows, columns) shape of every table in labels_dic.
for name, table in labels_dic.items():
    print(f'{name}\t:{table.shape}')

Segment Title	:(56, 2)
Family Title	:(408, 2)
Class Title	:(2293, 2)
Normalized UNSPSC	:(12483, 2)
Collated	:(12483, 5)


In [8]:
# Roll the collated counts up to (Segment Title, Family Title) level.
tt = (
    labels_dic['Collated']
    .groupby(['Segment Title', 'Family Title'])
    .agg(sum_cnt=('cnt', 'sum'))
    .reset_index()
)
tt

                                         Segment Title  \
0       Apparel and Luggage and Personal Care Products   
1       Apparel and Luggage and Personal Care Products   
2       Apparel and Luggage and Personal Care Products   
3       Apparel and Luggage and Personal Care Products   
4       Apparel and Luggage and Personal Care Products   
..                                                 ...   
403  Travel and Food and Lodging and Entertainment ...   
404  Travel and Food and Lodging and Entertainment ...   
405  Travel and Food and Lodging and Entertainment ...   
406  Travel and Food and Lodging and Entertainment ...   
407  Travel and Food and Lodging and Entertainment ...   

                                  Family Title  sum_cnt  
0                                     Clothing     2347  
1                                     Footwear      488  
2     Luggage and handbags and packs and cases      244  
3                       Personal care products      852  
4            

## Export and Save output

In [14]:
# df.to_csv('..\..\data\large-purchases-by-the-state-of-ca\cleansed_kaggle_po.csv',index=False)

In [10]:
import pickle
# Persist both artefacts as pickles: the label tables and the cleansed frame.
for pickle_filename, payload in (
    ('../../data/labels_dic.pkl', labels_dic),
    ('../../data/cleansed_kaggle_po.pkl', df),
):
    # pickle writes bytes, so the file is opened in binary write mode.
    with open(pickle_filename, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)

In [9]:
# with open(pickle_filename, 'rb') as pickle_file:
#     loaded_dict = pickle.load(pickle_file)

# print(loaded_dict)

# Label Dataset

In [2]:
import pickle
import bamboolib as bam

## Load Pickles

In [3]:
# Restore the label dictionary from its pickle file and list its keys.
pickle_filename = '../../data/labels_dic.pkl'
with open(pickle_filename, mode='rb') as pickle_file:
    labels_dic = pickle.loads(pickle_file.read())

print(labels_dic.keys())

dict_keys(['Segment Title', 'Family Title', 'Class Title', 'Normalized UNSPSC', 'Collated'])


In [4]:
# Restore the cleansed purchase-order frame from its pickle file and show its shape.
pickle_filename = '../../data/cleansed_kaggle_po.pkl'
with open(pickle_filename, mode='rb') as pickle_file:
    df = pickle.loads(pickle_file.read())

print(df.shape)

(273036, 20)


In [5]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['Creation Date', 'Fiscal Year', 'Acquisition Type', 'Department Name',
       'Supplier Name', 'Supplier Zip Code', 'Item Name', 'Item Description',
       'Quantity', 'Unit Price', 'Total Price', 'Classification Codes',
       'Normalized UNSPSC', 'Commodity Title', 'Class', 'Class Title',
       'Family', 'Family Title', 'Segment', 'Segment Title'],
      dtype='object')

## Drop Cols and Create an extra label (Acquisition Type)

In [6]:
# Remove these columns from df in place, then show the resulting shape.
cols_drop = [
    'Creation Date',
    'Supplier Zip Code',
    'Classification Codes',
    'Total Price',
    'Segment',
    'Family',
    'Class',
]
df.drop(columns=cols_drop, inplace=True)
df.shape

(273036, 13)

In [7]:
# Count how many rows carry each 'Acquisition Type' value.
df['Acquisition Type'].value_counts()

NON-IT Goods             172881
NON-IT Services           52514
IT Goods                  38272
IT Services                9243
IT Telecommunications       126
Name: Acquisition Type, dtype: int64

In [8]:
# Register the distinct acquisition types (in value_counts order) as an
# additional label set, then show the available label sets.
labels_dic['Acquisition Type'] = list(df['Acquisition Type'].value_counts().index)
labels_dic.keys()

dict_keys(['Segment Title', 'Family Title', 'Class Title', 'Normalized UNSPSC', 'Collated', 'Acquisition Type'])

## HuggingFace Models

In [9]:
from transformers import pipeline
import pandas as pd
import torch

In [10]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


In [11]:
# zeroshotmodels=[]
# myDict = {key: None for key in ['name']}
# del zeroshotmodels,myDict

In [12]:
# Checkpoints considered for the zero-shot classification pipeline.
modelList = [
    'sjrhuschlee/flan-t5-base-mnli',
    'facebook/bart-large-mnli',  # fixed: the id was missing its trailing "i"
    'roberta-large-mnli',
    'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
]

In [13]:
# Use the first GPU when CUDA is available; otherwise fall back to CPU (-1)
# so the cell still runs on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1

# Alternative checkpoints that can be swapped in:
# classifier=pipeline("zero-shot-classification",model='sjrhuschlee/flan-t5-base-mnli',trust_remote_code=True,device=device)
# classifier=pipeline("zero-shot-classification",model="facebook/bart-large-mnli",trust_remote_code=True,device=device)
# classifier=pipeline("zero-shot-classification",model="roberta-large-mnli",trust_remote_code=True,device=device)
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to execute locally; confirm it is actually required for the selected model.
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    trust_remote_code=True,
    device=device,
)

# Smoke test: classify two sentences against three candidate labels.
display(classifier(
    ["This is a course about the Transformers library","I expect to be big entrepreneur soon"],
    candidate_labels=["education", "politics", "business"],multi_label=False))

[{'sequence': 'This is a course about the Transformers library',
  'labels': ['education', 'business', 'politics'],
  'scores': [0.9950936436653137, 0.003088601166382432, 0.001817720360122621]},
 {'sequence': 'I expect to be big entrepreneur soon',
  'labels': ['business', 'education', 'politics'],
  'scores': [0.9911442399024963, 0.005248643923550844, 0.0036071285139769316]}]

In [14]:
def prepare_data(df, n=1000, random_state=None):
    """Sample rows from ``df`` and build a text ``premise`` for each of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Item Name', 'Item Description',
        'Unit Price' and 'Supplier Name'. It is not modified.
    n : int or None, default 1000
        Number of rows to sample. Capped at ``len(df)`` so that small frames
        no longer raise; ``None`` keeps every row (in shuffled order).
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``; pass a fixed value to make the
        sample reproducible across runs.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..k-1 index and an extra 'premise'
        column of the form
        'Item: <name> Description: <description> with price <price>
        bought from Supplier: <supplier>'.
    """
    sample_size = len(df) if n is None else min(n, len(df))
    # sample() returns a new object, so the caller's frame is left untouched.
    temp_df = df.sample(sample_size, random_state=random_state).reset_index(drop=True)
    # Cast every part to str so non-string values (e.g. a numeric price) do
    # not break the concatenation; vectorised instead of a row-wise apply.
    temp_df['premise'] = (
        'Item: ' + temp_df['Item Name'].astype(str)
        + ' Description: ' + temp_df['Item Description'].astype(str)
        + ' with price ' + temp_df['Unit Price'].astype(str)
        + ' bought from Supplier: ' + temp_df['Supplier Name'].astype(str)
    )
    return temp_df

In [15]:
def do_classify(df, labels, hypothesis_template=None):
    """Classify each premise in ``df`` and keep the top-scoring label.

    Uses the notebook-level ``classifier`` pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'premise' column holding the texts to classify.
    labels : list of str
        Candidate labels handed to the classifier.
    hypothesis_template : str or None, default None
        When given, forwarded to the classifier as ``hypothesis_template``;
        when ``None`` the classifier's own default is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'premise', 'Pred' (first entry of the returned labels) and
        'Pred_score' (first entry of the returned scores), one row per premise.
    """
    columns = ['premise', 'Pred', 'Pred_score']
    premises = df['premise'].tolist()
    if not premises:
        # Nothing to classify: return an empty, correctly-shaped frame
        # instead of calling the model with no sequences.
        return pd.DataFrame(columns=columns)

    extra_kwargs = {}
    if hypothesis_template is not None:
        extra_kwargs['hypothesis_template'] = hypothesis_template
    results = classifier(sequences=premises, candidate_labels=labels, **extra_kwargs)

    # Some transformers versions return a bare dict (not a list) when only a
    # single sequence was supplied; normalise to a list.
    if isinstance(results, dict):
        results = [results]

    rows = [
        (result['sequence'], result['labels'][0], result['scores'][0])
        for result in results
    ]
    return pd.DataFrame(rows, columns=columns)
    

In [16]:
# Candidate labels for classification: the 'Acquisition Type' entry of labels_dic.
labels=labels_dic['Acquisition Type']
labels

['NON-IT Goods',
 'NON-IT Services',
 'IT Goods',
 'IT Services',
 'IT Telecommunications']

In [17]:
%%time
template = "The UNSPSC classification for this example is {}."
temp_df=prepare_data(df)
print(temp_df.shape)
df_result=do_classify(temp_df,labels)
df_result.loc[:,'Acquisition Type']=temp_df['Acquisition Type']
df_result

(1000, 14)
CPU times: user 1min 24s, sys: 51.5 ms, total: 1min 24s
Wall time: 1min 24s


                                               premise             Pred  \
0    Item: vegetables Description: beans green cut,...  NON-IT Services   
1    Item: Legal Serv Description: EDD to repay LWD...  NON-IT Services   
2    Item: peripherals Description: peripherals wit...         IT Goods   
3    Item: magenta printer cartridge Description: M...         IT Goods   
4    Item: FILTECK SUPREME ULTRA CAPS A3-E Descript...         IT Goods   
..                                                 ...              ...   
995  Item: teflon tape Description: 1/2" with price...  NON-IT Services   
996  Item: Ambulance Rental Description: Ambulance ...  NON-IT Services   
997  Item: 1-11/4 X 10 PCV-80 Conduit Pipe Descript...     NON-IT Goods   
998  Item: MODEL 901 TOXIC SAMPLER Description: MOD...     NON-IT Goods   
999  Item: TRELLCHEM SPLASH 2000 HAZMAT SUIT Descri...  NON-IT Services   

     Pred_score Acquisition Type  
0      0.522861     NON-IT Goods  
1      0.535357  NON-IT Servi

In [22]:
import evaluate
# Load the "accuracy" metric through evaluate.load and keep it in `accuracy`.
accuracy = evaluate.load("accuracy")

In [23]:
# Show every (position, label) pair as a set.
set(enumerate(labels))

{(0, 'NON-IT Goods'),
 (1, 'NON-IT Services'),
 (2, 'IT Goods'),
 (3, 'IT Services'),
 (4, 'IT Telecommunications')}

In [24]:
# Integer id for each acquisition type, numbered in the order listed.
mapping = {
    name: code
    for code, name in enumerate(
        ('NON-IT Goods', 'NON-IT Services', 'IT Goods', 'IT Services', 'IT Telecommunications')
    )
}
# Add an id column for the predicted label and for the true label.
for source_col in ('Pred', 'Acquisition Type'):
    df_result[f'{source_col}_mapping'] = df_result[source_col].map(mapping)

In [25]:
# Compare the predicted label ids against the true label ids.
accuracy.compute(
    predictions=df_result['Pred_mapping'].tolist(),
    references=df_result['Acquisition Type_mapping'].tolist(),
)

{'accuracy': 0.401}

In [26]:
import pandas as pd; import numpy as np
# Count rows per (true Acquisition Type, predicted label) pair:
# drop the id columns, group, count, and sort both keys ascending.
tt = (
    df_result
    .drop(columns=['Pred_mapping', 'Acquisition Type_mapping'])
    .groupby(['Acquisition Type', 'Pred'])
    .agg(cnt=('premise', 'size'))
    .reset_index()
    .sort_values(by=['Acquisition Type', 'Pred'], ascending=[True, True])
)

tt

         Acquisition Type             Pred  cnt
0                IT Goods         IT Goods  109
1                IT Goods     NON-IT Goods   11
2                IT Goods  NON-IT Services    6
3             IT Services         IT Goods   32
4             IT Services      IT Services    3
5             IT Services  NON-IT Services    1
6   IT Telecommunications         IT Goods    1
7            NON-IT Goods         IT Goods  219
8            NON-IT Goods     NON-IT Goods  179
9            NON-IT Goods  NON-IT Services  258
10        NON-IT Services         IT Goods   32
11        NON-IT Services     NON-IT Goods   39
12        NON-IT Services  NON-IT Services  110

In [40]:
# template = "The UNSPSC classification for this example is {}."
# temp_df=prepare_data(df)
# temp_df
# results=classifier(sequences=temp_df['premise'].tolist(),candidate_labels=labels,hypothesis_template=template)
# df_result=pd.DataFrame([(result['sequence'],result['labels'][0],result['scores'][0]) for result in results])
# # df_result.columns = [str(column) for column in df_result.columns]
# # # import pandas as pd; import numpy as np
# # # # Step: Keep rows where 2 >= .5
# # # df_result = df_result.loc[df_result['2'] >= .4]

# import pandas as pd; import numpy as np
# df_result.columns = [str(column) for column in df_result.columns]
# df_result