## 1. Load the test dataset

To run this notebook, you will need to install: pandas, numpy, openai, tiktoken, joblib, transformers, plotly, matplotlib, scikit-learn, torch (a transformers dependency), torchvision, and scipy.

In [1]:
# imports
import pandas as pd
import numpy as np
import tiktoken
from openai.embeddings_utils import get_embedding

In [2]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [15]:
# to save space, we provide a pre-filtered dataset
input_datapath = "data/v6_v7.csv"

# Read the CSV using its first column as the index, turn that index back into
# a regular column, then discard every row that contains a missing value.
df = pd.read_csv(input_datapath, index_col=0).reset_index().dropna()

In [16]:
# Work from the label column once: the sorted distinct labels are kept in
# `categories`, and the per-label sample counts are printed for a quick look
# at the class balance.
intent_column = df["intent"]
categories = sorted(intent_column.unique())
print("Categories of samples:\n\n", intent_column.value_counts())
df.head()

Categories of samples:

 p2p_transfer    28
unknown         15
Name: intent, dtype: int64


Unnamed: 0,Text,intent
0,abuse the cooperative bank limited,unknown
1,ajanta urban cooperative bank limited,unknown
2,1 rupees,unknown
3,1 rupees,unknown
4,i want to transfer 1 upi,p2p_transfer


In [17]:
# (rows, columns) of the frame after rows with missing values were dropped
df.shape

(43, 2)

## 2. Get test data embeddings and save them for future reuse

In [18]:
# The OpenAI API key is imported from the local `config` module, which must
# define API_KEY. Keep that file out of version control; see
# https://github.com/openai/openai-python#usage for other ways to supply the key.
from pathlib import Path

from config import API_KEY
import openai
import config
openai.api_key = API_KEY

# Later cells require an "embedding" column. Instead of relying on a manually
# (un)commented line, which breaks "Restart Kernel -> Run All", the column is
# created here whenever it is missing:
#   1. reuse the embeddings saved by a previous run, if that file has them;
#   2. otherwise call the API (this may take a few minutes).
# Delete the cache file to force a recomputation after the input data changes.
embeddings_cache = Path("data/v6_v7_embeddings_v1.csv")
if "embedding" not in df.columns and embeddings_cache.exists():
    cached_df = pd.read_csv(embeddings_cache, index_col=0)
    # Only trust the cache if it really contains the embeddings
    # (in the CSV they are stored as strings).
    if "embedding" in cached_df.columns:
        df = cached_df
if "embedding" not in df.columns:
    df["embedding"] = df.Text.apply(lambda x: get_embedding(x, engine=embedding_model))

print(df.shape)
print(df.head())

(43, 3)
                                    Text        intent  \
0     abuse the cooperative bank limited       unknown   
1  ajanta urban cooperative bank limited       unknown   
2                               1 rupees       unknown   
3                               1 rupees       unknown   
4               i want to transfer 1 upi  p2p_transfer   

                                           embedding  
0  [-0.03387915715575218, -0.023782817646861076, ...  
1  [-0.015351150184869766, -0.019030826166272163,...  
2  [0.025440862402319908, -0.008647721260786057, ...  
3  [0.02538367174565792, -0.00863778218626976, -0...  
4  [-0.007308583706617355, -0.010682852938771248,...  


In [19]:
# Persist the frame to disk so later runs can load it instead of recomputing;
# the frame's index is written as the first CSV column.
df.to_csv("data/v6_v7_embeddings_v1.csv")

In [28]:
# Reload the saved frame; the first CSV column is used as the index.
# Values of the "embedding" column come back as strings after the CSV
# round-trip and have to be parsed before use.
input_datapath = "data/v6_v7_embeddings_v1.csv"
df = pd.read_csv(input_datapath, index_col=0)

## 3. Load the saved model and make predictions

In [29]:
from joblib import load
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load 'sc_pca_rf.joblib' when it comes from a trusted source.
# Presumably a scaler -> PCA -> random-forest pipeline, judging by the file
# name -- TODO confirm against the training notebook.
pipe = load('sc_pca_rf.joblib')

In [30]:
# Work on a copy so the prediction columns added below do not modify `df`.
test_df = df.copy()
test_df.head()

Unnamed: 0,Text,intent,embedding
0,abuse the cooperative bank limited,unknown,"[-0.03387915715575218, -0.023782817646861076, ..."
1,ajanta urban cooperative bank limited,unknown,"[-0.015351150184869766, -0.019030826166272163,..."
2,1 rupees,unknown,"[0.025440862402319908, -0.008647721260786057, ..."
3,1 rupees,unknown,"[0.02538367174565792, -0.00863778218626976, -0..."
4,i want to transfer 1 upi,p2p_transfer,"[-0.007308583706617355, -0.010682852938771248,..."


In [31]:
import ast

# After the CSV round-trip each embedding is the string form of a list of
# floats ("[0.1, -0.2, ...]"). Parse it with ast.literal_eval instead of eval:
# literal_eval only accepts Python literals, so a tampered or malformed CSV
# cannot execute arbitrary code in the kernel.
embedding_matrix = np.array(test_df.embedding.apply(ast.literal_eval).to_list())
embedding_matrix[0]

array([-0.03387916, -0.02378282,  0.01589917, ..., -0.01452906,
        0.00098912, -0.01093892])

In [33]:
# Predict a label for every embedding and store it in a new "pred" column,
# next to the ground-truth "intent" column, for row-by-row comparison.
predictions = pipe.predict(embedding_matrix)
test_df["pred"] = predictions
display(test_df.head())

Unnamed: 0,Text,intent,embedding,pred
0,abuse the cooperative bank limited,unknown,"[-0.03387915715575218, -0.023782817646861076, ...",unknown
1,ajanta urban cooperative bank limited,unknown,"[-0.015351150184869766, -0.019030826166272163,...",unknown
2,1 rupees,unknown,"[0.025440862402319908, -0.008647721260786057, ...",unknown
3,1 rupees,unknown,"[0.02538367174565792, -0.00863778218626976, -0...",unknown
4,i want to transfer 1 upi,p2p_transfer,"[-0.007308583706617355, -0.010682852938771248,...",p2p_transfer


In [34]:
# Rows where the predicted label differs from the true intent; the first
# element of the resulting shape is the number of misclassified samples.
test_df.loc[test_df.intent != test_df.pred].shape

(17, 4)

In [35]:
# One row of per-class probabilities for each test sample.
# NOTE(review): the next cell pairs each row with pipe.classes_ position by
# position, i.e. it assumes the probability columns follow the order of
# pipe.classes_ -- confirm for this pipeline.
prob = pipe.predict_proba(embedding_matrix)
# Show the class labels the model can predict.
pipe.classes_

array(['balance_check', 'bpcl', 'electricity_payment', 'emi_collect_full',
       'emi_collect_partial', 'fastag_recharge', 'gas_payment',
       'insurance_renewal', 'mobile_recharge_postpaid',
       'mobile_recharge_prepaid', 'p2p_transfer', 'unknown',
       'upi_creation'], dtype=object)

In [36]:
# For every sample keep the highest probability itself ('max_prob') plus the
# (probability, class) pairs of the best ('prob1') and second-best ('prob2')
# candidates.
prob_classes = {'max_prob': [], 'prob1': [], 'prob2': []}
for row_probs in prob:
    # Pair each probability with its class label and rank the pairs from the
    # most to the least likely class (ties keep the order of pipe.classes_).
    ranked = sorted(
        zip(row_probs, pipe.classes_),
        key=lambda pair: pair[0],
        reverse=True,
    )
    top = ranked[0]
    prob_classes['max_prob'].append(top[0])
    prob_classes['prob1'].append(top)
    prob_classes['prob2'].append(ranked[1])

In [37]:
# Add 'max_prob', 'prob1' and 'prob2' to the test frame as new columns.
for column_name, column_values in prob_classes.items():
    test_df[column_name] = column_values
test_df.shape

(43, 7)

In [38]:
# Shape of the subset whose top class probability exceeds 0.7; the first
# element is the number of such high-confidence predictions.
test_df[test_df["max_prob"] > 0.7].shape

(3, 7)

In [46]:
# test_df.tail()
# test_df.loc[test_df.intent != test_df.pred]

In [48]:
# Alternative view (all correct predictions, regardless of confidence):
# test_df.drop('embedding', axis=1).loc[test_df.intent == test_df.pred]

# Correct predictions whose top probability is above 0.50, shown without the
# bulky embedding column.
test_df.drop(columns='embedding').loc[
    (test_df.max_prob > 0.50) & (test_df.intent == test_df.pred)
]

Unnamed: 0,Text,intent,pred,max_prob,prob1,prob2
12,money transfer,p2p_transfer,p2p_transfer,0.85,"(0.85, p2p_transfer)","(0.03, unknown)"
14,9 7 3 00 8 7 8 7 1 1 rupees,unknown,unknown,0.52,"(0.52, unknown)","(0.12, mobile_recharge_prepaid)"
18,do money transfer,p2p_transfer,p2p_transfer,0.7,"(0.7, p2p_transfer)","(0.11, unknown)"
20,money transfer,p2p_transfer,p2p_transfer,0.85,"(0.85, p2p_transfer)","(0.03, unknown)"
24,money transfer post,p2p_transfer,p2p_transfer,0.57,"(0.57, p2p_transfer)","(0.12, balance_check)"
25,money transfer,p2p_transfer,p2p_transfer,0.85,"(0.85, p2p_transfer)","(0.03, unknown)"
