In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import svm
import joblib

from sklearn.utils import shuffle
df = pd.read_csv("PropertyDatasetCSV.csv")
df.columns = ["intent", "utterance"]
df = shuffle(df, random_state=42).reset_index(drop=True)
y = df['intent']
X = df['utterance']
df.head()

Unnamed: 0,intent,utterance
0,showGratitude,thanks so much
1,searchProperties,property in tokyo with wi-fi
2,searchProperties,any apartments available in munich from july 1...
3,searchProperties,in phoenix
4,getInformation,list all the locations


### Preprocess

In [2]:
def preprocess(text):
    text = text.lower()
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    return text
    
df['utterance'] = df['utterance'].apply(preprocess)

### TF-IDF

In [3]:
vectorizer = TfidfVectorizer(stop_words=None)
X_tfidf = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

### Logistic Regression

In [4]:
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state = 42, C=100)
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)
lr_y_prob = lr_model.predict_proba(X_test)

lr_accuracy = accuracy_score(y_test, lr_y_pred)

predicted_class_confidences = np.max(lr_y_prob, axis=1)
average_confidence = np.mean(predicted_class_confidences)

print(f"Overall accuracy: {lr_accuracy:.2f}")
print(f"Average Confidence: {average_confidence:.2f}")

Overall accuracy: 0.95
Average Confidence: 0.92


In [5]:
test = ["show me places in tokyo"]
test_idf = vectorizer.transform(test)
probs = lr_model.predict_proba(test_idf)[0]
predicted_intent = lr_model.classes_[probs.argmax()]
predicted_confidence = probs.max()

if predicted_confidence < 0.70:
    print("IDK what you mean, sorry")
    print(f"Confidence: {predicted_confidence:.2f}")
else:
    print(f"Predicted intent: {predicted_intent}")
    print(f"Confidence: {predicted_confidence:.2f}")

Predicted intent: searchProperties
Confidence: 0.99


In [6]:
joblib.dump(lr_model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']