In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import re

In [4]:
df=pd.read_csv("../data/intentData.csv")

In [5]:
df.head()

Unnamed: 0,Text,Intent
0,How do I file a claim for my car insurance?,Complaint
1,What are the eligibility criteria for a loan?,Enquiry
2,Can you explain the benefits of this credit card?,Enquiry
3,I want to inquire about my insurance coverage.,Enquiry
4,My credit card statement has an incorrect charge.,Complaint


In [6]:
print(df.Intent.value_counts())
print(df.shape)

Intent
Enquiry         268
Complaint       264
General Talk    137
Name: count, dtype: int64
(669, 2)


In [7]:
df['Text']=df['Text'].apply(lambda x : x.lower())
df["Text"]=df["Text"].apply(lambda x: re.sub(r'[^\w]+'," ",x))
df["Text"]=df["Text"].apply(lambda x: re.sub("\d+", "", x))
df["Text"]=df["Text"].apply(lambda x: ' '.join(x.split())) 

In [8]:
df=df.sample(len(df),random_state=22)
df["Intent"].value_counts()

Intent
Enquiry         268
Complaint       264
General Talk    137
Name: count, dtype: int64

In [9]:
multilabel=MultiLabelBinarizer()
y=multilabel.fit_transform(df['Intent'])

In [10]:
one_hot_encoded_data = pd.get_dummies(df, columns = ['Intent'])

In [11]:
y=one_hot_encoded_data.drop(columns=["Text","Intent_Complaint"])
y

Unnamed: 0,Intent_Enquiry,Intent_General Talk
28,False,False
218,True,False
59,True,False
31,False,False
250,True,False
...,...,...
491,False,False
502,True,False
358,True,False
356,False,False


In [12]:
tfidf=TfidfVectorizer(analyzer='word',max_features=50,ngram_range=(1,3),stop_words='english')
x=tfidf.fit_transform(df['Text'])

In [13]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=3)

In [14]:
xtrain.shape,xtest.shape,ytrain.shape,ytest.shape

((535, 50), (134, 50), (535, 2), (134, 2))

In [15]:
lr=LogisticRegression(solver='liblinear',penalty='l2')
from sklearn.multiclass import OneVsRestClassifier

In [16]:
clf=OneVsRestClassifier(lr)
clf.fit(xtrain,ytrain)

In [17]:
x=['My savings account withdrawal was not processed correctly.']
xt=tfidf.transform(x)
clf.predict(xt)

array([[0, 0]])

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
accuracy_score(ytest,clf.predict(xtest))

0.8059701492537313

In [20]:
random_state=12
import pickle as pkl

In [21]:
pkl.dump(clf,open('../models/intent_classification.pkl','wb'))
pkl.dump(tfidf,open('../models/intent_classification_tfidf.pkl','wb'))