In [14]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords


import warnings
warnings.filterwarnings('ignore')

# problem statement 

# data collection

In [2]:
# Load the tab-separated training set. The delimiter is passed by keyword because
# pandas deprecated positional arguments after the file path and pandas 2.x
# rejects them with a TypeError.
data = pd.read_csv('Helpdesk Sample Emails - Train Set.tsv', sep='\t')
data

Unnamed: 0,Target Category,Email Body
0,Desktop Hardware,Hey IT support team - my monitor is not turnin...
1,Desktop Software,Hey - I just got a new laptop and I need some ...
2,Networking,IT Support team - my wireless internet keeps g...
3,Account Access,Hey - we just got a new team member on the ma...
4,General Question,Hey how do I delete my internet cookies? I saw...
...,...,...
221,Networking,Team! I need a new LAN Cable. Please could you...
222,Networking,Im trying to connect my phone to the wifi netw...
223,Networking,Im going to be traveling to a conference soon ...
224,Networking,The network cable on the back of my machine ke...


In [3]:
data.shape

(226, 2)

# text cleaning and preprocessing

### 1.tokenization 
### 2.removing stopwords 
### 3.removing punctuation 
### 4.sentence to words 
### 5.words to their basic form using stemming and lemmatization

In [4]:
import re 
from nltk.corpus import stopwords

## Stemming

In [5]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [6]:
# cleaning using re --- lowering -- removing stopwords ---- stemming 
def preprocess(data):
    """Clean and stem every email body in ``data``.

    Each value of the ``'Email Body'`` column has every character that is not
    an ASCII letter or digit replaced by a space, is lower-cased, split on
    whitespace, filtered against NLTK's English stop-word list and stemmed
    with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw text in an ``'Email Body'`` column.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per row, in row order.
    """
    # Build the stop-word set once instead of once per token.
    stop_words = set(stopwords.words('english'))

    corpus = []
    # Iterate over the values themselves: ``data['Email Body'][i]`` is a label
    # lookup and raises KeyError when the index is not 0..n-1 (e.g. after
    # filtering or shuffling the frame).
    for body in data['Email Body']:
        tokens = re.sub('[^a-zA-Z0-9]', ' ', body).lower().split()
        corpus.append(' '.join(ps.stem(token) for token in tokens if token not in stop_words))
    return corpus
   

In [7]:
corpus = preprocess(data)

## Lemmatization

In [8]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

In [9]:
# cleaning using re --- lowering -- removing stopwords ---- lemmatizing 

def preprocess_lmtiz(data):
    """Clean and lemmatize every email body in ``data``.

    Each value of the ``'Email Body'`` column has every character that is not
    an ASCII letter or digit replaced by a space, is lower-cased, split on
    whitespace, filtered against NLTK's English stop-word list and reduced to
    its lemma with the module-level WordNet lemmatizer ``wl``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw text in an ``'Email Body'`` column.

    Returns
    -------
    list of str
        One space-joined string of lemmatized tokens per row, in row order.
    """
    # Build the stop-word set once instead of once per token.
    stop_words = set(stopwords.words('english'))

    cleaned = []
    # Iterate over the values themselves: ``data['Email Body'][i]`` is a label
    # lookup and raises KeyError when the index is not 0..n-1 (e.g. after
    # filtering or shuffling the frame).
    for body in data['Email Body']:
        tokens = re.sub('[^a-zA-Z0-9]', ' ', body).lower().split()
        cleaned.append(' '.join(wl.lemmatize(token) for token in tokens if token not in stop_words))
    return cleaned

In [10]:
c= preprocess_lmtiz(data)

In [11]:
# Store both cleaned variants next to the raw text. The column names keep their
# original spelling because the vectorizer cell further down looks them up by name.
data['stemed']=corpus
data['lematized'] = c

## Word Embedding -- word to vector 

In [12]:
data

Unnamed: 0,Target Category,Email Body,stemed,lematized
0,Desktop Hardware,Hey IT support team - my monitor is not turnin...,hey support team monitor turn tri plug still n...,hey support team monitor turning tried plug st...
1,Desktop Software,Hey - I just got a new laptop and I need some ...,hey got new laptop need softwar old laptop som...,hey got new laptop need software old laptop so...
2,Networking,IT Support team - my wireless internet keeps g...,support team wireless internet keep go reset r...,support team wireless internet keep going rese...
3,Account Access,Hey - we just got a new team member on the ma...,hey got new team member market team tri get se...,hey got new team member marketing team trying ...
4,General Question,Hey how do I delete my internet cookies? I saw...,hey delet internet cooki saw movi netflix larg...,hey delete internet cooky saw movie netflix la...
...,...,...,...,...
221,Networking,Team! I need a new LAN Cable. Please could you...,team need new lan cabl pleas could send,team need new lan cable please could send
222,Networking,Im trying to connect my phone to the wifi netw...,im tri connect phone wifi network work keep dr...,im trying connect phone wifi network work keep...
223,Networking,Im going to be traveling to a conference soon ...,im go travel confer soon need mobil hotspot st...,im going traveling conference soon need mobile...
224,Networking,The network cable on the back of my machine ke...,network cabl back machin keep come unplug caus...,network cable back machine keep coming unplugg...


In [16]:
from sklearn.feature_extraction.text import  TfidfVectorizer
tf = TfidfVectorizer()

In [91]:
# Build the TF-IDF feature matrix (`tf` is a TfidfVectorizer, not a plain
# bag-of-words count model) and densify it with .toarray().
# NOTE(review): the vectorizer is fitted on every row before the train/test split
# performed later in the notebook, so its vocabulary and IDF weights also reflect
# the held-out rows -- consider fitting on the training portion only.

# Alternative input: the stemmed text instead of the lemmatized text.
#x = tf.fit_transform(data['stemed']).toarray()
x = tf.fit_transform(data['lematized']).toarray()
x

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.23556668, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [92]:
y = data['Target Category']
y

0      Desktop Hardware
1      Desktop Software
2            Networking
3        Account Access
4      General Question
             ...       
221          Networking
222          Networking
223          Networking
224          Networking
225          Networking
Name: Target Category, Length: 226, dtype: object

# model building 

### 1. K- nearest Neighbour

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
# Hold out 20% of the rows for testing. The split is stratified on the label so the
# category mix is kept in both parts, and the seed is fixed so the split is repeatable.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0,stratify=y)

In [58]:
from sklearn.neighbors import KNeighborsClassifier

In [59]:
classifier = KNeighborsClassifier()
classifier.fit(x_train,y_train)

In [60]:
y_pred = classifier.predict(x_test)

### Evaluation

In [61]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report 

In [62]:
matrix = confusion_matrix(y_test,y_pred)

In [63]:
matrix

array([[10,  0,  1,  0,  0],
       [ 2,  3,  1,  1,  1],
       [ 6,  0,  7,  0,  0],
       [ 2,  1,  3,  1,  0],
       [ 1,  0,  1,  0,  5]], dtype=int64)

In [64]:
accuracy_score(y_test,y_pred)

0.5652173913043478

In [65]:
cr = classification_report(y_test,y_pred)
print(cr)

                  precision    recall  f1-score   support

  Account Access       0.48      0.91      0.62        11
Desktop Hardware       0.75      0.38      0.50         8
Desktop Software       0.54      0.54      0.54        13
General Question       0.50      0.14      0.22         7
      Networking       0.83      0.71      0.77         7

        accuracy                           0.57        46
       macro avg       0.62      0.54      0.53        46
    weighted avg       0.60      0.57      0.54        46



### 2 . Naive Bayes

#### A. Gaussian NB  - continuous bag of words

In [66]:
from sklearn.naive_bayes  import GaussianNB

In [67]:
nb = GaussianNB()

In [68]:
nb.fit(x_train,y_train)

In [69]:
nb_y = nb.predict(x_test)

In [70]:
# Ground truth first, predictions second: the argument order scikit-learn documents
# and the one the KNN, Bernoulli NB and SVM cells use. The accuracy value is unchanged.
acc1 = accuracy_score(y_test, nb_y)

In [71]:
acc1

0.5652173913043478

In [72]:
# NOTE(review): the predictions are passed first here, whereas the KNN evaluation
# passes y_test first. scikit-learn treats the first argument as the true labels, so
# in the matrix below rows correspond to predicted classes and columns to actual
# classes (the transpose of the KNN matrix layout).
confusion_matrix(nb_y,y_test)

array([[5, 0, 4, 1, 0],
       [0, 5, 1, 1, 0],
       [3, 1, 5, 1, 0],
       [2, 0, 1, 4, 0],
       [1, 2, 2, 0, 7]], dtype=int64)

#### B.   Multinomial NB   ----- TF-IDF

In [73]:
from sklearn.naive_bayes import MultinomialNB

In [74]:
mnnb = MultinomialNB()

In [75]:
mnnb.fit(x_train,y_train)

In [76]:
mnnb_y = mnnb.predict(x_test)

In [77]:
# Ground truth first, predictions second: the argument order scikit-learn documents
# and the one the KNN, Bernoulli NB and SVM cells use. The accuracy value is unchanged.
acc2 = accuracy_score(y_test, mnnb_y)

In [78]:
acc2

0.45652173913043476

#### C.   Bernoulli NB - Bag of Words

In [79]:
from sklearn.naive_bayes import BernoulliNB

In [80]:
brnb = BernoulliNB()

In [81]:
brnb.fit(x_train,y_train)

In [82]:
brnb_y = brnb.predict(x_test)

In [83]:
acc3 = accuracy_score(y_test,brnb_y)

In [84]:
acc3

0.41304347826086957

### 3.SVM --- support vector machine 

In [85]:
from sklearn.svm import SVC

In [86]:
svc = SVC()

In [87]:
svc.fit(x_train,y_train)

In [88]:
svc_pred = svc.predict(x_test)

In [89]:
svc_acc = accuracy_score(y_test,svc_pred)

In [90]:
svc_acc

0.43478260869565216

In [97]:
data

Unnamed: 0,Target Category,Email Body,stemed,lematized
0,Desktop Hardware,Hey IT support team - my monitor is not turnin...,hey support team monitor turn tri plug still n...,hey support team monitor turning tried plug st...
1,Desktop Software,Hey - I just got a new laptop and I need some ...,hey got new laptop need softwar old laptop som...,hey got new laptop need software old laptop so...
2,Networking,IT Support team - my wireless internet keeps g...,support team wireless internet keep go reset r...,support team wireless internet keep going rese...
3,Account Access,Hey - we just got a new team member on the ma...,hey got new team member market team tri get se...,hey got new team member marketing team trying ...
4,General Question,Hey how do I delete my internet cookies? I saw...,hey delet internet cooki saw movi netflix larg...,hey delete internet cooky saw movie netflix la...
...,...,...,...,...
221,Networking,Team! I need a new LAN Cable. Please could you...,team need new lan cabl pleas could send,team need new lan cable please could send
222,Networking,Im trying to connect my phone to the wifi netw...,im tri connect phone wifi network work keep dr...,im trying connect phone wifi network work keep...
223,Networking,Im going to be traveling to a conference soon ...,im go travel confer soon need mobil hotspot st...,im going traveling conference soon need mobile...
224,Networking,The network cable on the back of my machine ke...,network cabl back machin keep come unplug caus...,network cable back machine keep coming unplugg...


In [106]:
y = pd.get_dummies(data['Target Category'])
y

Unnamed: 0,Account Access,Desktop Hardware,Desktop Software,General Question,Networking
0,0,1,0,0,0
1,0,0,1,0,0
2,0,0,0,0,1
3,1,0,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
221,0,0,0,0,1
222,0,0,0,0,1
223,0,0,0,0,1
224,0,0,0,0,1


In [104]:
# Binary target: 1 where the email's category is 'Desktop Hardware', else 0.
# The column is selected by name and rebuilt from `data`, so the cell gives the same
# result however often it is run. The previous `y.iloc[:, 1]` relied on the column
# position and failed on a second run, once `y` had already become a NumPy array.
y = pd.get_dummies(data['Target Category'])['Desktop Hardware'].values

In [105]:
y

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=uint8)