In [None]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [None]:
data = pd.read_csv('textclass.csv')

In [None]:
data.head()

Unnamed: 0,Statements,category
0,"Good morning, how did you sleep last night?",general
1,"I've had this persistent cough for a week now,...",medical
2,"Can you pass me the salt, please?",general
3,"I'm sorry, I didn't catch what you said. Can y...",general
4,What are your plans for the weekend?,general


In [None]:
data.shape

(40, 2)

In [None]:
#Checking class distribution
data.groupby('category').count()

Unnamed: 0_level_0,Statements
category,Unnamed: 1_level_1
general,20
medical,20


# Text Preprocessing

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
data['tokens'] = data['Statements'].map(lambda text:  nltk.tokenize.word_tokenize(text))

In [None]:
#Lets check tokenized text from first email

print(data['tokens'][1])

['I', "'ve", 'had', 'this', 'persistent', 'cough', 'for', 'a', 'week', 'now', ',', 'and', 'it', "'s", 'been', 'getting', 'worse', '.']


### Stop Words Removal

In [None]:
#Removing stop words

stop_words = set(nltk.corpus.stopwords.words('english'))
data['filtered_text'] = data['tokens'].map(lambda tokens: [w for w in tokens if not w in stop_words])

In [None]:
#Lets compare an email with stop words removed

print(data['tokens'][3],end='\n\n')
print(data['filtered_text'][3])


['I', "'m", 'sorry', ',', 'I', 'did', "n't", 'catch', 'what', 'you', 'said', '.', 'Can', 'you', 'repeat', 'it', '?']

['I', "'m", 'sorry', ',', 'I', "n't", 'catch', 'said', '.', 'Can', 'repeat', '?']


In [None]:
#Joining all tokens together in a string
data['filtered_text'] = data['filtered_text'].map(lambda text: ' '.join(text))

#removing apecial characters from each mail
data['filtered_text'] = data['filtered_text'].map(lambda text: re.sub('[^A-Za-z0-9]+', ' ', text))

### Lammetization

In [None]:
wnl = nltk.WordNetLemmatizer()
data['filtered_text'] = data['filtered_text'].map(lambda text: wnl.lemmatize(text))

In [None]:
data['filtered_text'][4]

'What plans weekend '

### Bag of Words

In [None]:
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(data['filtered_text'])

In [None]:
print(counts.shape)

(40, 162)


# Naive Bayes Classifier

In [None]:
classifier = MultinomialNB()
targets = data['category'].values
classifier.fit(counts, targets)

In [None]:
test = pd.read_csv('testdata.csv')

In [None]:
X_train = counts
y_train = targets
X_test = test['Statments'].values
y_test = test['category'].values

In [None]:
# Converting String to Integer
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(data['filtered_text'])
X_test= count_vectorizer.transform(test['Statments'])

In [1]:
# %%time
# from sklearn.metrics import classification_report
# y_pred = classifier.predict(X_test)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

### Direct Method

In [None]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline

df = pd.read_csv('textclass2.csv')

In [None]:
X = df['Statements'].values
y = df['category'].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 7)

In [None]:
# Converting String to Integer
cv = CountVectorizer()
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

In [None]:
print(nb.score(X_test,y_test))

0.875


In [None]:
%%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.875
              precision    recall  f1-score   support

     general       1.00      0.83      0.91         6
     medical       0.67      1.00      0.80         2

    accuracy                           0.88         8
   macro avg       0.83      0.92      0.85         8
weighted avg       0.92      0.88      0.88         8

CPU times: user 10.3 ms, sys: 3.16 ms, total: 13.4 ms
Wall time: 14 ms
