In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
sent = ['this pasta is very tasty and affordable', 'this pasta is not very tasty and is affordable', 'this pasta is very very delicious']
sent1 = ['petrol cars are cheaper than diesel cars', 'diesel is cheaper than petrol']

In [3]:
tfidf = TfidfVectorizer()
vectors = tfidf.fit_transform(sent1)
feature_names = tfidf.get_feature_names()
print(feature_names)

['are', 'cars', 'cheaper', 'diesel', 'is', 'petrol', 'than']


In [4]:
matrix = vectors.todense()
denselist = matrix.tolist()
df = pd.DataFrame(denselist, columns = feature_names)
df

Unnamed: 0,are,cars,cheaper,diesel,is,petrol,than
0,0.377292,0.754584,0.268446,0.268446,0.0,0.268446,0.268446
1,0.0,0.0,0.40909,0.40909,0.574962,0.40909,0.40909


In [5]:
d1 = 'petrol cars are cheaper than diesel cars'
d2 = 'diesel is cheaper than petrol'
doc = [d1, d2]
vec = TfidfVectorizer(stop_words = 'english')
matrix = vec.fit_transform(doc)
feature_names = tfidf.get_feature_names_out()
print(feature_names)
sparse_matrix = matrix.toarray()
print(sparse_matrix)

['are' 'cars' 'cheaper' 'diesel' 'is' 'petrol' 'than']
[[0.85135433 0.30287281 0.30287281 0.30287281]
 [0.         0.57735027 0.57735027 0.57735027]]


##Spam classification using tf-idf

In [8]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", 
                  encoding='ISO-8859-1', usecols=['v1', 'v2'])
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
import nltk
import numpy as np
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize as stop_words
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer as wordnet
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [10]:
# pre-processing
corpus = []
wordnet = wordnet()
length = len(df['v2'])
for i in range(length):
  rev = re.sub('[^a-zA-Z]', ' ', df['v2'][i])
  rev = rev.lower()
  rev = rev.split()
  rev = [wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')]
  rev = ' '.join(rev)
  corpus.append(rev)

In [16]:
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(corpus).toarray()
y = df['v1']

In [19]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

In [20]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
x.shape

(5572, 7021)

In [22]:
# label encoding y
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 0, 1, ..., 0, 0, 0])

In [23]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [24]:
# naive-bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
model = MultinomialNB()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc = model.score(x_test, y_test)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy for naive bayes model - ", acc)
print("Confusion matrix for naive bayes model - \n", cm)

Accuracy for naive bayes model -  0.9775784753363229
Confusion matrix for naive bayes model - 
 [[975   1]
 [ 24 115]]


In [25]:
# logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
acc = lr.score(x_test, y_test)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy for logistic regression model - ", acc)
print("Confusion matrix for logistic regression model - \n", cm)

Accuracy for logistic regression model -  0.9695067264573991
Confusion matrix for logistic regression model - 
 [[971   5]
 [ 29 110]]


In [26]:
# support vector machine
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = clf.score(x_test, y_test)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy for SVM model - ", acc)
print("Confusion matrix for SVM model - \n", cm)

Accuracy for SVM model -  0.9910313901345291
Confusion matrix for SVM model - 
 [[975   1]
 [  9 130]]
