In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the tab-separated SMS corpus; the first line of the file holds the column names.
spam = pd.read_csv('./data/SMSSpamCollection.txt', sep='\t', header=0)
spam.head()

Unnamed: 0,classe,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Check which container type the loader returned.
type(spam)

pandas.core.frame.DataFrame

In [5]:
# (number of messages, number of columns) in the loaded corpus.
spam.shape

(5572, 2)

In [6]:
# Per-column summary: non-null count, distinct values, most frequent value and its frequency.
spam.describe()

Unnamed: 0,classe,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
# Class distribution: number of messages carrying each label.
pd.crosstab(spam['classe'], 'count')

col_0,count
classe,Unnamed: 1_level_1
ham,4825
spam,747


In [8]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split on the class label: 3572 messages go to training,
# the remainder to testing. The fixed random_state makes the partition repeatable.
spamTrain, spamTest = train_test_split(
    spam,
    stratify=spam['classe'],
    train_size=3572,
    random_state=1,
)

In [9]:
# Relative class frequencies in the training split.
freqTrain = pd.crosstab(spamTrain['classe'], 'count')
freqTrain.div(freqTrain.sum())

col_0,count
classe,Unnamed: 1_level_1
ham,0.865901
spam,0.134099


In [10]:
# Relative class frequencies in the test split (should mirror the training split).
freqTest = pd.crosstab(spamTest['classe'], 'count')
freqTest.div(freqTest.sum())

col_0,count
classe,Unnamed: 1_level_1
ham,0.866
spam,0.134


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words encoder (term present / absent) that drops English stop words.
vectorizer = CountVectorizer(stop_words='english', binary=True)

# Learn the vocabulary from the training messages only, and encode them.
XTrain = vectorizer.fit_transform(raw_documents=spamTrain['message'])

In [12]:
# Vocabulary learned from the training messages.
vectorizer.get_feature_names_out()

array(['00', '000', '000pes', ..., 'zouk', 'zyada', '〨ud'], dtype=object)

In [13]:
# Number of distinct terms in the vocabulary.
vectorizer.get_feature_names_out().size

6536

In [14]:
# Dense NumPy copy of the training document-term matrix.
mdtTrain = XTrain.toarray()

In [15]:
# Confirm the dense matrix is a plain NumPy array.
type(mdtTrain)

numpy.ndarray

In [16]:
# (training messages, vocabulary terms).
mdtTrain.shape

(3572, 6536)

In [17]:
# Column sums of the binary matrix: how many training messages contain each term.
word_freq = mdtTrain.sum(axis=0)
word_freq

array([ 7, 16,  1, ...,  1,  1,  1], dtype=int64)

In [18]:
# Term positions ordered from the least to the most frequent term.
index = word_freq.argsort()
index

array([3267, 3729, 3726, ..., 4143, 6078, 3266], dtype=int64)

In [19]:
# Ten most frequent terms, shown in ascending order of document frequency.
printstring = {
    'word': vectorizer.get_feature_names_out()[index],
    'freq': word_freq[index],
}
pd.DataFrame(printstring).tail(10)

Unnamed: 0,word,freq
6526,gt,151
6527,lt,152
6528,good,153
6529,got,157
6530,know,159
6531,like,159
6532,ll,161
6533,ok,169
6534,ur,188
6535,just,241


In [20]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with default settings,
# trained on the encoded training messages and their labels.
model = LogisticRegression()
model.fit(X=XTrain, y=spamTrain['classe'])

In [21]:
# Shape of the coefficient array: a single row with one weight per vocabulary term.
model.coef_.shape

(1, 6536)

In [22]:
# Intercept of the fitted model.
model.intercept_

array([-4.04393375])

In [23]:
# Encode the test messages with the vocabulary already fitted on the training set
# (transform only, no refit), so the columns line up with the model's coefficients.
mdtTest = vectorizer.transform(spamTest['message'])
mdtTest.shape

(2000, 6536)

In [24]:
# Predicted class label for every test message.
predTest = model.predict(mdtTest)

In [25]:
from sklearn import metrics

# Confusion matrix on the test set (rows: actual class, columns: predicted class).
mcTest = metrics.confusion_matrix(y_true=spamTest['classe'], y_pred=predTest)
mcTest

array([[1732,    0],
       [  42,  226]], dtype=int64)

In [26]:
# recall: share of the actual spam messages that the model flagged as spam
metrics.recall_score(y_true=spamTest['classe'], y_pred=predTest, pos_label='spam')


0.8432835820895522

In [27]:
# precision: share of the messages flagged as spam that really are spam
metrics.precision_score(y_true=spamTest['classe'], y_pred=predTest, pos_label='spam')

1.0

In [28]:
# F1 score: harmonic mean of precision and recall for the spam class
metrics.f1_score(y_true=spamTest['classe'], y_pred=predTest, pos_label='spam')

0.9149797570850203

In [29]:
# accuracy: share of all test messages classified correctly
metrics.accuracy_score(y_true=spamTest['classe'], y_pred=predTest)

0.979

## Reduce the vocabulary: keep only terms that appear in at least 10 training messages

In [30]:
# Same binary encoding as the first vectorizer, with one extra filter:
# terms found in fewer than 10 training messages are discarded (min_df=10).
BisParser = CountVectorizer(stop_words='english', binary=True, min_df=10)
XTrainBis = BisParser.fit_transform(raw_documents=spamTrain['message'])
# Size of the reduced vocabulary.
BisParser.get_feature_names_out().size

541

In [31]:
# Dense copy of the reduced training matrix.
mdtTrainBis = XTrainBis.toarray()

# Refit logistic regression on the reduced vocabulary.
modelBis = LogisticRegression()
modelBis.fit(X=XTrainBis, y=spamTrain['classe'])

# Encode the test messages with the reduced vocabulary and predict their class.
mdtTestBis = BisParser.transform(raw_documents=spamTest['message'])
predTestBis = modelBis.predict(mdtTestBis)

# Confusion matrix on the test set (rows: actual class, columns: predicted class).
mcTestBis = metrics.confusion_matrix(y_true=spamTest['classe'], y_pred=predTestBis)
mcTestBis

array([[1731,    1],
       [  37,  231]], dtype=int64)

In [32]:
# recall of the reduced-vocabulary model on the spam class
metrics.recall_score(y_true=spamTest['classe'], y_pred=predTestBis, pos_label='spam')

0.8619402985074627

In [33]:
# precision of the reduced-vocabulary model on the spam class
metrics.precision_score(y_true=spamTest['classe'], y_pred=predTestBis, pos_label='spam')

0.9956896551724138

In [34]:
# F1 score of the reduced-vocabulary model on the spam class
metrics.f1_score(y_true=spamTest['classe'], y_pred=predTestBis, pos_label='spam')

0.9239999999999999

In [35]:
# accuracy of the reduced-vocabulary model over all test messages
metrics.accuracy_score(y_true=spamTest['classe'], y_pred=predTestBis)

0.981

In [39]:
# Magnitude of each term's coefficient in the reduced-vocabulary model.
coef_abs = np.abs(modelBis.coef_[0])

# Minimum, first quartile, median, 90th percentile and maximum of those magnitudes.
thresholds = np.percentile(coef_abs, [0, 25, 50, 90, 100])
thresholds

array([2.37540153e-03, 1.77540082e-01, 2.95911402e-01, 1.05613226e+00,
       2.73460362e+00])

In [40]:
# Keep the terms whose coefficient magnitude exceeds the first quartile.
# The result is a one-element tuple; index[0] holds the selected column positions.
index = np.nonzero(coef_abs > thresholds[1])
index[0].size

405

In [41]:
# Restrict the training and test matrices to the selected term columns.
mdtTrainTer, mdtTestTer = mdtTrainBis[:, index[0]], mdtTestBis[:, index[0]]

# Both matrices must end up with the same number of columns.
print(mdtTrainTer.shape, mdtTestTer.shape, sep='\n')

(3572, 405)
(2000, 405)


In [44]:
# Train a fresh logistic regression restricted to the selected terms.
modelTer = LogisticRegression().fit(X=mdtTrainTer, y=spamTrain['classe'])

# Predict the class of every test message.
predTestTer = modelTer.predict(mdtTestTer)

# Confusion matrix (rows: actual class, columns: predicted class).
mcTestTer = metrics.confusion_matrix(y_true=spamTest['classe'], y_pred=predTestTer)
print(mcTestTer)

# Recall, precision and F1 for the spam class, then overall accuracy.
print(metrics.recall_score(y_true=spamTest['classe'], y_pred=predTestTer, pos_label='spam'))
print(metrics.precision_score(y_true=spamTest['classe'], y_pred=predTestTer, pos_label='spam'))
print(metrics.f1_score(y_true=spamTest['classe'], y_pred=predTestTer, pos_label='spam'))
print(metrics.accuracy_score(y_true=spamTest['classe'], y_pred=predTestTer))

[[1731    1]
 [  37  231]]
0.8619402985074627
0.9956896551724138
0.9239999999999999
0.981


In [51]:
# NOTE(review): pandas is already imported as `pd` at the top of the notebook;
# this second import is kept because the cell uses the full module name.
import pandas

# Names of the terms retained by the coefficient-magnitude filter.
sel_terms = BisParser.get_feature_names_out()[index[0]]
# Positions of the coefficients ordered by increasing absolute value.
sorted_indices = np.abs(modelTer.coef_[0]).argsort()
# Terms paired with their signed coefficients, weakest influence first.
imp = {
    'term': sel_terms[sorted_indices],
    'coef': modelTer.coef_[0, sorted_indices],
}

print(pandas.DataFrame(imp))


        term      coef
0       best -0.152671
1       true  0.169354
2        sad -0.175941
3      haven -0.177285
4         im -0.177623
..       ...       ...
400      new  2.045639
401     150p  2.071956
402  service  2.222191
403    claim  2.273162
404       uk  2.744857

[405 rows x 2 columns]


In [55]:
# A hand-written example message, wrapped in a list because the vectorizer
# expects a collection of documents.
doc = ['this is a new free service for you only']

# Encode it with the reduced vocabulary (one row, one column per vocabulary term).
desc = BisParser.transform(doc)
desc.shape

(1, 541)

In [56]:
# Vocabulary terms that actually occur in the example message.
print(BisParser.get_feature_names_out()[desc.indices])


['free' 'new' 'service']


In [57]:
# Dense NumPy version of the encoded example message.
dense_desc = desc.toarray()
dense_desc.shape

(1, 541)