In [9]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB

In [10]:
# Read the SMS spam CSV, decoding the file as latin-1.
data = pd.read_csv(filepath_or_buffer='data/spam.csv', encoding='latin-1')
# Summarise the columns: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [11]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
# Column 'v2' holds the message text; keep it as the feature Series.
source = data.loc[:, 'v2']
type(source)

pandas.core.series.Series

In [13]:
# Preview the first entries of the text column.
source.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [14]:
# Column 'v1' holds the ham/spam label; keep it as the target Series.
target = data.loc[:, 'v1']
type(target)

pandas.core.series.Series

In [15]:
# Encode the label as one indicator column named 'spam' (0: ham, 1: spam);
# drop_first=True drops the redundant 'ham' column.
# Read from data['v1'] rather than from `target` itself so that re-running this
# cell always starts from the raw labels.
# dtype is pinned to uint8 because pandas >= 2.0 defaults to bool, which would
# yield True/False instead of the 0/1 encoding documented above.
target = pd.get_dummies(data['v1'], drop_first=True, dtype='uint8')
target.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [16]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Instantiate CountVectorizer, dropping scikit-learn's built-in English stop words
cv = CountVectorizer(stop_words='english')
cv

CountVectorizer(stop_words='english')

In [17]:
# Learn the vocabulary from the message texts (no transformation yet).
cv.fit(source)

CountVectorizer(stop_words='english')

In [18]:
# Mapping of each learned token to its column index in the count matrix.
cv.vocabulary_

{'jurong': 4224,
 'point': 5741,
 'crazy': 2271,
 'available': 1271,
 'bugis': 1703,
 'great': 3534,
 'world': 8227,
 'la': 4349,
 'buffet': 1701,
 'cine': 1994,
 'got': 3494,
 'amore': 1051,
 'wat': 8026,
 'ok': 5343,
 'lar': 4385,
 'joking': 4192,
 'wif': 8134,
 'oni': 5369,
 'free': 3265,
 'entry': 2875,
 'wkly': 8185,
 'comp': 2110,
 'win': 8146,
 'fa': 3005,
 'cup': 2329,
 'final': 3121,
 'tkts': 7519,
 '21st': 411,
 '2005': 402,
 'text': 7388,
 '87121': 784,
 'receive': 6115,
 'question': 6010,
 'std': 7028,
 'txt': 7701,
 'rate': 6062,
 'apply': 1128,
 '08452810075over18': 77,
 'dun': 2738,
 'say': 6450,
 'early': 2757,
 'hor': 3815,
 'nah': 5092,
 'don': 2651,
 'think': 7443,
 'goes': 3458,
 'usf': 7837,
 'lives': 4535,
 'freemsg': 3272,
 'hey': 3732,
 'darling': 2386,
 'week': 8071,
 'word': 8218,
 'like': 4485,
 'fun': 3323,
 'tb': 7323,
 'xxx': 8292,
 'chgs': 1948,
 'send': 6536,
 '50': 607,
 'rcv': 6074,
 'brother': 1674,
 'speak': 6910,
 'treat': 7634,
 'aids': 985,
 'pate

In [20]:
# Apply the fitted vectorizer to get the document-term count matrix
cv_transformed = cv.transform(source)
# Convert it to a dense NumPy array (nothing is printed in this cell)
cv_array = cv_transformed.toarray()

In [21]:
# (number of messages, vocabulary size)
cv_array.shape

(5572, 8404)

In [22]:
from scipy import sparse
# Wrap the first message's dense count row as a CSR matrix so that printing it
# lists only the non-zero (row, column) entries with their counts.
a0 = sparse.csr_matrix(cv_array[0])
print(a0)

  (0, 1051)	1
  (0, 1271)	1
  (0, 1701)	1
  (0, 1703)	1
  (0, 1994)	1
  (0, 2271)	1
  (0, 3494)	1
  (0, 3534)	1
  (0, 4224)	1
  (0, 4349)	1
  (0, 5741)	1
  (0, 8026)	1
  (0, 8227)	1


### Với TF-IDF

In [23]:
# Bring in the TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate it: cap the vocabulary at 500 features and drop English stop words
tv = TfidfVectorizer(stop_words='english', max_features=500)
tv

TfidfVectorizer(max_features=500, stop_words='english')

In [24]:
# Fit the vectorizer on the messages; the transformed matrix is only displayed
# here, not stored in a variable.
tv.fit_transform(source)

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [25]:
# Mapping of each retained token to its column index in the TF-IDF matrix.
tv.vocabulary_

{'available': 42,
 'great': 177,
 'world': 484,
 'got': 175,
 'wat': 461,
 'ok': 301,
 'lar': 224,
 'wif': 472,
 'free': 157,
 'entry': 143,
 'win': 475,
 'final': 150,
 'text': 405,
 'receive': 345,
 'question': 338,
 'txt': 441,
 'rate': 340,
 'apply': 36,
 'dun': 135,
 'say': 360,
 'early': 137,
 'don': 125,
 'think': 413,
 'goes': 170,
 'hey': 199,
 'week': 465,
 'word': 480,
 'like': 238,
 'fun': 162,
 'xxx': 489,
 'send': 367,
 '50': 16,
 'brother': 65,
 'speak': 390,
 'set': 371,
 'friends': 159,
 'network': 287,
 'customer': 105,
 'selected': 366,
 'prize': 334,
 'claim': 86,
 'code': 90,
 'valid': 448,
 'hours': 206,
 'mobile': 276,
 '11': 5,
 'update': 444,
 'latest': 227,
 'colour': 93,
 'camera': 74,
 'gonna': 173,
 'home': 202,
 'soon': 387,
 'want': 458,
 'talk': 402,
 'stuff': 398,
 'tonight': 430,
 've': 449,
 'today': 423,
 'cash': 77,
 '100': 2,
 '000': 0,
 'pounds': 330,
 'cost': 102,
 '150p': 8,
 'day': 110,
 '16': 10,
 'reply': 347,
 'urgent': 446,
 'won': 478,
 'w

In [27]:
# Apply the fitted vectorizer to get the TF-IDF matrix
tv_transformed = tv.transform(source)
# Convert it to a dense NumPy array (nothing is printed in this cell)
tv_array = tv_transformed.toarray()

In [28]:
# NumPy abbreviates large arrays, so only the corner values are shown.
print(tv_array)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [30]:
# Create a DataFrame with the TF-IDF features, one 'TFIDF_'-prefixed column per term.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that predate it.
tv_df = pd.DataFrame(
    tv_array,
    columns=(
        tv.get_feature_names_out()
        if hasattr(tv, 'get_feature_names_out')
        else tv.get_feature_names()
    ),
).add_prefix('TFIDF_')
tv_df.head()

Unnamed: 0,TFIDF_000,TFIDF_10,TFIDF_100,TFIDF_1000,TFIDF_10p,TFIDF_11,TFIDF_12hrs,TFIDF_150,TFIDF_150p,TFIDF_150ppm,...,TFIDF_ya,TFIDF_yeah,TFIDF_year,TFIDF_years,TFIDF_yes,TFIDF_yesterday,TFIDF_yo,TFIDF_yup,TFIDF_ì_,TFIDF_ìï
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Take the first message's TF-IDF row and show its five highest-weighted terms.
examine_row = tv_df.iloc[0]
print(examine_row.sort_values(ascending=False).head())

TFIDF_available    0.549238
TFIDF_world        0.496702
TFIDF_wat          0.410286
TFIDF_great        0.405632
TFIDF_got          0.344604
Name: 0, dtype: float64


### Với TF-IDF và N-grams

In [32]:
# TF-IDF over unigrams and bigrams (ngram_range=(1, 2)), English stop words removed
tv_bi_gram_vec = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
# Fit and apply bigram vectorizer
tv_bi_gram = tv_bi_gram_vec.fit_transform(source)

In [33]:
# Mapping of each learned unigram/bigram to its column index.
tv_bi_gram_vec.vocabulary_

{'jurong': 16991,
 'point': 24832,
 'crazy': 7794,
 'available': 3587,
 'bugis': 5262,
 'great': 13756,
 'world': 36201,
 'la': 17808,
 'buffet': 5257,
 'cine': 6501,
 'got': 13476,
 'amore': 2805,
 'wat': 34993,
 'jurong point': 16992,
 'point crazy': 24833,
 'crazy available': 7796,
 'available bugis': 3589,
 'bugis great': 5264,
 'great world': 13820,
 'world la': 36214,
 'la buffet': 17809,
 'buffet cine': 5258,
 'cine got': 6507,
 'got amore': 13482,
 'amore wat': 2806,
 'ok': 23058,
 'lar': 17917,
 'joking': 16908,
 'wif': 35615,
 'oni': 23306,
 'ok lar': 23122,
 'lar joking': 17927,
 'joking wif': 16911,
 'wif oni': 35628,
 'free': 12068,
 'entry': 10658,
 'wkly': 35884,
 'comp': 7173,
 'win': 35687,
 'fa': 11066,
 'cup': 7965,
 'final': 11538,
 'tkts': 32392,
 '21st': 989,
 '2005': 966,
 'text': 31311,
 '87121': 1918,
 'receive': 26189,
 'question': 25689,
 'std': 29971,
 'txt': 33275,
 'rate': 25885,
 'apply': 3072,
 '08452810075over18': 177,
 'free entry': 12107,
 'entry wkly

In [37]:
# Create a DataFrame with the unigram/bigram TF-IDF features (one column per n-gram).
# NOTE: toarray() densifies the whole matrix, which can need a lot of memory
# when the n-gram vocabulary is large.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that predate it.
tv_df = pd.DataFrame(
    tv_bi_gram.toarray(),
    columns=(
        tv_bi_gram_vec.get_feature_names_out()
        if hasattr(tv_bi_gram_vec, 'get_feature_names_out')
        else tv_bi_gram_vec.get_feature_names()
    ),
)
# Total TF-IDF weight of each n-gram across all messages
tv_sums = tv_df.sum()

In [38]:
# Summed TF-IDF weight of the first five n-grams, in column order.
print(tv_sums.head())

00           1.540984
00 easter    0.168684
00 sub       0.838168
00 subs      0.346519
000          4.507586
dtype: float64
