# KUC, NLP
# LDA from scratch


In [17]:
# Libraries used to load and process sms-spam.csv
import numpy as np
import pandas as pd

In [18]:
# Load only the label column (`spam`) and the message body (`text`) from the CSV.
sms = pd.read_csv('sms-spam.csv', usecols=['spam', 'text'])
# Preview the first five rows to confirm both columns were parsed.
print(sms.head(5))

   spam                                               text
0     0  Go until jurong point, crazy.. Available only ...
1     0                      Ok lar... Joking wif u oni...
2     1  Free entry in 2 a wkly comp to win FA Cup fina...
3     0  U dun say so early hor... U c already then say...
4     0  Nah I don't think he goes to usf, he lives aro...


In [19]:
# Build one label per message: "sms<row position>", followed by "!" repeated
# `spam` times (so spam messages, flagged 1, get a single trailing "!").
# NOTE(review): `index` is only printed here; no visible cell assigns it to
# `sms.index` -- confirm whether that step is missing.
index = [f"sms{i}{'!' * j}" for i, j in enumerate(sms.spam)]
print(index[:10])

['sms0', 'sms1', 'sms2!', 'sms3', 'sms4', 'sms5!', 'sms6', 'sms7', 'sms8!', 'sms9!']


In [20]:
# Boolean view of the spam label as a NumPy array: True marks a spam message.
mask = sms['spam'].astype(bool).to_numpy()
print(mask)

[False False  True ... False False False]


In [21]:
# Store the label column as plain integers (the printed dtype below is int64).
sms['spam'] = sms['spam'].astype(int).to_numpy()
print(sms.spam)


0       0
1       0
2       1
3       0
4       0
       ..
4832    1
4833    0
4834    0
4835    0
4836    0
Name: spam, Length: 4837, dtype: int64


In [22]:
# Re-display the first five rows of the frame after the label conversion.
print(sms.head(n=5))


   spam                                               text
0     0  Go until jurong point, crazy.. Available only ...
1     0                      Ok lar... Joking wif u oni...
2     1  Free entry in 2 a wkly comp to win FA Cup fina...
3     0  U dun say so early hor... U c already then say...
4     0  Nah I don't think he goes to usf, he lives aro...


---------------

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize

# Compare two TF-IDF models.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# Model 1: TfidfVectorizer with all default settings.
tfidf_model1 = TfidfVectorizer()
# Fit on the message texts, then convert the result to a dense array via .toarray().
tfidf_docs1 = tfidf_model1.fit_transform(sms['text']).toarray()
# Shape is (number of messages, vocabulary size).
print(tfidf_docs1.shape)

(4837, 8713)


In [24]:
# Model 2: same vectorizer, but tokenizing with NLTK's casual_tokenize instead
# of the default regex, so punctuation and similar tokens enter the vocabulary.
# token_pattern=None: the regex pattern is never used once a custom tokenizer
# is supplied, and leaving it at its default makes scikit-learn emit a
# "token_pattern will not be used" UserWarning on fit. The features produced
# are unchanged.
tfidf_model2 = TfidfVectorizer(tokenizer=casual_tokenize, token_pattern=None)
# Fit on the message texts and densify; shape is (messages, vocabulary size).
tfidf_docs2 = tfidf_model2.fit_transform(raw_documents=sms.text).toarray()
print(tfidf_docs2.shape)


(4837, 9232)


In [25]:
# Peek at the first 50 vocabulary terms learned by model 1.
# (get_feature_names_out is the current name; older scikit-learn releases
# called it get_feature_names.)
vocab1 = tfidf_model1.get_feature_names_out()
print(vocab1[:50])
# First two document vectors; NumPy elides the middle of each row when printing.
print(tfidf_docs1[:2])

['00' '000' '000pes' '008704050406' '0089' '0121' '01223585236'
 '01223585334' '0125698789' '02' '0207' '02072069400' '02073162414'
 '02085076972' '021' '03' '04' '0430' '05' '050703' '0578' '06' '07'
 '07008009200' '07046744435' '07090201529' '07090298926' '07099833605'
 '07123456789' '0721072' '07732584351' '07734396839' '07742676969'
 '07753741225' '0776xxxxxxx' '07781482378' '07786200117' '077xxx' '078'
 '07801543489' '07808' '07808247860' '07808726822' '07815296484'
 '07821230901' '078498' '07880867867' '0789xxxxxxx' '07946746291'
 '0796xxxxxx']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [26]:
# Peek at the first 50 vocabulary terms learned by model 2 (casual_tokenize).
vocab2 = tfidf_model2.get_feature_names_out()
print(vocab2[:50])
# First two document vectors; NumPy elides the middle of each row when printing.
print(tfidf_docs2[:2])

['!' '"' '#' '#150' '#5000' '$' '%' '&' "'" '(' ')' '*' '+' ',' '-' '.'
 '. .' '. . .' '. . . .' '. . . . .' '. ..' '..' '.. .' '.. . . .'
 '.. ... ...' '...' '... . . . .' '/' '0' '00' '00870405040' '0089' '01'
 '0121 2025050' '01223585236' '01223585334' '01256987' '02' '02/06'
 '02/09' '0207 153 9153' '0207 153 9996' '0207-083-6089' '02072069400'
 '02073162414' '02085076972' '03' '03530150' '04' '04/09']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [27]:

# Before scoring, compute the centroid of each of the two classes
# (spam and non-spam) in TF-IDF space.
mask = sms['spam'].astype(bool).values

# Mean TF-IDF vector of each class (column-wise average over that class's rows).
spam_centroid = tfidf_docs2[mask].mean(axis=0)
nonspam_centroid = tfidf_docs2[~mask].mean(axis=0)

# Project every message onto the direction pointing from the non-spam centroid
# to the spam centroid; a larger dot product means "more spam-like".
centroid_diff = spam_centroid - nonspam_centroid
spamminess_score = tfidf_docs2.dot(centroid_diff)

In [28]:
# Print the raw scores (one per message), then the array's shape on its own line.
print(spamminess_score, spamminess_score.shape, sep='\n')

[-0.01469806 -0.02007376  0.03856095 ... -0.01014774 -0.00344281
  0.00395752]
(4837,)


In [29]:
# Scale the raw scores to a fixed range with MinMaxScaler.
# The scaler is given the scores reshaped into a single column.
score_column = spamminess_score.reshape(-1, 1)
print(score_column.shape)
from sklearn.preprocessing import MinMaxScaler
# A default-constructed scaler is used; the (n, 1) result is stored as one column.
scaler = MinMaxScaler()
sms['lda_score'] = scaler.fit_transform(score_column)



(4837, 1)


In [30]:
# Show the rescaled score column (pandas abbreviates the long Series when printing).
print(sms.lda_score)

0       0.227478
1       0.177888
2       0.718785
3       0.184565
4       0.286944
          ...   
4832    0.850649
4833    0.292753
4834    0.269454
4835    0.331306
4836    0.399573
Name: lda_score, Length: 4837, dtype: float64


In [31]:
# Predict spam (1) when the rescaled score is strictly greater than 0.5, else 0.
sms['lda_predict'] = sms['lda_score'].gt(0.5).astype(int)


--------------

In [32]:
# output the results
sms['spam lda_predict lda_score'.split()].round(2).head(20)


Unnamed: 0,spam,lda_predict,lda_score
0,0,0,0.23
1,0,0,0.18
2,1,1,0.72
3,0,0,0.18
4,0,0,0.29
5,1,1,0.55
6,0,0,0.32
7,0,0,0.5
8,1,1,0.89
9,1,1,0.77


2. When does it fail to work?

3. How do we evaluate it?