# KUC, NLP
# LDA from scratch


In [1]:
# import sms-spam.csv
import numpy as np
import pandas as pd

In [5]:
# Load only the two columns used below: the 'spam' label and the message 'text'.
sms = pd.read_csv(
    'sms-spam.csv',
    usecols=['spam', 'text'],
)
print(sms.head())

   spam                                               text
0     0  Go until jurong point, crazy.. Available only ...
1     0                      Ok lar... Joking wif u oni...
2     1  Free entry in 2 a wkly comp to win FA Cup fina...
3     0  U dun say so early hor... U c already then say...
4     0  Nah I don't think he goes to usf, he lives aro...


In [6]:
# Row labels of the form 'sms<N>'; '!' is repeated `spam` times, so messages
# labelled 1 get a single trailing '!'.
index = [
    'sms{}{}'.format(row, '!' * flag)
    for row, flag in enumerate(sms.spam)
]
print(index[:10])

['sms0', 'sms1', 'sms2!', 'sms3', 'sms4', 'sms5!', 'sms6', 'sms7', 'sms8!', 'sms9!']


In [7]:
# Boolean NumPy array over the rows: True where the 'spam' label is non-zero.
mask = sms['spam'].astype(bool).to_numpy()
print(mask)

[False False  True ... False False False]


In [8]:
# Write the labels back into the frame as plain integers.
sms['spam'] = sms['spam'].astype(int).to_numpy()
print(sms.spam)

0       0
1       0
2       1
3       0
4       0
       ..
4832    1
4833    0
4834    0
4835    0
4836    0
Name: spam, Length: 4837, dtype: int64


In [9]:
# Re-check the first rows of the frame after the 'spam' column was rewritten.
print(sms.head())

   spam                                               text
0     0  Go until jurong point, crazy.. Available only ...
1     0                      Ok lar... Joking wif u oni...
2     1  Free entry in 2 a wkly comp to win FA Cup fina...
3     0  U dun say so early hor... U c already then say...
4     0  Nah I don't think he goes to usf, he lives aro...


---------------

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize

# Build two TF-IDF models over the same messages and compare their vocabularies.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# 1. Default TfidfVectorizer settings (no custom tokenizer).
tfidf_model1 = TfidfVectorizer()
# fit_transform returns a sparse matrix; .toarray() converts it to a dense ndarray.
tfidf_docs1 = tfidf_model1.fit_transform(sms['text']).toarray()
print(tfidf_docs1.shape)

(4837, 8713)


In [None]:
# 2. Same vectorizer, but tokenized with NLTK's casual_tokenize.
# token_pattern=None: the default regex pattern is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" UserWarning on fit.
tfidf_model2 = TfidfVectorizer(tokenizer=casual_tokenize, token_pattern=None)
# Dense (n_messages, n_terms) array; the later centroid code indexes it with a boolean mask.
tfidf_docs2 = tfidf_model2.fit_transform(raw_documents=sms.text).toarray()
print(tfidf_docs2.shape)


In [None]:
# Model 1: the first 50 feature names of the fitted vocabulary, followed by
# the dense TF-IDF rows of the first two messages.
print(tfidf_model1.get_feature_names_out()[:50])
print(tfidf_docs1[:2])

In [None]:
# Model 2 (casual_tokenize): the first 50 feature names of the fitted
# vocabulary, followed by the dense TF-IDF rows of the first two messages.
print(tfidf_model2.get_feature_names_out()[:50])
print(tfidf_docs2[:2])

In [None]:

# First compute the centroid (mean TF-IDF vector) of each of the two classes,
# spam and non-spam.
mask = sms['spam'].astype(bool).to_numpy()

spam_centroid = tfidf_docs2[mask].mean(axis=0)
nonspam_centroid = tfidf_docs2[~mask].mean(axis=0)
# Score each message by projecting its TF-IDF vector onto the direction that
# points from the non-spam centroid to the spam centroid.
spamminess_score = tfidf_docs2 @ (spam_centroid - nonspam_centroid)

In [None]:
# Raw (unscaled) projection scores and the shape of the score array.
print(spamminess_score)
print(spamminess_score.shape)

In [None]:
# Rescale the raw scores with MinMaxScaler (default feature_range is (0, 1)).
from sklearn.preprocessing import MinMaxScaler

# The scaler expects 2-D input (n_samples, n_features), so the 1-D score
# vector is reshaped into a single-column matrix first.
print(spamminess_score.reshape(-1, 1).shape)
# fit_transform returns an (n, 1) array; flatten it so the DataFrame column is
# assigned from an explicit 1-D array instead of relying on pandas to accept
# single-column 2-D input.
sms['lda_score'] = MinMaxScaler().fit_transform(spamminess_score.reshape(-1, 1)).ravel()



In [None]:
# Inspect the scaled score column stored on the frame.
print(sms['lda_score'])

In [None]:
# Predict spam (1) when the scaled score exceeds 0.5, otherwise non-spam (0).
sms['lda_predict'] = (sms['lda_score'] > 0.5).astype(int)


--------------

In [None]:
# Show the true label, predicted label and score side by side for the first
# 20 messages, rounded to two decimals.
sms[['spam', 'lda_predict', 'lda_score']].round(2).head(20)


2. When does this classifier fail to work?

3. How should its predictions be evaluated?