SMSSpamCollection

In [6]:
import pandas as pd 
df =  pd.read_csv('SMSSpamCollection', sep = '\t', names = ['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords


In [9]:
# Preprocessing
df['clean_documents'] = df['message'].str.replace("[^a-zA-Z#]", " ")
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: x.lower())

df.head()

  df['clean_documents'] = df['message'].str.replace("[^a-zA-Z#]", " ")


Unnamed: 0,label,message,clean_documents
0,ham,"Go until jurong point, crazy.. Available only ...",until jurong point crazy available only bugis ...
1,ham,Ok lar... Joking wif u oni...,lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...
3,ham,U dun say so early hor... U c already then say...,dun say early hor already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah don think goes usf lives around here though


In [10]:
stop_words = stopwords.words('english')

In [11]:
# tokenization
tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split())

# remove stop-words

#tokenized_doc = tokenized_doc.apply(lambda x: [itemstop_words = stopwords.words('english') for item in x if item not in stop_words])
# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words]) 
# de-tokenization
detokenized_doc = []
for i in range(len(df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

df['clean_documents'] = detokenized_doc


In [12]:
df.head()

Unnamed: 0,label,message,clean_documents
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis great world...
1,ham,Ok lar... Joking wif u oni...,lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...
3,ham,U dun say so early hor... U c already then say...,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


In [13]:
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
X = vectorizer.fit_transform(df['clean_documents'])
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
X.shape

(5572, 7216)

In [26]:
# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100, random_state=0)
lsa = svd_model.fit_transform(X)

In [27]:
#Documents - Topic vector
pd.options.display.float_format = '{:,.16f}'.format
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_df["documents"] = df['clean_documents']
display(topic_encoded_df[["documents", "topic_1", "topic_2"]])

Unnamed: 0,documents,topic_1,topic_2
0,jurong point crazy available bugis great world...,0.0256205318207794,0.0697817372260633
1,lar joking wif oni,0.0117026555141287,0.0181323257623029
2,free entry wkly comp win cup final tkts may te...,0.0239760446291013,0.0803767188923967
3,dun say early hor already say,0.0182520758718167,0.0509857908770320
4,nah think goes usf lives around though,0.0132693659221473,0.0396260029020490
...,...,...,...
5567,time tried contact pound prize claim easy call...,0.0298220438233330,0.1016060571807131
5568,going esplanade home,0.0583140104414502,0.1869109850008792
5569,pity mood suggestions,0.0005328846347664,0.0018950271694712
5570,guy bitching acted like interested buying some...,0.0278650491569741,0.0803363303743757


In [29]:
# Features or words used as features 
dictionary = vectorizer.get_feature_names()

In [30]:
dictionary

['aah',
 'aaniye',
 'aaooooright',
 'aathi',
 'abbey',
 'abdomen',
 'abeg',
 'abel',
 'aberdeen',
 'abi',
 'ability',
 'abiola',
 'abj',
 'able',
 'abnormally',
 'aboutas',
 'abroad',
 'absence',
 'absolutely',
 'absolutly',
 'abstract',
 'abt',
 'abta',
 'aburo',
 'abuse',
 'abusers',
 'academic',
 'acc',
 'accent',
 'accenture',
 'accept',
 'access',
 'accessible',
 'accidant',
 'accident',
 'accidentally',
 'accommodation',
 'accommodationvouchers',
 'accomodate',
 'accomodations',
 'accordin',
 'accordingly',
 'account',
 'accounting',
 'accounts',
 'accumulation',
 'achan',
 'ache',
 'achieve',
 'acid',
 'acknowledgement',
 'acl',
 'acnt',
 'aco',
 'act',
 'acted',
 'actin',
 'acting',
 'action',
 'activ',
 'activate',
 'active',
 'activities',
 'actor',
 'actual',
 'actually',
 'adam',
 'add',
 'addamsfa',
 'added',
 'addicted',
 'addie',
 'adding',
 'address',
 'adds',
 'adewale',
 'adi',
 'adjustable',
 'admin',
 'administrator',
 'admirer',
 'admission',
 'admit',
 'adore',
 '

In [35]:
encoding_matrix = pd.DataFrame(svd_model.components_, index = ["topic_1","topic_2"], columns = (dictionary)).T

In [36]:
encoding_matrix

Unnamed: 0,topic_1,topic_2
aah,0.0001196041261378,0.0004069840317316
aaniye,0.0000825485709391,0.0003434728617813
aaooooright,0.0002159122539339,0.0007192476648452
aathi,0.0015593850553083,0.0066263583672637
abbey,0.0001927135381222,0.0007760461889550
...,...,...
zoe,0.0000854068888333,0.0003268487936183
zogtorius,0.0002009680431187,0.0007587968697315
zoom,0.0002681070537439,0.0006740368708729
zouk,0.0000513769830004,0.0001946763237413


In [37]:
topic_encoded_df["label"] = df['label']

In [38]:
topic_encoded_df

Unnamed: 0,topic_1,topic_2,documents,label
0,0.0256205318207794,0.0697817372260633,jurong point crazy available bugis great world...,ham
1,0.0117026555141287,0.0181323257623029,lar joking wif oni,ham
2,0.0239760446291013,0.0803767188923967,free entry wkly comp win cup final tkts may te...,spam
3,0.0182520758718167,0.0509857908770320,dun say early hor already say,ham
4,0.0132693659221473,0.0396260029020490,nah think goes usf lives around though,ham
...,...,...,...,...
5567,0.0298220438233330,0.1016060571807131,time tried contact pound prize claim easy call...,spam
5568,0.0583140104414502,0.1869109850008792,going esplanade home,ham
5569,0.0005328846347664,0.0018950271694712,pity mood suggestions,ham
5570,0.0278650491569741,0.0803363303743757,guy bitching acted like interested buying some...,ham
