In [35]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
df=pd.read_csv('SMSSpamCollection.txt',sep='\t')
df.columns=['out','text']
df.head()

Unnamed: 0,out,text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [37]:
df.shape

(5571, 2)

In [38]:
df['text'][4]

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"

#### Stopwords

In [39]:
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))

In [40]:
len(stops)

179

#### Stemming

In [41]:
stemmer=PorterStemmer()
import re
for val in range(df.shape[0]):
    sen=re.sub('[^a-zA-Z]',' ',df.loc[val]['text'])
    sen=sen.lower()
    sen=sen.split(' ')
    sen=[stemmer.stem(word) for word in sen if word not in stops]
    sen= ' '.join(sen)
    df.loc[val]['text']=sen
df.loc[4]['text']

'freemsg hey darl   week word back  like fun still  tb ok  xxx std chg send        rcv'

#### Lemmatization

In [43]:
df=pd.read_csv('SMSSpamCollection.txt',sep='\t')
df.columns=['out','text']
lemme=WordNetLemmatizer()
import re
for val in range(df.shape[0]):
    sen=re.sub('[^a-zA-Z]',' ',df.loc[val]['text'])
    sen=sen.lower()
    sen=sen.split(' ')
    sen=[lemme.lemmatize(word) for word in sen if word not in stops]
    sen= ' '.join(sen)
    df.loc[val]['text']=sen
df.loc[4]['text']

'freemsg hey darling   week word back  like fun still  tb ok  xxx std chgs send        rcv'

#### CountVectorizer / Bag of Words

In [44]:
cv=CountVectorizer()
vecs=cv.fit_transform(df['text'])
print(vecs.shape)

(5571, 7096)


In [45]:
print(type(vecs))
print(vecs[0])

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 4224)	1
  (0, 3338)	1
  (0, 3167)	1
  (0, 6846)	1
  (0, 4250)	1


#### TF-IDF Vectorizer

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
tcv=TfidfVectorizer()
vecs_tf=tcv.fit_transform(df['text'])
print(vecs_tf.shape)
print(type(vecs_tf))
print(vecs_tf[0])

(5571, 7096)
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 4250)	0.546626673667619
  (0, 6846)	0.43162886465569406
  (0, 3167)	0.5236821800631463
  (0, 3338)	0.40832452144188486
  (0, 4224)	0.27188943029231066
