In [7]:
import pandas as pd
import re
import string
import nltk

# Widen text columns to 100 characters so message bodies are readable in previews.
pd.set_option('display.max_colwidth', 100)

from nltk.corpus import stopwords

# NOTE: this rebinds `stopwords` from the corpus reader imported above to the
# plain list of English stop words; later cells rely on the list form.
stopwords = nltk.corpus.stopwords.words('english')

ps = nltk.PorterStemmer()

# The TSV has no header row, so column names are supplied at load time.
data = pd.read_csv(
    'SMSSpamCollection.tsv',
    sep='\t',
    header=None,
    names=['label', 'body_text'],
)

data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [8]:
def clean_text(text, stop_words=None):
    """Normalise a message: strip punctuation, lowercase, drop stop words.

    Steps, in order:
      1. Remove every character found in ``string.punctuation`` and lowercase
         the remaining characters.
      2. Split on runs of non-word characters (``\\W+``).
      3. Discard empty tokens and tokens present in ``stop_words``.
      4. Re-join the surviving tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Tokens to drop. When omitted, the module-level ``stopwords`` list is
        used, so existing single-argument calls behave as before.

    Returns
    -------
    str
        Space-separated cleaned tokens, with no leading or trailing spaces.

    Notes
    -----
    Punctuation is removed *before* stop-word filtering, so a stop-word entry
    that contains punctuation (e.g. "don't") can never match; the token has
    already become "dont" by the time it is compared.
    """
    if stop_words is None:
        stop_words = stopwords  # module-level stop-word list

    # Lowercase per character, exactly as the filter walks the string.
    stripped = "".join(ch.lower() for ch in text if ch not in string.punctuation)

    # Raw string: '\W' is not a valid str escape and warns on current Python.
    tokens = re.split(r'\W+', stripped)

    # re.split yields '' at the edges when the text starts or ends with a
    # separator; skipping those avoids stray spaces in the joined result.
    return " ".join(tok for tok in tokens if tok and tok not in stop_words)

In [9]:
# Clean every message body and store the result next to the raw text.
# clean_text is passed directly; no lambda wrapper is needed for a one-argument call.
data['cleaned_text'] = data['body_text'].apply(clean_text)

data.head()

Unnamed: 0,label,body_text,cleaned_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,ive searching right words thank breather promise wont take help granted fulfil promise wonderful...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
2,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think goes usf lives around though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak treat like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram-only bag of words: ngram_range=(2, 2) keeps pairs of adjacent tokens.
ngram_vect = CountVectorizer(ngram_range=(2, 2))

# Learn the bigram vocabulary from the cleaned corpus and build the
# document-term count matrix in one step.
x_counts = ngram_vect.fit_transform(data['cleaned_text'])
print(x_counts.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() yields plain Python strings so the printed form matches the old list.
if hasattr(ngram_vect, 'get_feature_names_out'):
    bigram_names = ngram_vect.get_feature_names_out().tolist()
else:
    bigram_names = ngram_vect.get_feature_names()
print(bigram_names)

(5568, 31957)


In [15]:
# Work on the first 20 rows only, so the vocabulary is small enough to inspect.
data_sample = data.iloc[:20]

# Separate vectorizer instance: its vocabulary is fitted on the sample alone.
ngram_vect_sample = CountVectorizer(ngram_range=(2, 2))

x_counts_sample = ngram_vect_sample.fit_transform(data_sample['cleaned_text'])
print(x_counts_sample.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() yields plain Python strings so the printed form matches the old list.
if hasattr(ngram_vect_sample, 'get_feature_names_out'):
    bigram_names_sample = ngram_vect_sample.get_feature_names_out().tolist()
else:
    bigram_names_sample = ngram_vect_sample.get_feature_names()
print(bigram_names_sample)

(20, 209)
['09061701461 claim', '100 20000', '100000 prize', '11 months', '12 hours', '150pday 6days', '16 tsandcs', '20000 pounds', '2005 text', '21st may', '4txtú120 poboxox36504w45wq', '6days 16', '81010 tc', '87077 eg', '87077 trywales', '87121 receive', '87575 cost', '900 prize', 'aft finish', 'aids patent', 'anymore tonight', 'apply 08452810075over18s', 'apply reply', 'ard smth', 'around though', 'blessing times', 'breather promise', 'brother like', 'call 09061701461', 'call mobile', 'callers press', 'callertune callers', 'camera free', 'cash 100', 'chances win', 'claim 81010', 'claim call', 'claim code', 'click httpwap', 'click wap', 'co free', 'code kl341', 'colour mobiles', 'comp win', 'copy friends', 'cost 150pday', 'credit click', 'cried enough', 'csh11 send', 'cup final', 'customer selected', 'da stock', 'date sunday', 'dont miss', 'dont think', 'dont want', 'eg england', 'eh remember', 'england 87077', 'england macedonia', 'enough today', 'entitled update', 'entry question

In [18]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(ngram_vect_sample, 'get_feature_names_out'):
    sample_columns = ngram_vect_sample.get_feature_names_out().tolist()
else:
    sample_columns = ngram_vect_sample.get_feature_names()

# Densify the sparse sample matrix (small enough here) and label each column
# with its bigram so the counts are readable.
x_counts_df = pd.DataFrame(x_counts_sample.toarray(), columns=sample_columns)
x_counts_df

Unnamed: 0,09061701461 claim,100 20000,100000 prize,11 months,12 hours,150pday 6days,16 tsandcs,20000 pounds,2005 text,21st may,...,win fa,winner valued,wkly comp,wonderful blessing,wont take,word claim,words thank,wwwdbuknet lccltd,xxxmobilemovieclub use,yes naughty
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,1,...,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
