<a href="https://colab.research.google.com/github/hrushikute/DataAnalytics/blob/master/nltk_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exploring the NLP space.

In [4]:
import nltk
nltk.download


<bound method Downloader.download of <nltk.downloader.Downloader object at 0x7f88cd75ad50>>

In [5]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')[0:30]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself']

In [6]:
import pandas as pd

headers = ['label', 'message']
data =pd.read_csv('SMSSpamCollection.csv', sep='|', names=headers)
data.head()
data2 =pd.read_csv('SMSSpamCollection', sep='\t', names=headers , header=None)
data2.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data2.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


## Explore the data set

## Find the number of rows and columns of data set

In [8]:
num_of_rows = len(data2)
num_of_rows

5572

In [9]:
num_of_cols = len(data2.columns)
num_of_cols

2

In [10]:
print(f'There are {num_of_rows} of rows and {num_of_cols} of columns in data')

There are 5572 of rows and 2 of columns in data


## Find the number of ham and spam of total rows

In [11]:
num_of_spam = (data2['label'] == 'spam').sum()
num_of_spam

747

In [12]:
num_of_ham = (data2['label'] == 'ham').sum()
num_of_ham

4825

In [13]:
print(f'There are in all {num_of_spam} spam and {num_of_ham} ham out of {num_of_rows}')

There are in all 747 spam and 4825 ham out of 5572


## NLP : Basics To clean the text

1. Remove Punctuation
2. Tokenization
3. Removing Stopwords
4. Lematization/Stemming 

In [14]:
## Remove punction

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
def remove_punctuation(data):
  # Used list comprehension
  text_noPunct = [char for char in data if char not in string.punctuation]
  return text_noPunct

In [16]:
data2['clean_message'] = data2['message'].apply(lambda x: remove_punctuation(x))
data2.head()

Unnamed: 0,label,message,clean_message
0,ham,"Go until jurong point, crazy.. Available only ...","[G, o, , u, n, t, i, l, , j, u, r, o, n, g, ..."
1,ham,Ok lar... Joking wif u oni...,"[O, k, , l, a, r, , J, o, k, i, n, g, , w, ..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[F, r, e, e, , e, n, t, r, y, , i, n, , 2, ..."
3,ham,U dun say so early hor... U c already then say...,"[U, , d, u, n, , s, a, y, , s, o, , e, a, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[N, a, h, , I, , d, o, n, t, , t, h, i, n, ..."


In [17]:
# Here the clean message have each character as list however we want sentences
# Hence we need to add join by modifying the remove_punctuation function

def remove_punctuation(data):
  # Used list comprehension
  text_noPunct = "".join([char for char in data if char not in string.punctuation])
  return text_noPunct

data2['clean_message_after'] = data2['message'].apply(lambda x: remove_punctuation(x))
data2.head()

Unnamed: 0,label,message,clean_message,clean_message_after
0,ham,"Go until jurong point, crazy.. Available only ...","[G, o, , u, n, t, i, l, , j, u, r, o, n, g, ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,"[O, k, , l, a, r, , J, o, k, i, n, g, , w, ...",Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[F, r, e, e, , e, n, t, r, y, , i, n, , 2, ...",Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,"[U, , d, u, n, , s, a, y, , s, o, , e, a, ...",U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...","[N, a, h, , I, , d, o, n, t, , t, h, i, n, ...",Nah I dont think he goes to usf he lives aroun...


In [18]:
## Tokenize the text
import re

def tokenize_text(data):
  tokens =re.split('\W+',data)
  return tokens

data2['message_tokens'] = data2['clean_message_after'].apply(lambda x: tokenize_text(x.lower()))
data2.head()

Unnamed: 0,label,message,clean_message,clean_message_after,message_tokens
0,ham,"Go until jurong point, crazy.. Available only ...","[G, o, , u, n, t, i, l, , j, u, r, o, n, g, ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[O, k, , l, a, r, , J, o, k, i, n, g, , w, ...",Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[F, r, e, e, , e, n, t, r, y, , i, n, , 2, ...",Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[U, , d, u, n, , s, a, y, , s, o, , e, a, ...",U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[N, a, h, , I, , d, o, n, t, , t, h, i, n, ...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [19]:
## Remove stopwords
import nltk

stop_words = nltk.corpus.stopwords.words('english')


In [20]:
def remove_stopwords(data):
  clean_data = [word for word in data if word not in stop_words]
  return clean_data
data2['message_after_stop'] = data2['message_tokens'].apply(lambda x: remove_stopwords(x))
data2.head()


Unnamed: 0,label,message,clean_message,clean_message_after,message_tokens,message_after_stop
0,ham,"Go until jurong point, crazy.. Available only ...","[G, o, , u, n, t, i, l, , j, u, r, o, n, g, ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[O, k, , l, a, r, , J, o, k, i, n, g, , w, ...",Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[F, r, e, e, , e, n, t, r, y, , i, n, , 2, ...",Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[U, , d, u, n, , s, a, y, , s, o, , e, a, ...",U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[N, a, h, , I, , d, o, n, t, , t, h, i, n, ...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t..."


## Suplemental Data Cleaning : Using stemming

Test out porter stemer

In [21]:
import nltk
import re
import pandas as pd
nltk.download('wordnet')

pd.set_option('display.max_colwidth',100)
ps = nltk.PorterStemmer()
stop_words = nltk.corpus.stopwords.words('english')

data3=pd.read_csv('SMSSpamCollection', sep='\t', names=headers , header=None)
data3.head()



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [22]:
def Pre_process_data(text_data):
  text_noPunct = "".join([char for char in text_data if char not in string.punctuation])
  tokenize_text = re.split('\W+',text_noPunct)
  no_stop_text = [word for word in tokenize_text if word not in stop_words]
  
  return no_stop_text
   

In [23]:
data3['pre_procs_data'] = data3['message'].apply(lambda x : Pre_process_data(x.lower()))
data3.head()

Unnamed: 0,label,message,pre_procs_data
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"


In [24]:
def stemming_data(text_data):
  stem_text = [ps.stem(word) for word in text_data]
  return stem_text

data3['stem_text'] = data3['pre_procs_data'].apply(lambda x : stemming_data(x))
data3.head()


Unnamed: 0,label,message,pre_procs_data,stem_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"


## Apply preprocesig using lemmatizer : Using WordnetLemmatizer

In [25]:
wn = nltk.WordNetLemmatizer()

def lemmatize_data(text_data):
  lemma_text = [wn.lemmatize(word) for word in text_data]
  return lemma_text

data3['lemmatize_text'] = data3['pre_procs_data'].apply(lambda x : lemmatize_data(x))
data3.head()

Unnamed: 0,label,message,pre_procs_data,stem_text,lemmatize_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"


## Vectorizing the Data : 
It is used to convert text data to numbers to feed it to Machine learning model

1. CountVectorizer
2. N-gram
3. TF-IDF


In [26]:
## Let import the data 
import pandas as pd
import re
import string
import nltk
nltk.download('wordnet')

pd.set_option('display.max_colwidth',100)
ps = nltk.PorterStemmer()
stop_words = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

headers = ['label','message']
df = pd.read_csv('SMSSpamCollection', sep='\t',names=headers, header=None)
df.head()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


Preporcess Data
1.  Remove punctuation
2.  Tokenize
3.  Remove stop words
4.  Lematize the data

In [27]:
def PreProcessin_data(text_data):
  puct_removal ="".join([char for char in text_data if  char not in string.punctuation])
  tokenzie_text = re.split('\W+', puct_removal)
  stop_words_removal = [wn.lemmatize(word)   for word in tokenzie_text if word not in stop_words]
  # lema_text= [wn.lemmatize(word) for word in stop_word_removal]
  return stop_words_removal




In [28]:
df['clean_data'] = df['message'].apply(lambda x: PreProcessin_data(x.lower()))
df.head()

Unnamed: 0,label,message,clean_data
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, go, usf, life, around, though]"


## CountVectorizer

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=PreProcessin_data)

x_count=count_vect.fit_transform(df['message'])

print(x_count.shape)
print(count_vect.get_feature_names())

(5572, 11045)




In [30]:
x_count 

<5572x11045 sparse matrix of type '<class 'numpy.int64'>'
	with 56326 stored elements in Compressed Sparse Row format>

Lets use the small data set


In [31]:
data_sample = df[0:20]
data_sample

Unnamed: 0,label,message,clean_data
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, go, usf, life, around, though]"
5,spam,FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...,"[freemsg, hey, darling, 3, week, word, back, id, like, fun, still, tb, ok, xxx, std, chgs, send,..."
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]"
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, caller, pre..."
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,"[winner, valued, network, customer, selected, receivea, 900, prize, reward, claim, call, 0906170..."
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,"[mobile, 11, month, u, r, entitled, update, latest, colour, mobile, camera, free, call, mobile, ..."


In [32]:
x_count_sample = count_vect.fit_transform(data_sample['message'])
x_count_sample

<20x248 sparse matrix of type '<class 'numpy.int64'>'
	with 274 stored elements in Compressed Sparse Row format>

In [33]:
x_count_df =pd.DataFrame(x_count_sample.toarray())
x_count_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
x_count_df.columns = count_vect.get_feature_names()



In [35]:
x_count_df.head()

Unnamed: 0,08002986030,08452810075over18s,09061701461,1,100,100000,11,12,150,150pday,16,2,20000,2005,21st,3,4,4403LDNW1A7RW18,4txtú120,6days,81010,87077,87121,87575,9,900,A,As,Available,CASH,CLAIM,CSH11,Call,Callers,Callertune,Cine,Claim,Co,Cost,Cup,...,set,soon,speak,spell,std,still,stuff,take,talk,team,thank,think,though,time,tkts,today,tonight,treat,txt,u,ur,use,usf,v,valued,want,wat,watching,way,week,wet,wif,win,wkly,wonderful,wont,word,world,wwwdbuknet,xxxmobilemovieclubcomnQJKGIGHJJGCBL
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## N- Gram vectorizer

Have to change the Pre_processing function to have cleaned text in form of sentence

In [36]:
def pre_process_data_n_gram(text_data):
  no_punct = "".join([char for char in text_data if char not in string.punctuation])
  tokenzie_text = re.split('\W+', no_punct)
  # Note change is made in below line to retain the sentence.
  stop_words_removal = " ".join([word for word in tokenzie_text if word not in stop_words])
  return stop_words_removal


In [37]:
df['clean_sentence'] = df['message'].apply( lambda x: pre_process_data_n_gram(x.lower()))
df.head()

Unnamed: 0,label,message,clean_data,clean_sentence
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]",go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...",free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, go, usf, life, around, though]",nah dont think goes usf lives around though


We still require count vecotrizer

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect_n_gram = CountVectorizer(ngram_range=(2,2))

In [39]:
x_count = count_vect_n_gram.fit_transform(df['clean_sentence'])
x_count.shape

(5572, 31990)

In [40]:
x_count

<5572x31990 sparse matrix of type '<class 'numpy.int64'>'
	with 43788 stored elements in Compressed Sparse Row format>

In [41]:
count_vect_n_gram.get_feature_names

<bound method CountVectorizer.get_feature_names of CountVectorizer(ngram_range=(2, 2))>

Lets view it on sample data

In [42]:
data_sample = df[:20]
data_sample.count

<bound method DataFrame.count of    label  ...                                                                                       clean_sentence
0    ham  ...                   go jurong point crazy available bugis n great world la e buffet cine got amore wat
1    ham  ...                                                                              ok lar joking wif u oni
2   spam  ...  free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
3    ham  ...                                                                  u dun say early hor u c already say
4    ham  ...                                                          nah dont think goes usf lives around though
5   spam  ...              freemsg hey darling 3 weeks word back id like fun still tb ok xxx std chgs send 150 rcv
6    ham  ...                                                       even brother like speak treat like aids patent
7    ham  ...  per request melle melle oru minn

In [43]:
x_count_sample = count_vect_n_gram.fit_transform(data_sample['clean_sentence'])
df_x_count_sample = pd.DataFrame(x_count_sample.toarray())
df_x_count_sample.columns = count_vect_n_gram.get_feature_names()
df_x_count_sample



Unnamed: 0,09061701461 claim,100 20000,100000 prize,11 months,12 hours,150 rcv,150pday 6days,16 tsandcs,20000 pounds,2005 text,21st may,4txtú120 poboxox36504w45wq,6days 16,81010 tc,87077 eg,87077 trywales,87121 receive,87575 cost,900 prize,aids patent,already say,amore wat,anymore tonight,apply 08452810075over18s,apply reply,around though,available bugis,back id,blessing times,breather promise,brother like,buffet cine,bugis great,call 09061701461,call mobile,callers press,callertune callers,camera free,cash 100,chances win,...,tkts 21st,tonight ive,treat like,trywales scotland,tsandcs apply,txt csh11,txt message,txt ratetcs,txt ur,txt word,update co,update latest,ur national,urgent week,use credit,usf lives,valid 12,valued network,vettam set,want talk,wap link,way feel,way gota,week free,weeks word,wif oni,win cash,win fa,winner valued,wkly comp,wonderful blessing,wont take,word back,word claim,words thank,world la,wwwdbuknet lccltd,xxx std,xxxmobilemovieclub use,yes naughty
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## TF -IDF vectorizer 

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Have to change the preprocessing function to have the data in terms of words.
def pre_process_data_tfidf(text_data):
  no_punct = "".join([char for char in text_data if char not in string.punctuation])
  tokenzie_text = re.split('\W+', no_punct)
  # Note change is made in below line to retain the sentence.
  stop_words_n_lemma = [wn.lemmatize(word) for word in tokenzie_text if word not in stop_words]
  return stop_words_n_lemma



In [47]:
tf_idf_vec = TfidfVectorizer(analyzer = pre_process_data_tfidf )

x_count=tf_idf_vec.fit_transform(df['message'])

print(x_count.shape)
print(tf_idf_vec.get_feature_names())

data_sample = df[:20]
data_sample.count

x_count_sample = tf_idf_vec.fit_transform(data_sample['clean_sentence'])
df_x_count_sample = pd.DataFrame(x_count_sample.toarray())
df_x_count_sample.columns = tf_idf_vec.get_feature_names()
df_x_count_sample


(5572, 11045)




Unnamed: 0,08002986030,08452810075over18s,09061701461,1,100,100000,11,12,150,150pday,16,2,20000,2005,21st,3,4,4403ldnw1a7rw18,4txtú120,6days,81010,87077,87121,87575,9,900,aid,already,amore,anymore,apply,around,available,b,back,blessing,breather,brother,buffet,bugis,...,think,though,time,tkts,today,tonight,treat,trywales,tsandcs,txt,u,update,ur,urgent,use,usf,v,valid,valued,vettam,want,wap,wat,watching,way,week,wet,wif,win,winner,wkly,wonderful,wont,word,world,wwwdbuknet,xxx,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.251795,0.0,0.0,0.0,0.251795,0.0,0.0,0.0,0.0,0.0,0.251795,0.251795,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.251795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.251795,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.294083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.437499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.198423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174417,0.0,0.198423,0.198423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.198423,0.0,0.0,0.0,0.0,0.0,0.133378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174417,0.0,0.198423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.304187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.367563,0.367563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237559,0.0,0.0,0.0,0.0,0.0,0.0,0.237559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237559,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188426,0.0,0.0,0.237559,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.331667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.231645,0.0,0.0,0.0,0.0,0.231645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231645,0.231645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.1971,0.0,0.0,0.0,0.0,0.0,0.1971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132489,0.3942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
