In [1]:
import nltk

In [2]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [8]:
#show the context where a word can be found in a text
text1.concordance('monstrous')

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


In [4]:
type(text1)

nltk.text.Text

In [7]:
#look for words that are used in the same context in text1
text1.similar('monstrous')

true subtly singular candid fearless imperial curious uncommon wise
untoward careful mouldy part tyrannical mystifying mean horrible
exasperate puzzled lazy


In [6]:
text2.similar('monstrous')

very exceedingly so heartily extremely good as remarkably great vast
amazingly sweet a


In [9]:
text4

<Text: Inaugural Address Corpus>

In [None]:
#see a plot which shows where in the text we can find these terms
text4.dispersion_plot(['citizens', 'freedom', 'duties'])

In [1]:
#sentence tokenization
para = 'This is a sentence. This is also a sentence'

from nltk.tokenize import sent_tokenize

sent_tokenize(para)

['This is a sentence.', 'This is also a sentence']

In [3]:
import nltk.data

dutch_tokenizer = nltk.data.load('/Users/ciprian/nltk_data/tokenizers/punkt/dutch.pickle')
dutch_tokenizer.tokenize('Dit zal ook. Dat hier')

['Dit zal ook.', 'Dat hier']

In [4]:
from nltk.tokenize import word_tokenize
word_tokenize("can't will not work well")

['ca', "n't", 'will', 'not', 'work', 'well']

In [7]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[\w']+")
tokenizer.tokenize("can't will work well now")

["can't", 'will', 'work', 'well', 'now']

# !!! Maybe we can use this !!!

In [44]:
from nltk.corpus import stopwords

#we can exclude some stop words from being removed (like 'not')
english_stop = set(stopwords.words('english'))

In [19]:
#list of languages for which we have stop words
stopwords.fileids()

['danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'portuguese',
 'russian',
 'spanish',
 'swedish',
 'turkish']

In [20]:
#word stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stemmer.stem('cooking')

'cook'

In [21]:
stemmer.stem('cookery')

'cookeri'

In [23]:
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

stemmer.stem('cookery')

'cookery'

In [24]:
#you can create a custom stemmer with regexp
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')
stemmer.stem('cooking')

'cook'

In [26]:
stemmer.stem('MyBankIsing')

'MyBankIs'

In [27]:
from nltk.stem import SnowballStemmer
SnowballStemmer.languages

('danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [29]:
dutch_stemmer = SnowballStemmer('dutch')

dutch_stemmer.stem('kruipen')

'kruip'

In [35]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatizer.lemmatize('cooking')

'cooking'

In [36]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stemmer.stem('believes')

'believ'

In [39]:
#WordNet is the best stemmer available
lemmatizer.lemmatize('believes')

'belief'

# !!! Maybe we can use this !!!

In [43]:
import re

#we should include this replacement in the document cleaning
replacement_patterns = [{r'won\'t', 'will not'}, {r'can\'t', 'cannot'}]

In [42]:
def replace(text):
    s = text
    for pattern, replace in replacement_patterns:
        s = re.sub(pattern, replace, s)
    return s

In [41]:
replace("I won't go to Jamaica this year")

'I will not go to Jamaica this year'

In [49]:
!pip install pyenchant



In [None]:
#there is an error on Mac OS X with enchant in Python 3
import enchant

spell_dict = enchant.Dict('en')

word = 'accurate'

if spell_dict.check(word):
    print(word)
    
suggestion = spell_dict.suggest(word)

print(suggestion)

# Machine learning exercise

In [52]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv

In [61]:
def preprocessing(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    
    tokens = [word for word in tokens if len(word) > 3]
    
    tokens = [word.lower() for word in tokens]
    
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [62]:
preprocessing('This is a test for a dog')

'this test'

In [66]:
smsdata = open('/Users/ciprian/nltk_Data/smsspamcollection/SMSSpamCollection')
sms_label = []
sms_data = []
csv_reader = csv.reader(smsdata, delimiter = '\t')

In [67]:
for line in csv_reader:
    sms_label.append(line[0])
    sms_data.append(preprocessing(line[1]))

In [69]:
import sklearn
import numpy as np

trainset_size = int(round(len(sms_data) * 0.70))

In [108]:
x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]])
y_train = np.array([''.join(el) for el in sms_label[0:trainset_size]])

x_test = np.array([''.join(el) for el in sms_data[trainset_size + 1 : -1]])
y_test = np.array([''.join(el) for el in sms_label[trainset_size + 1 : -1]])

In [72]:
print(x_train)

['okie' 'that depends would like treated' 'right brah later' ...,
 'will going esplanade home' 'pity mood suggestion'
 'bitching acted like interested buying something else next week gave free']


In [73]:
print(y_train)

['ham' 'ham' 'ham' ..., 'ham' 'ham' 'ham']


# !!! Maybe we can use this !!!

In [99]:
#We can use this feature to create a dictionary
from sklearn.feature_extraction.text import CountVectorizer
sms_exp = []

for line in sms_data:
    sms_exp.append(preprocessing(line))
    
vectorizer = CountVectorizer(min_df=1)
X_exp = vectorizer.fit_transform(sms_exp)

In [100]:
X_exp

<5572x7344 sparse matrix of type '<class 'numpy.int64'>'
	with 36715 stored elements in Compressed Sparse Row format>

In [101]:
print('||'.join(vectorizer.get_feature_names()))
print(X_exp.toarray)

<bound method _cs_matrix.toarray of <5572x7344 sparse matrix of type '<class 'numpy.int64'>'
	with 36715 stored elements in Compressed Sparse Row format>>


In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df = 2, ngram_range = (1,2), stop_words = 'english', strip_accents = 'unicode')

In [110]:
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [111]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

clf = MultinomialNB().fit(x_train, y_train)
y_nb_predicted = clf.predict(x_test)

print(y_nb_predicted)

cm = confusion_matrix(y_test, y_nb_predicted)
print(cm)

print(classification_report(y_test, y_nb_predicted))

['ham' 'ham' 'ham' ..., 'ham' 'ham' 'ham']
[[1442    0]
 [  56  172]]
             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1442
       spam       1.00      0.75      0.86       228

avg / total       0.97      0.97      0.96      1670



In [120]:
feature_names = vectorizer.get_feature_names()
coefs = clf.coef_
intercept = clf.intercept_
coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
n = 10
top = zip(coefs_with_fns[:n], coefs_with_fns[: -(n+1) : -1])
for(coef_1, fn_1), (coef_2, fn_2) in top:
    print('\t%.4f\t-\t%s\t\t%.4f\t-\t%s' % (coef_1, fn_1, coef_2, fn_2))

	-8.8515	-	15		-5.7249	-	free
	-8.8515	-	2morrow		-6.0219	-	text
	-8.8515	-	2mrw		-6.1760	-	claim
	-8.8515	-	30		-6.2085	-	stop
	-8.8515	-	30 want		-6.2360	-	reply
	-8.8515	-	30ish		-6.2482	-	mobile
	-8.8515	-	4get		-6.3348	-	service
	-8.8515	-	5min		-6.3363	-	prize
	-8.8515	-	65		-6.3616	-	tone
	-8.8515	-	6hrs		-6.4079	-	www
