In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

In [2]:
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop-word list consulted by the text-cleaning function.
englishsw = stopwords.words('english')

# Shared lemmatizer instance; one throwaway call is made right away
# (presumably to warm up WordNet before real use -- TODO confirm).
wnl = WordNetLemmatizer()
wnl.lemmatize('', 'n')

# Upper-case Roman-numeral pattern (it also matches the empty string).
ProgRoman = re.compile(u'^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$')
# Any run of characters outside a-z; substituted with a space during cleaning.
Prog = re.compile(u'[^a-z]+')

In [3]:
def cleaning_text_tax(msg):
    """Normalise a raw message into a space-separated string of noun lemmas.

    The text is lower-cased, every run of characters outside ``a-z`` is
    replaced by a single space, and the surviving tokens are lemmatized.

    Args:
        msg: Raw message text (a ``str``).

    Returns:
        The kept tokens, lemmatized as nouns and joined by single spaces.
    """
    lowered = msg.lower()
    letters_only = Prog.sub(' ', lowered)

    lemmas = []
    for token in letters_only.split():
        # Skip stop words and tokens of one or two characters.
        if token in englishsw or len(token) <= 2:
            continue
        lemmas.append(wnl.lemmatize(token, 'n'))

    # Discard anything that is empty or whitespace-only before joining.
    return ' '.join(lemma for lemma in lemmas if lemma.strip())

In [4]:
# Load the labelled SMS messages and inspect schema, class balance and first rows.
spam_data = pd.read_csv("sms_spam_short.csv")
print(spam_data.dtypes)
# Class balance: absolute counts first, then the same as percentages.
display(spam_data["type"].value_counts())
display(spam_data["type"].value_counts(normalize=True).mul(100))
display(spam_data.head())

type    object
text    object
dtype: object


ham     437
spam     63
Name: type, dtype: int64

ham     87.4
spam    12.6
Name: type, dtype: float64

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [5]:
# Clean every message, then turn the cleaned corpus into a dense TF-IDF matrix.
# NOTE: this overwrites the raw `text` column in place; re-run the loading
# cell to get the original messages back.
spam_data["text"] = spam_data["text"].apply(cleaning_text_tax)
display(spam_data.head())
corpus = spam_data["text"]
vectorizer = TfidfVectorizer(stop_words='english')
# .toarray() yields a plain ndarray. The previous .todense() returned the
# legacy numpy.matrix type, which recent scikit-learn input validation
# rejects when the matrix is later passed to an estimator.
tfidf = vectorizer.fit_transform(corpus).toarray()
print(tfidf.shape)

Unnamed: 0,type,text
0,ham,hope good week checking
1,ham,give back thanks
2,ham,also cbe pay
3,spam,complimentary star ibiza holiday cash need urg...
4,spam,okmail dear dave final notice collect tenerife...


(500, 1682)


In [6]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X, y = tfidf, spam_data["type"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Shapes of the four partitions, in the order they were unpacked above.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(335, 1682) (165, 1682) (335,) (165,)


In [7]:
# Fit a Gaussian naive Bayes model on the training rows, then label the held-out rows.
classifier = GaussianNB().fit(X_train, y_train)
predictions = classifier.predict(X_test)

In [8]:
# Report per-class F1, overall accuracy, and a confusion matrix with totals.
labels = ['ham', 'spam']
# labels= pins the order of the per-class scores (and of the matrix rows and
# columns below) to `labels`, instead of relying on the sorted order of
# whichever classes happen to occur in y_test / predictions.
f1score = list(sklearn.metrics.f1_score(y_test, predictions, labels=labels, average=None))
print(f1score)
fscore = pd.Series(f1score, index=labels)
# The format string needs a placeholder, otherwise .format() drops the scores.
print("fscore  of single class : \n{}".format(fscore.to_string()))
print("accuracy : ", sklearn.metrics.accuracy_score(y_test, predictions))
pred_labels = ['Predicted ' + l for l in labels]
cm = sklearn.metrics.confusion_matrix(y_test, predictions, labels=labels)
cm = pd.DataFrame(cm, index=labels, columns=pred_labels)
# Extra column: number of test rows per actual class (row sums).
cm['Actual T'] = cm.sum(axis=1)
# Extra row: number of rows per predicted class (column sums, incl. grand total).
cm.loc['Predicted T'] = cm.sum()
cm

[0.9032258064516128, 0.4705882352941177]
fscore  of single class : 
accuracy :  0.8363636363636363


Unnamed: 0,Predicted ham,Predicted spam,Actual T
ham,126,21,147
spam,6,12,18
Predicted T,132,33,165


In [9]:
import nltk

# Tokenize -> POS-tag -> NE-chunk a sample sentence, then collect the chunks.
doc = 'Mark and John are working at Google since Jan 2015 and for $2000 pm each.'
tokenized_doc = nltk.word_tokenize(doc)
tagged_sentences = nltk.pos_tag(tokenized_doc)
ne_chunked_sents = nltk.ne_chunk(tagged_sentences)

# Only nodes exposing a `label` attribute are treated as entities; each one
# becomes a (joined leaf words, label) pair.
named_entities = [
    (' '.join(leaf[0] for leaf in subtree.leaves()), subtree.label())
    for subtree in ne_chunked_sents
    if hasattr(subtree, 'label')
]
print(named_entities)

[('Mark', 'PERSON'), ('John', 'PERSON'), ('Google', 'ORGANIZATION')]


In [10]:
# Install TextBlob through the %pip magic rather than a bare `pip` (which only
# works via IPython automagic). Pinned to the version reported in this cell's
# captured output.
%pip install textblob==0.15.3

Requirement already up-to-date: textblob in /home/laxmi/anaconda3/lib/python3.7/site-packages (0.15.3)
Note: you may need to restart the kernel to use updated packages.


In [11]:
from textblob import TextBlob

# Sample passage used to demonstrate noun-phrase extraction.
text = '''
The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant.
'''
blob = TextBlob(text)
# Copy the extracted phrases into a plain list so they print as one.
print(list(blob.noun_phrases))

['titular threat', 'blob', 'ultimate movie monster', 'amoeba-like mass', 'snide', 'potential consequences', 'grey goo scenario', 'technological theorists fearful', 'artificial intelligence run rampant']
