# spam 判定



In [1]:
%matplotlib inline

In [2]:
# Check data size
messages = [line.rstrip() for line in open('./SMSSpamCollection')]
print(len(messages))

5574


In [3]:
# Load data into data frame
import pandas as pd
df = pd.read_csv('./SMSSpamCollection', sep='\t', names=["label", "message"])

In [4]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [5]:
df.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,message
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,count,4825
ham,unique,4516
ham,top,"Sorry, I'll call later"
ham,freq,30
spam,count,747
spam,unique,653
spam,top,Please call our customer service representativ...
spam,freq,4


In [6]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(list(df['message']), list(df['label']), test_size=0.1, random_state=42)

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

import numpy as np
_ = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

np.mean(predicted == y_test)

0.95878136200716846

In [8]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.98       485
       spam       1.00      0.68      0.81        73

avg / total       0.96      0.96      0.96       558



In [9]:
metrics.confusion_matrix(y_test, predicted)

array([[485,   0],
       [ 23,  50]])

In [10]:
for message, label in zip(X_test, predicted):
    print('%s => %r' % (label, message))

ham => 'Squeeeeeze!! This is christmas hug.. If u lik my frndshp den hug me back.. If u get 3 u r cute:) 6 u r luvd:* 9 u r so lucky;) None? People hate u:'
ham => "And also I've sorta blown him off a couple times recently so id rather not text him out of the blue looking for weed"
ham => 'Mmm thats better now i got a roast down me! i\x92d b better if i had a few drinks down me 2! Good indian?'
ham => 'Mm have some kanji dont eat anything heavy ok'
ham => "So there's a ring that comes with the guys costumes. It's there so they can gift their future yowifes. Hint hint"
ham => 'Sary just need Tim in the bollox &it hurt him a lot so he tol me!'
ham => "Love isn't a decision, it's a feeling. If we could decide who to love, then, life would be much simpler, but then less magical"
ham => 'My supervisor find 4 me one lor i thk his students. I havent ask her yet. Tell u aft i ask her.'
ham => 'Dear good morning now only i am up'
ham => "I'm in chennai velachery:)"
ham => 'Lol grr my mom is tak

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, n_iter=5, random_state=42)),
                    ])
_ = text_clf.fit(X_train, y_train)
predicted = _.predict(X_test)
print(np.mean(predicted == y_test))

0.976702508961


In [12]:
print(metrics.classification_report(y_test, predicted))
metrics.confusion_matrix(y_test, predicted)

             precision    recall  f1-score   support

        ham       0.97      1.00      0.99       485
       spam       1.00      0.82      0.90        73

avg / total       0.98      0.98      0.98       558



array([[485,   0],
       [ 13,  60]])

In [13]:
from sklearn.metrics import make_scorer, f1_score
from sklearn.grid_search import GridSearchCV
parameters = [
    { 
        'tfidf__use_idf': [True], 
        'tfidf__min_df': [1, 5, 10],
        'tfidf__max_df': [0.4, 1.0],
        'clf__alpha': [1e-2, 1e-3],
        'clf__loss': ['hinge', 'log']
    },
    {
        'tfidf__use_idf': [False], 
        'clf__alpha': [1e-2, 1e-3],
        'clf__loss': ['hinge', 'log']        
    }
]
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring='f1_micro')
gs_clf = gs_clf.fit(X_train, y_train)

In [14]:
# Predict by best model
gs_clf.predict(['Free entry in 2 a wkly comp to win FA Cup'])

array(['spam'], 
      dtype='<U4')

In [15]:
# Show best parameters
print(gs_clf.best_score_)
gs_clf.best_params_

0.979856402074


{'clf__alpha': 0.001,
 'clf__loss': 'hinge',
 'tfidf__max_df': 0.4,
 'tfidf__min_df': 5,
 'tfidf__use_idf': True}

In [16]:
# Show full best parameters
gs_clf.best_estimator_.get_params(deep=False)

{'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.4, max_features=None, min_df=5,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('clf',
   SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
          eta0=0.0, fit_intercept=True, l1_ratio=0.15,
          learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
          penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
          warm_start=False))]}

In [17]:
# Show all scores
for params, mean_score, all_scores in gs_clf.grid_scores_:
    print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))

0.866 (+/- 0.000) for {'clf__loss': 'hinge', 'tfidf__use_idf': True, 'tfidf__max_df': 0.4, 'clf__alpha': 0.01, 'tfidf__min_df': 1}
0.866 (+/- 0.000) for {'clf__loss': 'hinge', 'tfidf__use_idf': True, 'tfidf__max_df': 0.4, 'clf__alpha': 0.01, 'tfidf__min_df': 5}
0.866 (+/- 0.000) for {'clf__loss': 'hinge', 'tfidf__use_idf': True, 'tfidf__max_df': 0.4, 'clf__alpha': 0.01, 'tfidf__min_df': 10}
0.866 (+/- 0.000) for {'clf__loss': 'hinge', 'tfidf__use_idf': True, 'tfidf__max_df': 1.0, 'clf__alpha': 0.01, 'tfidf__min_df': 1}
0.866 (+/- 0.000) for {'clf__loss': 'hinge', 'tfidf__use_idf': True, 'tfidf__max_df': 1.0, 'clf__alpha': 0.01, 'tfidf__min_df': 5}
0.866 (+/- 0.000) for {'clf__loss': 'hinge', 'tfidf__use_idf': True, 'tfidf__max_df': 1.0, 'clf__alpha': 0.01, 'tfidf__min_df': 10}
0.866 (+/- 0.000) for {'clf__loss': 'log', 'tfidf__use_idf': True, 'tfidf__max_df': 0.4, 'clf__alpha': 0.01, 'tfidf__min_df': 1}
0.866 (+/- 0.000) for {'clf__loss': 'log', 'tfidf__use_idf': True, 'tfidf__max_df':

In [18]:
predicted = gs_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))
metrics.confusion_matrix(y_test, predicted)

             precision    recall  f1-score   support

        ham       0.98      1.00      0.99       485
       spam       1.00      0.85      0.92        73

avg / total       0.98      0.98      0.98       558



array([[485,   0],
       [ 11,  62]])

In [19]:
from sklearn.externals import joblib
# Export learned model
joblib.dump(gs_clf.best_estimator_, 'model/spam_clf.pkl')

['model/spam_clf.pkl',
 'model/spam_clf.pkl_01.npy',
 'model/spam_clf.pkl_02.npy',
 'model/spam_clf.pkl_03.npy',
 'model/spam_clf.pkl_04.npy',
 'model/spam_clf.pkl_05.npy',
 'model/spam_clf.pkl_06.npy']

In [20]:
from sklearn.externals import joblib
clf = joblib.load('model/spam_clf.pkl')
clf.predict(['Free entry in 2 a wkly comp to win FA Cup'])

array(['spam'], 
      dtype='<U4')