In [2]:
df = pd.read_csv('./SMSSpamCollection', sep='\t', names=["label", "message"])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
from sklearn.cross_validation import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(list(df['message']), list(df['label']), test_size=0.1, random_state=42)

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf =Pipeline([
        ('count', CountVectorizer()),
        ('tfid', TfidfTransformer()),
        ('clf', MultinomialNB())
    ])

text_clf.fit(X_train, y_train)

Pipeline(steps=[('count', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [16]:
predicted = text_clf.predict(X_test)

In [18]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.98       485
       spam       1.00      0.68      0.81        73

avg / total       0.96      0.96      0.96       558



In [19]:
metrics.confusion_matrix(y_test, predicted)

array([[485,   0],
       [ 23,  50]])

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))
    ])

In [23]:
from sklearn.grid_search import GridSearchCV
parameters = [{
        'tfidf__use_idf': [True],
        'tfidf__min_df': [1, 5, 10],
        'tfidf__max_df': [0.4, 1.0],
        'clf__alpha': [1e-2, 1e-3],
        'clf__loss': ['hinge', 'log']
}, {
        'tfidf__use_idf': [False],
        'clf__alpha': [1e-2, 1e-3],
        'clf__loss': ['hinge', 'log']
}]
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring='f1_micro')
gs_clf = gs_clf.fit(X_train, y_train)

In [24]:
for params, mean_score, all_scores in gs_clf.grid_scores_:
    print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))

0.866 (+/- 0.000) for {'clf__alpha': 0.01, 'tfidf__max_df': 0.4, 'tfidf__min_df': 1, 'clf__loss': 'hinge', 'tfidf__use_idf': True}
0.866 (+/- 0.000) for {'clf__alpha': 0.01, 'tfidf__max_df': 0.4, 'tfidf__min_df': 5, 'clf__loss': 'hinge', 'tfidf__use_idf': True}
0.866 (+/- 0.000) for {'clf__alpha': 0.01, 'tfidf__max_df': 0.4, 'tfidf__min_df': 10, 'clf__loss': 'hinge', 'tfidf__use_idf': True}
0.866 (+/- 0.000) for {'clf__alpha': 0.01, 'tfidf__max_df': 1.0, 'tfidf__min_df': 1, 'clf__loss': 'hinge', 'tfidf__use_idf': True}
0.866 (+/- 0.000) for {'clf__alpha': 0.01, 'tfidf__max_df': 1.0, 'tfidf__min_df': 5, 'clf__loss': 'hinge', 'tfidf__use_idf': True}
0.866 (+/- 0.000) for {'clf__alpha': 0.01, 'tfidf__max_df': 1.0, 'tfidf__min_df': 10, 'clf__loss': 'hinge', 'tfidf__use_idf': True}
0.866 (+/- 0.000) for {'clf__alpha': 0.01, 'tfidf__max_df': 0.4, 'tfidf__min_df': 1, 'clf__loss': 'log', 'tfidf__use_idf': True}
0.866 (+/- 0.000) for {'clf__alpha': 0.01, 'tfidf__max_df': 0.4, 'tfidf__min_df': 5

In [25]:
predicted = gs_clf.predict(X_test)

In [26]:
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        ham       0.98      1.00      0.99       485
       spam       1.00      0.85      0.92        73

avg / total       0.98      0.98      0.98       558



In [29]:
from sklearn.externals import joblib
#joblib.dump(gs_clf.best_estimator_, './model/spam_clf.pkl')

In [30]:
#clf = joblib.load('./model/spam_clf.pkl')