In [29]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline, FeatureUnion, make_union
from sklearn.externals import joblib




In [18]:
# Read the tab-separated SMS corpus. No header row is read from the file
# (header=None), so the two columns are named explicitly: label, then message.
df = pd.read_csv(
    "data/SMSSpamCollection",
    names=["target", "text"],
    header=None,
    sep="\t",
)

In [19]:
# Preview the first rows to eyeball the two named columns (target, text).
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Model input is the raw message text; the label column is the prediction target.
X, y = df['text'], df['target']

In [21]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CapitalDocTransformer(BaseEstimator, TransformerMixin):
    """Map each document to a single binary "written in all caps" feature.

    A document yields 1 when every cased character in it is upper case and
    it contains at least one cased character; otherwise it yields 0.
    Documents without any letters (empty strings, digits only, punctuation
    only) therefore yield 0.

    Parameters
    ----------
    columns : object, default=None
        Stored on the instance but not read by ``fit`` or ``transform``;
        kept so existing constructor calls continue to work.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X, y=None):
        """Compute the all-caps flag for every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            The documents to inspect.
        y : ignored
            Accepted for signature compatibility only.

        Returns
        -------
        numpy.ndarray
            Integer array of shape ``(n_documents, 1)`` holding 1 for
            all-caps documents and 0 for everything else.
        """
        # str.isupper() requires at least one cased character, so caseless
        # documents such as "", "123" or ":)" are not reported as all caps
        # (comparing against line.upper() would have flagged them).
        flags = np.array([doc.isupper() for doc in X], dtype=int)
        # Column-vector shape so the result can be stacked next to other
        # feature blocks.
        return flags.reshape(-1, 1)

In [23]:
# Two feature blocks are built from the same raw text and joined by the union:
# the CapitalDocTransformer output and the CountVectorizer token counts.
# Logistic regression is the final step of the pipeline.
log_reg_model = Pipeline(
    steps=[
        ('features', make_union(CapitalDocTransformer(), CountVectorizer())),
        ('model', LogisticRegression()),
    ]
)

In [37]:
# Fit the full pipeline on the training split.
log_reg_model.fit(X_train, y_train)

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('capitaldoctransformer', CapitalDocTransformer(columns=None)), ('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
      ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [38]:
# Evaluate on the held-out split.
# NOTE(review): for a classifier this score is presumably mean accuracy — confirm
# against the scikit-learn docs; accuracy alone can look good on imbalanced labels.
log_reg_model.score(X_test, y_test)

0.98504784688995217

In [30]:
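# Alternative: persist the model with the standard-library pickle module.
# Left disabled; the joblib call in the next cell is what actually saves it.
# import pickle
# with open('Spam.pkl', 'wb') as picklefile:
#     pickle.dump(log_reg_model, picklefile)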

In [31]:
# Persist the fitted pipeline to disk; the recorded output is the list of written file names.
# NOTE(review): assumes the models/ directory already exists — confirm before a fresh run.
joblib.dump(log_reg_model, 'models/spam_ham.pkl')

['models/spam_ham.pkl']

In [32]:
# Reload the saved pipeline under a new name to check that the artifact round-trips.
# NOTE(review): joblib.load is pickle-based, so only load files from a trusted source;
# CapitalDocTransformer presumably must be defined/importable in the loading process — confirm.
new_model = joblib.load('models/spam_ham.pkl')

In [33]:
# Smoke test on an advance-fee-scam style message. The recorded output for this
# cell is 'ham', i.e. the reloaded model does not flag it as spam.
new_model.predict(["Hello, I am a prince from Nigeria,  I am stuck in london and need a wire transfer"])

array(['ham'], dtype=object)

In [36]:
# Second smoke test on an obviously spam-like URL message. The recorded output
# is again 'ham', so the high test-set score does not carry over to these examples.
new_model.predict(["www.viagraforcheap.com is spam"])

array(['ham'], dtype=object)