In [34]:
# Import libraries
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/iryna.savchuk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# Data source: https://www.kaggle.com/uciml/sms-spam-collection-dataset
# Load the training split; the file is decoded as ISO-8859-1 (Latin-1).
train_path = 'spam_train.csv'
df_train = pd.read_csv(train_path, encoding='ISO-8859-1')

# Preview the first five rows (head() defaults to 5).
df_train.head()

Unnamed: 0,category,sms,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [36]:
# Load the test split; the file is decoded as ISO-8859-1 (Latin-1).
test_path = 'spam_test.csv'
df_test = pd.read_csv(test_path, encoding='ISO-8859-1')

# Preview the first five rows (head() defaults to 5).
df_test.head()

Unnamed: 0,category,sms,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,Well its not like you actually called someone ...,,,
1,ham,"Nope. Since ayo travelled, he has forgotten hi...",,,
2,ham,You still around? Looking to pick up later,,,
3,spam,CDs 4u: Congratulations ur awarded å£500 of CD...,,,
4,ham,There's someone here that has a year &lt;#&gt...,,,


In [42]:
tokenizer = RegexpTokenizer('r\w+')
stopwords_english = set(stopwords.words('english'))

# Tokenizing & stemming & removing stop words
def cleanSms(sms):
    sms = sms.replace("<br /><br />"," ")
    sms = sms.lower()
    sms_tokens = tokenizer.tokenize(sms)
    sms_tokens_without_stopwords = [token for token in sms_tokens if token not in stopwords_english]
    stemmed_sms_tokens_without_stopwords = [PorterStemmer().stem(token) for token in sms_tokens_without_stopwords]
    cleaned_sms = ' '.join(stemmed_sms_tokens_without_stopwords)
    return cleaned_sms

In [44]:
# Clean the data & plot it on X & Y
df_train['sms'].apply(cleanSms)
x_train = df_train['sms'].values
y_train = df_train['category'].values

df_test['sms'].apply(cleanSms)
x_test = df_test['sms'].values
y_test = df_test['category'].values

In [45]:
# Vectorize the data
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='ISO-8859-1')

# Learn the vocabulary/IDF weights from the training texts only, then reuse
# the fitted vectorizer for the test texts.
# Note: x_train / x_test are rebound from text arrays to feature matrices,
# so this cell cannot be re-run without re-running the cell that builds them.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [46]:
# Create model
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (L-BFGS solver) on the training
# features; fit() returns the estimator itself, so it can be chained.
model = LogisticRegression(solver='lbfgs').fit(x_train, y_train)

# Display the fitted estimator and its parameters.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
# Predict Spam
# NOTE(review): the message is vectorized as-is; if the training text is
# preprocessed with cleanSms, the same preprocessing should be applied here.
spam_example = ["you won $900 in the new lottery draw. Call +123456789."]
spam_features = vectorizer.transform(spam_example)
model.predict(spam_features)

array(['spam'], dtype=object)

In [48]:
# Predict Ham
# NOTE(review): the message is vectorized as-is; if the training text is
# preprocessed with cleanSms, the same preprocessing should be applied here.
ham_example = ["Hello there. How are you doing?"]
ham_features = vectorizer.transform(ham_example)
model.predict(ham_features)

array(['ham'], dtype=object)

In [49]:
# Pickling
import joblib

# Persist the fitted classifier and the fitted vectorizer side by side; both
# are needed to score new messages. joblib files are pickle-based, so they
# should only ever be loaded from trusted sources.
model_path = 'spam_ham_model.pkl'
vectorizer_path = 'vectorizer.pkl'

joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)

['vectorizer.pkl']