# Classification of text messages as ham/spam

## With binary logistic regression

In [1]:
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this presumably also hides scikit-learn warnings raised while
# fitting the model (e.g. convergence warnings) — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Load labeled SMS dataset

In [3]:
# Read the tab-separated SMS corpus. The reader is told there is no header row,
# so the column names are supplied directly instead of being patched on afterwards.
df_in = pd.read_csv(
    'data/sms.tsv.zip',
    sep='\t',
    header=None,
    names=['label', 'text'],
)

In [4]:
# Preview the first rows of the raw frame (columns: label, text).
df_in.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Convert the string label to a numeric target (ham -> 0, spam -> 1).
# `assign` returns a new frame, so the raw `df_in` is left untouched and this
# cell can be re-run safely; a plain `df = df_in` would only alias the same
# object and silently add the column to `df_in` as well.
df = df_in.assign(label_num=df_in['label'].map({'ham': 0, 'spam': 1}))

# `map` yields NaN for any label missing from the mapping; fail loudly here
# rather than later inside model fitting.
assert df['label_num'].notna().all(), "unexpected label value (neither 'ham' nor 'spam')"

df.head()

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Create test/train datasets

In [6]:
# Features are the raw message text; the target is the numeric label column.
X, y = df['text'], df['label_num']

# Both should report the same number of rows.
for series in (X, y):
    print(series.shape)

(5572,)
(5572,)


In [7]:
# Split into training and test partitions (default test size); the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Report the size of each partition, in the same order as they were returned.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(4179,)
(1393,)
(4179,)
(1393,)


### Vectorize the data and tune vectorizer

In [8]:
# Bag-of-words vectorizer with a tuned vocabulary.
vect = CountVectorizer(
    stop_words='english',  # remove English stop words
    ngram_range=(1, 2),    # include 1-grams and 2-grams
    max_df=0.5,            # ignore terms that appear in more than 50% of the documents
    min_df=2,              # only keep terms that appear in at least 2 documents
)

In [9]:
# Learn the vocabulary from the training messages only and build the
# training document-term matrix in one step.
X_train_dtm = vect.fit_transform(X_train)

In [10]:
# Show the matrix summary (shape = documents x vocabulary terms, sparsity).
X_train_dtm

<4179x6876 sparse matrix of type '<class 'numpy.int64'>'
	with 39203 stored elements in Compressed Sparse Row format>

In [11]:
# transform testing data (using the vocabulary fitted on the training data) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

# the number of columns (6876) is the same as in X_train_dtm, because transform reuses the fitted vocabulary

<1393x6876 sparse matrix of type '<class 'numpy.int64'>'
	with 11142 stored elements in Compressed Sparse Row format>

### Train Logistic Regression Model

In [12]:
# Instantiate the classifier with scikit-learn's default hyperparameters.
logreg = LogisticRegression()

In [13]:
# Train the model on the training document-term matrix; %time reports how long the fit takes.
%time logreg.fit(X_train_dtm, y_train)

CPU times: user 59.2 ms, sys: 981 µs, total: 60.2 ms
Wall time: 58 ms


LogisticRegression()

### Make predictions with model and look at performance

In [14]:
# Predict the class label (0 = ham, 1 = spam) for each held-out test message.
y_pred_class = logreg.predict(X_test_dtm)

In [15]:
# calculate predicted probabilities for X_test_dtm; column 1 is the probability of class 1 (spam)
# NOTE(review): the original note called these "well calibrated" — no calibration
# check is shown in this notebook, so confirm before relying on it
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([0.0137417 , 0.00743006, 0.01638615, ..., 0.04181215, 0.99718035,
       0.00471916])

In [16]:
# calculate accuracy: fraction of test messages whose predicted class matches the true label
metrics.accuracy_score(y_test, y_pred_class)

0.9820531227566404

In [17]:
# calculate AUC from the predicted spam probabilities (not the hard class predictions)
metrics.roc_auc_score(y_test, y_pred_prob)

0.9953239663504565