In [14]:
import pandas as pd


# Reading the data


pd.read_csv('spam.csv') # raises unicode decode error

pd.read_csv('spam.csv', encoding='Latin-1') # works 
    

In [15]:
# when encoding is unknown
import chardet 
with open('spam.csv', 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large

print(result['encoding'])
df_raw = pd.read_csv('spam.csv', encoding=result['encoding'])

Windows-1252


In [16]:
df_raw.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [27]:
df = df_raw[['v1', 'v2']].rename(columns={'v1' : 'label', 'v2' : 'sms'})

In [28]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Exploring data

In [29]:
df.describe()

Unnamed: 0,label,sms
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [30]:
df.drop_duplicates(subset='sms', inplace=True)

In [31]:
df.describe()

Unnamed: 0,label,sms
count,5169,5169
unique,2,5169
top,ham,"Em, its olowoyey@ usc.edu have a great time in..."
freq,4516,1


In [41]:
df.groupby('label').count().reset_index()

Unnamed: 0,label,sms
0,ham,4516
1,spam,653


In [42]:
df['len'] = df['sms'].map(lambda x: len(x))

In [46]:
df.groupby('label').agg({'sms': 'count', 'len' : 'mean'}).reset_index()

Unnamed: 0,label,sms,len
0,ham,4516,70.459256
1,spam,653,137.891271


# Logistic Regression Classifier

In [90]:
"""
    Train-test split: Do not touch the test data until the time of final evaluation.
"""

from sklearn.model_selection import train_test_split

X = df['sms']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [91]:
print("Shape of X is {}".format(X.shape))
print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

Shape of X is (5169,)
Shape of X_train is (3876,) and shape of y_train is (3876,)
Shape of X_test is (1293,) and shape of y_test is (1293,)


In [92]:
train_corpus = list(X_train)

In [93]:
"""
    Featurizer: Train the featurizer on train data.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(train_corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [94]:
print("Number of features = {}".format(len(vectorizer.vocabulary_)))
print("Number of omitted words = {}".format(len(vectorizer.stop_words_)))

Number of features = 5000
Number of omitted words = 2395


In [95]:
X_train_features = vectorizer.transform(list(X_train))
print("Shape of X_train_features is {}".format(X_train_features.shape))

Shape of X_train_features is (3876, 5000)


In [96]:
"""
    Training a classifier: Train a Logistic Regression classifier
"""
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=42)

model.fit(X_train_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [102]:
"""
    Evaluation on training data: This does not mean anything  
        Do not evaluate on the same data that you trained on
"""
from sklearn.metrics import accuracy_score, confusion_matrix

y_train_predicted = model.predict(X_train_features)
y_train_predicted.shape

(3876,)

In [108]:
print("The fraction of correctly classified samples is {}".format(accuracy_score(y_train, y_train_predicted)))
print("The number of correctly classified samples is {}".format(accuracy_score(y_train, y_train_predicted, normalize=False)))

The fraction of correctly classified samples is 0.9713622291021672
The number of correctly classified samples is 3765


In [113]:
pd.DataFrame(confusion_matrix(y_train_predicted, y_train), 
             index={'true ham', 'true spam'}, 
             columns={'pred ham', 'pred spam'})

Unnamed: 0,pred ham,pred spam
true ham,3408,110
true spam,1,357


In [115]:
"""
    Evaluation on test data: This score is important
"""
X_test_features = vectorizer.transform(list(X_test))
y_test_predicted = model.predict(X_test_features)

print("The fraction of correctly classified samples is {}".format(accuracy_score(y_test, y_test_predicted)))
print("The number of correctly classified samples is {}".format(accuracy_score(y_test, y_test_predicted, normalize=False)))

The fraction of correctly classified samples is 0.9675174013921114
The number of correctly classified samples is 1251


In [117]:
pd.DataFrame(confusion_matrix(y_test_predicted, y_test), 
             index={'true ham', 'true spam'}, 
             columns={'pred ham', 'pred spam'})

Unnamed: 0,pred ham,pred spam
true ham,1105,40
true spam,2,146
