# Import Libraries

In [21]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joykaus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *

### Loading the sms dataset into a pandas dataframe

In [14]:
df = pd.read_csv('data/sms-spam-collection-dataset/spam.csv', encoding="latin-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


##### Remove column 2, 3 and 4 as they have no useful information

In [15]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df['SMS'] = df['v2']
df['label'] = df['v1'].map({'ham': 0, 'spam': 1})
df.drop(['v1', 'v2'], axis=1, inplace=True)
train_data = df[:4400]
test_data = df[4400:]
df.head()

Unnamed: 0,SMS,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


### Data Cleaning and Preprocessing

In [24]:
stop_words = nltk.corpus.stopwords.words('english')
porter = nltk.PorterStemmer()

In [25]:
def preprocess_data(data):
    data = str(data)
    cleaned = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr',data)
    cleaned = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr',cleaned)
    cleaned = re.sub(r'£|\$', 'moneysymb', cleaned)
    cleaned = re.sub(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b','phonenumbr', cleaned)
    cleaned = re.sub(r'\d+(\.\d+)?', 'numbr', cleaned)
    cleaned = re.sub(r'[^\w\d\s]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'^\s+|\s+?$', '', cleaned.lower())
    return ' '.join(
        porter.stem(term) 
        for term in cleaned.split()
        if term not in set(stop_words)
    )

In [26]:
# Applying preprocessing cleaning on test and train data
preprocess_data(train_data)
preprocess_data(test_data)


'sm label numbr mani time lose best one bcoz numbr numbr good friend care close frien numbr numbr get back home numbr numbr sorri call later lt gt min numbr numbr dun need use dial juz open da browser n numbr numbr numbrnd time tri numbr contact u numbr numbr ì_ b go esplanad fr home numbr numbr piti mood numbr numbr guy bitch act like numbr numbr rofl true name numbr numbr row x numbr column'

### Creating models

In [27]:
def perform(classifiers, vectorizers, train_data, test_data):
    max_score = 0
    max_name = 0
    for classifier in classifiers:
        for vectorizer in vectorizers:
        
            # train
            vectorize_text = vectorizer.fit_transform(train_data.SMS)
            classifier.fit(vectorize_text, train_data.label)

            # score
            vectorize_text = vectorizer.transform(test_data.SMS)
            score = classifier.score(vectorize_text, test_data.label)
            name = classifier.__class__.__name__ + ' with ' + vectorizer.__class__.__name__ 
            print(name, score)
        if score > max_score:
            max_score = score
            max_name = name
    print ('===========================================')
    print ('===========================================')
    print (max_name, max_score)
    print ('===========================================')
    print ('===========================================')

#### List of various classifiers we are going to use

In [28]:
classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(),
        OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        KNeighborsClassifier()
    ]

#### List of various vectorizers we are going to use

In [29]:
vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ]

#### Perform classification and save results to a new dataframe

In [30]:
perform(
    classifiers,
    vectorizers,
    train_data,
    test_data
)

BernoulliNB with CountVectorizer 0.9778156996587031
BernoulliNB with TfidfVectorizer 0.9778156996587031
BernoulliNB with HashingVectorizer 0.8728668941979523
RandomForestClassifier with CountVectorizer 0.976962457337884
RandomForestClassifier with TfidfVectorizer 0.9744027303754266
RandomForestClassifier with HashingVectorizer 0.9684300341296929
AdaBoostClassifier with CountVectorizer 0.9718430034129693
AdaBoostClassifier with TfidfVectorizer 0.9692832764505119
AdaBoostClassifier with HashingVectorizer 0.9735494880546075
BaggingClassifier with CountVectorizer 0.9684300341296929
BaggingClassifier with TfidfVectorizer 0.9650170648464164
BaggingClassifier with HashingVectorizer 0.9658703071672355
ExtraTreesClassifier with CountVectorizer 0.9795221843003413
ExtraTreesClassifier with TfidfVectorizer 0.9803754266211604
ExtraTreesClassifier with HashingVectorizer 0.9701365187713311
GradientBoostingClassifier with CountVectorizer 0.9684300341296929
GradientBoostingClassifier with TfidfVectoriz



DummyClassifier with TfidfVectorizer 0.7636518771331058
DummyClassifier with HashingVectorizer 0.7807167235494881




PassiveAggressiveClassifier with CountVectorizer 0.9829351535836177
PassiveAggressiveClassifier with TfidfVectorizer 0.985494880546075
PassiveAggressiveClassifier with HashingVectorizer 0.9829351535836177
RidgeClassifier with CountVectorizer 0.9812286689419796
RidgeClassifier with TfidfVectorizer 0.9829351535836177
RidgeClassifier with HashingVectorizer 0.9820819112627986
RidgeClassifierCV with CountVectorizer 0.9829351535836177
RidgeClassifierCV with TfidfVectorizer 0.984641638225256
RidgeClassifierCV with HashingVectorizer 0.9803754266211604
SGDClassifier with CountVectorizer 0.9829351535836177
SGDClassifier with TfidfVectorizer 0.985494880546075
SGDClassifier with HashingVectorizer 0.9829351535836177
OneVsRestClassifier with CountVectorizer 0.9863481228668942
OneVsRestClassifier with TfidfVectorizer 0.9880546075085325
OneVsRestClassifier with HashingVectorizer 0.9829351535836177
OneVsRestClassifier with CountVectorizer 0.9837883959044369
OneVsRestClassifier with TfidfVectorizer 0.97

##### Selecting the best combination to create our model

In [41]:
# train the classifier with best accuracy
Classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))
Vectorizer = TfidfVectorizer()
vectorize_text = Vectorizer.fit_transform(train_data.SMS)
Classifier.fit(vectorize_text, train_data.label)

OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma='scale', kernel='linear', max_iter=-1,
                                  probability=True, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False),
                    n_jobs=None)

In [42]:
SMS= 'You have an interview tomorrow'
vectorize_message = Vectorizer.transform([SMS])
predict = Classifier.predict(vectorize_message)[0]

## Testing Model Predictions

In [43]:
if predict == 0:
    print ('not spam')
else:
    print ('spam')

not spam


In [44]:
SMS = ' won a 1 week FREE membership in our $100,000 Prize Jackpot! Txt the word: C'
vectorize_message = Vectorizer.transform([SMS])
predict = Classifier.predict(vectorize_message)[0]

In [45]:
if predict == 0:
    print ('not spam')
else:
    print ('spam')

spam
