In [1]:
import pandas as pd

# read csv and define header
df = pd.read_csv('../dataset/set_01/set_01.csv', header=None, encoding='ascii')
df.rename(columns={0: 'spam', 1: 'content'}, inplace=True)

# transform content to lower case before any further process
df.content = df.content.str.lower()

print(len(df))


5886


In [2]:
import re


def deContact(text):
    text = str(text)

    # use regex for handling some ' is ?
    # ??? if asdfklhli'maksdjhfl

    # specific
    text = re.sub(r'i[\'?]m', 'i am', text)
    text = re.sub(r'let[\'?]s', 'let us', text)
    text = re.sub(r'don[\'?]t', 'do not', text)
    text = re.sub(r'can[\'?]t', 'can not', text)
    text = re.sub(r'won[\'?]t', 'will not', text)

    # general
    text = re.sub(r'[\'?]s', ' is', text)
    text = re.sub(r'[\'?]re', ' are', text)
    text = re.sub(r'[\'?]ll', ' will', text)
    text = re.sub(r'[\'?]d', ' would', text)
    text = re.sub(r'[\'?]ve', ' have', text)
    text = re.sub(r'n[\'?]t', ' not', text)

    return text


# for showing the result only
print(df.content[24])
print(deContact(df.content[24]))
print(df.content[110])
print(deContact(df.content[110]))


sorry i missed your call let's talk when you have the time. i'm on 07090201529
sorry i missed your call let us talk when you have the time. i am on 07090201529
i luv u soo much u don?t understand how special u r 2 me ring u 2morrow luv u xxx
i luv u soo much u do not understand how special u r 2 me ring u 2morrow luv u xxx


In [3]:
# const for controlling the level of data cleansing

DECONTACT = True

REPLACE_HYPERLINK = True
REPLACE_EMAIL_ADDRESS = True
REPLACE_CURRENCY_SIGN = True
REPLACE_NUMBER = True
REPLACE_SPECIAL_CHAR = True
REPLACE_NEW_LINE = True
REPLACE_WHITE_SPACE = True

LEMMATIZE = True
REMOVE_STOP_WORDS = True


In [4]:
if DECONTACT:
    df.content = df.content.map(deContact)

if REPLACE_HYPERLINK:
    df.content = df.content.map(
        lambda row: re.sub(r'http[s]?:\/\/[\w\/.?=-]+', ' link ', row))

if REPLACE_EMAIL_ADDRESS:
    df.content = df.content.map(
        lambda row: re.sub(r'[\w\.+]+@[\w\.]+\.[a-z]{2,}', ' email ', row))

if REPLACE_CURRENCY_SIGN:
    df.content = df.content.map(
        lambda row: re.sub(r'[\$€£¥]', ' money ', row))

if REPLACE_NUMBER:
    df.content = df.content.map(
        lambda row: re.sub(r'[\d]+', ' number ', row))

if REPLACE_SPECIAL_CHAR:
    df.content = df.content.map(
        lambda row: re.sub(r'[^a-zA-Z0-9]+', ' ', row))

if REPLACE_NEW_LINE:
    df.content = df.content.map(
        lambda row: re.sub(r'[\r\n]', ' ', row))

if REPLACE_WHITE_SPACE:
    df.content = df.content.map(
        lambda row: re.sub(r'[\s]{2,}', ' ', row))
    df.content = df.content.map(
        lambda row: re.sub(r'^[\s]+|[\s]+$', '', row))


In [5]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stopwords = stopwords.words('english')

df.content = df.content.map(
    lambda row: ' '.join([word for word in row.split() if word not in (stopwords)]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

df.content = df.content.map(
    lambda row: ' '.join([lemmatizer.lemmatize(word) for word in row.split()]))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
df.to_csv('../dataset/set_01/set_01_new.csv',
          header=None, index=False, encoding='ascii')


In [8]:
# drop rows can't be used

df = df.dropna()
df = df.drop_duplicates()

print(df)


      spam                                            content
0        1  urgent call number landline complimentary numb...
1        1  number urgent number nd attempt contact u u nu...
2        1  free number st week number nokia tone number u...
3        1  urgent call number landline complementary numb...
4        1  winner valued network customer selected receiv...
...    ...                                                ...
5801     0                      lol grin babe thanks thinking
5802     0                       man bus slow think gonna get
5803     0  hope text meet smiling let text give reason sm...
5804     0  case wake wondering forgot take care something...
5852     0            hey gal u wanna meet number dinner n te

[5072 rows x 2 columns]


In [9]:
from sklearn.model_selection import train_test_split

X_train = None
X_test = None
y_train = None
y_test = None


def useVectorizer(vectorizer):
    global X_train, X_test, y_train, y_test

    X = vectorizer.fit_transform(df.content).toarray()
    y = df.spam

    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=0)

    print(X_train)
    print(X_train.shape)


In [10]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score


def useClassifier(classifier):
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(accuracy)


def useAllClassifier():
    useClassifier(AdaBoostClassifier(n_estimators=100, random_state=0))
    useClassifier(DecisionTreeClassifier(random_state=0))
    useClassifier(DecisionTreeRegressor(random_state=0))
    useClassifier(GaussianNB())
    useClassifier(GradientBoostingClassifier(n_estimators=100,
                  learning_rate=1.0, max_depth=1, random_state=0))
    useClassifier(KMeans(n_clusters=2, random_state=0))
    useClassifier(KNeighborsClassifier(n_neighbors=3))
    useClassifier(LogisticRegression(random_state=0))
    useClassifier(MultinomialNB())
    useClassifier(RandomForestClassifier(max_depth=2, random_state=0))
    useClassifier(SGDClassifier(max_iter=1000, tol=1e-3))


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

useVectorizer(CountVectorizer())
useAllClassifier()

print()

useVectorizer(TfidfVectorizer())
useAllClassifier()

print()

useVectorizer(HashingVectorizer(n_features=2**4))
useAllClassifier()


(5072, 7084)
(5072,)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(4057, 7084)
0.9773399014778326
0.9665024630541872
0.9665024630541872
0.8866995073891626
0.9556650246305419




0.9527093596059113
0.9556650246305419
0.9724137931034482
0.961576354679803
0.8955665024630541
0.9753694581280788

(5072, 7084)
(5072,)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(4057, 7084)
0.9822660098522168
0.961576354679803
0.961576354679803
0.8827586206896552
0.9507389162561576




0.05123152709359606
0.941871921182266
0.9655172413793104
0.9665024630541872
0.8955665024630541
0.9793103448275862

(5072, 16)
(5072,)
[[ 0.         -0.30151134  0.30151134 ...  0.30151134  0.
   0.30151134]
 [ 0.          0.         -0.42857143 ...  0.14285714  0.14285714
   0.        ]
 [ 0.          0.          0.70710678 ...  0.          0.
   0.        ]
 ...
 [ 0.          0.         -0.70710678 ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
  -0.5       ]
 [ 0.          0.81649658  0.         ...  0.          0.
   0.        ]]
(4057, 16)
0.9349753694581281
0.9192118226600985
0.9192118226600985
0.9211822660098522
0.9182266009852217
0.21970443349753693
0.9349753694581281
0.941871921182266




ValueError: Negative values in data passed to MultinomialNB (input X)