In [1]:
import pandas as pd

UTF_8 = 'utf-8'

# read csv, no header
df1 = pd.read_csv('../dataset/set_01/set_01.csv',
                  header=None, encoding=UTF_8)

# define header
df1.rename(columns={0: 'spam', 1: 'content'}, inplace=True)

# transform content to lower case before any further process
df1.content = df1.content.str.lower()

print(df1)


      spam                                            content
0        1  urgent! call 09061749602 from landline. your c...
1        1  +449071512431 urgent! this is the 2nd attempt ...
2        1  free for 1st week! no1 nokia tone 4 ur mob eve...
3        1  urgent! call 09066612661 from landline. your c...
4        1  winner!! as a valued network customer you have...
...    ...                                                ...
5881     0                ok lor ?_ reaching then message me.
5882     1  marvel mobile play the official ultimate spide...
5883     0             it???s reassuring in this crazy world.
5884     1  asked 3mobile if 0870 chatlines inclu in free ...
5885     0              will ?_ b going to esplanade fr home?

[5886 rows x 2 columns]


In [2]:
# read csv, get two columns
df2 = pd.read_csv('../dataset/set_02/SMS collect form (Responses) - Form Responses 1.csv',
                  usecols=['Content', 'Spam or Ham'], encoding=UTF_8)

# rename and reorder columns
df2 = df2.rename(columns={'Content': 'content', 'Spam or Ham': 'spam'})
df2 = df2.reindex(columns=['spam', 'content'])

# Spam = 1, Ham = 0
df2.spam = df2.spam.map({'Spam': 1, 'Ham': 0})

# transform content to lower case before any further process
df2.content = df2.content.str.lower()

print(df2)


     spam                                            content
0       1    [netflix] : 無法處理你的自動付款。你的帳戶將被禁用。t.co/ntakhpfwqr
1       1  您好！我系橙橙\n邀請您加细妹微信：76169639 睇朋友圈保有您喜歡嘅哦！\n有國際大牌...
2       1  您好！{本店支持淘宝店铺下单，淘宝店铺付款}\n請加我微信（wechat）: 1198632...
3       0  aeon: 即日起至3月5日，登入「aeon 香港」手機應用程式或aeon網上客戶服務，申請...
4       1  恒生hang seng：信用卡「現金分期」計劃：\n已為你預先批核多一筆現金，你的稅季限定個...
..    ...                                                ...
143     0  渣打香港:您的渣打信用卡結尾7645於03/27 在 vennic limit 有一項hkd...
144     0  nti have received your order for\n1 x cwp - ca...
145     0         your nti mall verification code is: 467255
146     0    【阿里巴巴】验证码798505，您正在登录验证，切勿将验证码泄露于他人，验证码15分钟内有效。
147     0         your requested authentication code: 281577

[148 rows x 2 columns]


In [3]:
import re
from zhconv import convert


def scToTc(text):
    text = convert(text, 'zh-tw')

    return text


# for testing only
print(re.sub(r'[\r\n]', ' ', df2.content[26]))
print(re.sub(r'[\r\n]', ' ', scToTc(df2.content[26])))
print(re.sub(r'[\r\n]', ' ', df2.content[146]))
print(re.sub(r'[\r\n]', ' ', scToTc(df2.content[146])))

df2.content = df2.content.map(scToTc)

print(df2)


亲爱的chankwan pok先生或女士，欢迎您加入“心享”计划 hpr”，您的会员编码为：111023621183，初始密码为：006653。恭喜您可尊享通过“中旅酒店心享会”公众号推荐朋友加入会员活动，被推荐的新会员还可以领取代金券礼包！详询400-669-0000（大陆地区）、852-36040000（港澳地区）。
親愛的chankwan pok先生或女士，歡迎您加入「心享」計劃 hpr」，您的會員編碼為：111023621183，初始密碼為：006653。恭喜您可尊享通過「中旅酒店心享會」公眾號推薦朋友加入會員活動，被推薦的新會員還可以領取代金券禮包！詳詢400-669-0000（大陸地區）、852-36040000（港澳地區）。
【阿里巴巴】验证码798505，您正在登录验证，切勿将验证码泄露于他人，验证码15分钟内有效。
【阿里巴巴】驗證碼798505，您正在登錄驗證，切勿將驗證碼洩露於他人，驗證碼15分鐘內有效。
     spam                                            content
0       1    [netflix] : 無法處理你的自動付款。你的帳戶將被禁用。t.co/ntakhpfwqr
1       1  您好！我系橙橙\n邀請您加細妹微信：76169639 睇朋友圈保有您喜歡嘅哦！\n有國際大牌...
2       1  您好！{本店支持淘寶店鋪下單，淘寶店鋪付款}\n請加我微信（wechat）: 1198632...
3       0  aeon: 即日起至3月5日，登入「aeon 香港」手機應用程式或aeon網上客戶服務，申請...
4       1  恒生hang seng：信用卡「現金分期」計劃：\n已為你預先批核多一筆現金，你的稅季限定個...
..    ...                                                ...
143     0  渣打香港:您的渣打信用卡結尾7645於03/27 在 vennic limit 有一項hkd...
144     0  nti have received your order for\n1 x cwp - ca...
145     0         your nti mall

In [4]:
# merge two datasets
df = pd.concat([df1, df2], ignore_index=True)

print(df)


      spam                                            content
0        1  urgent! call 09061749602 from landline. your c...
1        1  +449071512431 urgent! this is the 2nd attempt ...
2        1  free for 1st week! no1 nokia tone 4 ur mob eve...
3        1  urgent! call 09066612661 from landline. your c...
4        1  winner!! as a valued network customer you have...
...    ...                                                ...
6029     0  渣打香港:您的渣打信用卡結尾7645於03/27 在 vennic limit 有一項hkd...
6030     0  nti have received your order for\n1 x cwp - ca...
6031     0         your nti mall verification code is: 467255
6032     0    【阿里巴巴】驗證碼798505，您正在登錄驗證，切勿將驗證碼洩露於他人，驗證碼15分鐘內有效。
6033     0         your requested authentication code: 281577

[6034 rows x 2 columns]


In [5]:
def deContact(text):
    text = str(text)

    # use regex for handling some ' is ?
    # ??? if asdfklhli'maksdjhfl

    # specific
    text = re.sub(r'i[\'?]m', 'i am', text)
    text = re.sub(r'let[\'?]s', 'let us', text)
    text = re.sub(r'don[\'?]t', 'do not', text)
    text = re.sub(r'can[\'?]t', 'can not', text)
    text = re.sub(r'won[\'?]t', 'will not', text)

    # general
    text = re.sub(r'[\'?]s', ' is', text)
    text = re.sub(r'[\'?]re', ' are', text)
    text = re.sub(r'[\'?]ll', ' will', text)
    text = re.sub(r'[\'?]d', ' would', text)
    text = re.sub(r'[\'?]ve', ' have', text)
    text = re.sub(r'n[\'?]t', ' not', text)

    return text


# for testing only
print(df.content[24])
print(deContact(df.content[24]))
print(df.content[110])
print(deContact(df.content[110]))


sorry i missed your call let's talk when you have the time. i'm on 07090201529
sorry i missed your call let us talk when you have the time. i am on 07090201529
i luv u soo much u don?t understand how special u r 2 me ring u 2morrow luv u xxx
i luv u soo much u do not understand how special u r 2 me ring u 2morrow luv u xxx


In [6]:
# const for controlling the level of data cleansing
DECONTACT = True

REPLACE_HYPERLINK = True
REPLACE_EMAIL_ADDRESS = True
REPLACE_CURRENCY_SIGN = True
REPLACE_NUMBER = True
REPLACE_SPECIAL_CHAR = True
REPLACE_NEW_LINE = True
REPLACE_WHITE_SPACE = True

LEMMATIZE = True
REMOVE_STOP_WORDS = True


In [7]:
if DECONTACT:
    df.content = df.content.map(deContact)

if REPLACE_HYPERLINK:
    df.content = df.content.map(
        lambda row: re.sub(r'http[s]?:\/\/[\w\/.?=-]+', ' link ', row))

if REPLACE_EMAIL_ADDRESS:
    df.content = df.content.map(
        lambda row: re.sub(r'[\w\.+]+@[\w\.]+\.[a-z]{2,}', ' email ', row))

if REPLACE_CURRENCY_SIGN:
    df.content = df.content.map(lambda row: re.sub(r'[\$€£¥]', ' money ', row))

if REPLACE_NUMBER:
    df.content = df.content.map(lambda row: re.sub(r'[\d]+', ' number ', row))

if REPLACE_SPECIAL_CHAR:
    df.content = df.content.map(lambda row: re.sub(
        r'[^a-zA-Z0-9\u4E00-\u9FFF]+', ' ', row))

if REPLACE_NEW_LINE:
    df.content = df.content.map(lambda row: re.sub(r'[\r\n]', ' ', row))

if REPLACE_WHITE_SPACE:
    df.content = df.content.map(lambda row: re.sub(r'[\s]{2,}', ' ', row))
    df.content = df.content.map(lambda row: re.sub(r'^[\s]+|[\s]+$', '', row))


In [8]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stopwords = stopwords.words('english')

df.content = df.content.map(
    lambda row: ' '.join([word for word in row.split() if word not in (stopwords)]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

df.content = df.content.map(
    lambda row: ' '.join([lemmatizer.lemmatize(word) for word in row.split()]))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
from pathlib import Path, PurePath


def transFileToTc(sc_path, force=False):
    path = Path(sc_path)

    folder = path.parent.absolute()
    stem = path.stem
    suffix = path.suffix

    tc_path = PurePath(folder, stem + "_tc" + suffix)

    if not Path(tc_path).is_file() or force:
        sc_file = open(sc_path, 'r', encoding=UTF_8)

        tc_content = scToTc(sc_file.read())
        tc_content = tc_content.lower()

        tc_array = tc_content.split('\n')
        tc_array = list(dict.fromkeys(tc_array))

        tc_file = open(tc_path, 'w', encoding=UTF_8)
        tc_file.write("\n".join(tc_array))

    return str(tc_path)


dict_big_tc = transFileToTc('./jieba/dict_big.txt', True)


In [11]:
import jieba

jieba.load_userdict(dict_big_tc)
# jieba.load_userdict('./jieba/dict_custom.txt')

df.content = df.content.map(lambda row: ' '.join(jieba.cut(row)))
df.content = df.content.map(lambda row: re.sub(r'[\s]{2,}', ' ', row))


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.cache
Loading model cost 0.965 seconds.
Prefix dict has been built successfully.


In [12]:
# drop rows can't be used
df = df.dropna()
df = df.drop_duplicates()

print(df)


      spam                                            content
0        1  urgent call number landline complimentary numb...
1        1  number urgent number nd attempt contact u u nu...
2        1  free number st week number nokia tone number u...
3        1  urgent call number landline complementary numb...
4        1  winner valued network customer selected receiv...
...    ...                                                ...
6028     0  渣打 香港 您 的 渣打 信用卡 結尾 number 於 number number 在 m...
6029     0  渣打 香港 您 的 渣打 信用卡 結尾 number 於 number number 在 v...
6030     0  nti received order number x cwp car wash packa...
6031     0                  nti mall verification code number
6033     0               requested authentication code number

[5194 rows x 2 columns]


In [13]:
df.to_csv('../dataset/set_01_02_new.csv',
          header=None, index=False, encoding=UTF_8)


In [14]:
from sklearn.model_selection import train_test_split

X_train = None
X_test = None
y_train = None
y_test = None


def useVectorizer(vectorizer):
    global X_train, X_test, y_train, y_test

    X = vectorizer.fit_transform(df.content).toarray()
    y = df.spam

    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=0)

    print(X_train)
    print(X_train.shape)


In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score


def useClassifier(classifier):
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(accuracy)


def useAllClassifier():
    useClassifier(AdaBoostClassifier(n_estimators=100, random_state=0))
    useClassifier(DecisionTreeClassifier(random_state=0))
    useClassifier(DecisionTreeRegressor(random_state=0))
    useClassifier(GaussianNB())
    useClassifier(GradientBoostingClassifier(n_estimators=100,
                  learning_rate=1.0, max_depth=1, random_state=0))
    useClassifier(KMeans(n_clusters=2, random_state=0))
    useClassifier(KNeighborsClassifier(n_neighbors=3))
    useClassifier(LogisticRegression(random_state=0))
    useClassifier(MultinomialNB())
    useClassifier(RandomForestClassifier(max_depth=2, random_state=0))
    useClassifier(SGDClassifier(max_iter=1000, tol=1e-3))


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

useVectorizer(CountVectorizer())
useAllClassifier()

# not run, will error
if False:
    print()

    useVectorizer(TfidfVectorizer())
    useAllClassifier()

    print()

    useVectorizer(HashingVectorizer(n_features=2**4))
    useAllClassifier()


(5194, 8545)
(5194,)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(4155, 8545)
0.9643888354186718
0.9624639076034649
0.9624639076034649
0.8960538979788258
0.9499518768046198




0.9278152069297402
0.9432146294513956
0.971126082771896
0.9884504331087585
0.8671799807507219
0.9788257940327237


In [17]:
model = MultinomialNB().fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)


0.9884504331087585
