In [1]:
# set_01 was merged from the samples below by a .NET program
# all of them are English messages
# https://github.com/semnan-university-ai/SMS-Spam-Collection/blob/main/english_big.txt
# https://github.com/rodrigodelmonte/pytext-lab/blob/master/dataset/smsspam_train.tsv
# https://github.com/rodrigodelmonte/pytext-lab/blob/master/dataset/smsspam_test.tsv
# https://github.com/Peviroy/MailChecker/blob/master/data/spam.csv

import pandas as pd

# encoding name passed to every file read/write in this notebook
UTF_8 = 'utf-8'

# the csv is read with header=None, so the two positional columns are
# labelled right after loading: 0 -> spam flag, 1 -> message text
df1 = pd.read_csv(
    '../dataset/set_01/set_01.csv',
    header=None,
    encoding=UTF_8,
).rename(columns={0: 'spam', 1: 'content'})

# lower-case the message text up front so later steps only see one casing
df1['content'] = df1['content'].str.lower()

print(df1)


      spam                                            content
0        1  urgent! call 09061749602 from landline. your c...
1        1  +449071512431 urgent! this is the 2nd attempt ...
2        1  free for 1st week! no1 nokia tone 4 ur mob eve...
3        1  urgent! call 09066612661 from landline. your c...
4        1  winner!! as a valued network customer you have...
...    ...                                                ...
5881     0                ok lor ?_ reaching then message me.
5882     1  marvel mobile play the official ultimate spide...
5883     0             it???s reassuring in this crazy world.
5884     1  asked 3mobile if 0870 chatlines inclu in free ...
5885     0              will ?_ b going to esplanade fr home?

[5886 rows x 2 columns]


In [2]:
# set_02 contains samples entered by our team members through a Google Form

import re
from zhconv import convert


def scToTc(text):
    """Return ``text`` converted with ``zhconv.convert`` using the 'zh-tw' locale.

    Used throughout the notebook to normalise Chinese content to Traditional
    Chinese (Taiwan) before cleaning and segmentation.
    """
    return convert(text, 'zh-tw')


# load only the two form columns we need, then rename them to the shared
# schema and put them in the shared order (spam, content)
df2 = (
    pd.read_csv(
        '../dataset/set_02/SMS collect form (Responses) - Form Responses 1.csv',
        header=0,
        usecols=['Content', 'Spam or Ham'],
        encoding=UTF_8,
    )
    .rename(columns={'Content': 'content', 'Spam or Ham': 'spam'})
    .reindex(columns=['spam', 'content'])
)

# Spam = 1, Ham = 0
df2['spam'] = df2['spam'].map({'Spam': 1, 'Ham': 0})

# transform content to lower case before any further process
df2['content'] = df2['content'].str.lower()

# for testing only: print two samples before and after the conversion,
# with line breaks flattened so each message stays on one output line
for sample_idx in (26, 146):
    sample = df2.content[sample_idx]
    print(re.sub(r'[\r\n]', ' ', sample))
    print(re.sub(r'[\r\n]', ' ', scToTc(sample)))

# transform content to traditional chinese (taiwan)
df2['content'] = df2['content'].map(scToTc)

print(df2)


亲爱的chankwan pok先生或女士，欢迎您加入“心享”计划 hpr”，您的会员编码为：111023621183，初始密码为：006653。恭喜您可尊享通过“中旅酒店心享会”公众号推荐朋友加入会员活动，被推荐的新会员还可以领取代金券礼包！详询400-669-0000（大陆地区）、852-36040000（港澳地区）。
親愛的chankwan pok先生或女士，歡迎您加入「心享」計劃 hpr」，您的會員編碼為：111023621183，初始密碼為：006653。恭喜您可尊享通過「中旅酒店心享會」公眾號推薦朋友加入會員活動，被推薦的新會員還可以領取代金券禮包！詳詢400-669-0000（大陸地區）、852-36040000（港澳地區）。
【阿里巴巴】验证码798505，您正在登录验证，切勿将验证码泄露于他人，验证码15分钟内有效。
【阿里巴巴】驗證碼798505，您正在登錄驗證，切勿將驗證碼洩露於他人，驗證碼15分鐘內有效。
     spam                                            content
0       1    [netflix] : 無法處理你的自動付款。你的帳戶將被禁用。t.co/ntakhpfwqr
1       1  您好！我系橙橙\n邀請您加細妹微信：76169639 睇朋友圈保有您喜歡嘅哦！\n有國際大牌...
2       1  您好！{本店支持淘寶店鋪下單，淘寶店鋪付款}\n請加我微信（wechat）: 1198632...
3       1  aeon: 即日起至3月5日，登入「aeon 香港」手機應用程式或aeon網上客戶服務，申請...
4       1  恒生hang seng：信用卡「現金分期」計劃：\n已為你預先批核多一筆現金，你的稅季限定個...
..    ...                                                ...
495     0                             327886 是你的moneyhero驗證碼
496     0  nti has received your order\n[services redeeme...
497     0  渣打香港:您的渣打信用卡結尾8492於0

In [3]:
# set_04
# https://github.com/youzan/YZSpamFilter/blob/master/spam.txt
# https://github.com/youzan/YZSpamFilter/blob/master/ham.txt

# each file is loaded headerless; column 1 is then set to the label
# NOTE(review): read_csv splits on commas, so this assumes the text lines
# contain none -- confirm against the raw files
df4_spam = pd.read_csv('../dataset/set_04/spam.txt', header=None, encoding=UTF_8)
df4_spam[1] = 1

df4_ham = pd.read_csv('../dataset/set_04/ham.txt', header=None, encoding=UTF_8)
df4_ham[1] = 0

# stack spam and ham, name the columns and put them in (spam, content) order
df4 = (
    pd.concat([df4_spam, df4_ham], ignore_index=True)
    .rename(columns={0: 'content', 1: 'spam'})
    .reindex(columns=['spam', 'content'])
)

# lower-case first, then convert to traditional chinese
df4['content'] = df4['content'].str.lower().map(scToTc)

print(df4)


       spam                                            content
0         1  官網認證日入官網認證日入官網認證日入官網認證日入官網認證日入官網認證日入官網認證日入官網認證...
1         1  我有芳罰跟著芳法操作不愁沒克源而且都是靜準顧克主動家你的才是正真需要鏟品的聲明我們不是佳人軟...
2         1  官方認證急招打字員官方認證急招打字員官方認證急招打字員官方認證急招打字員官方認證急招打字員官...
3         1                        誠日入誠日入誠日入誠日入誠日入誠日入誠日入誠日入誠日入
4         1      官方認證急招打字員官方認證急招打字員官方認證急招打字員官方認證急招打字員官方認證急招打字員
...     ...                                                ...
15959     0   我本人申請的有贊帳號為什麼不能進行微信公眾號授權我本人申請的有贊帳號為什麼不能進行微信公眾號授權
15960     0                   怎麼綁定不了公眾號呢有圖點我有微信公眾號立即設置進入就變成這樣了
15961     0  為什麼訂單已生成卻沒有發貨按鈕選擇我們的蛋糕有贊微商城客人昨天上午拍了個蛋糕我們已在下午送貨...
15962     0                        為什麼我註冊後再次登陸打不開頁面電腦上顯示的是這個內容
15963     0                          有贊支持商品素材導出嗎有贊支持商品素材導出嗎格式的

[15964 rows x 2 columns]


In [4]:
# stack set_01, set_02 and set_04 row-wise and renumber the index from 0
df = pd.concat(objs=[df1, df2, df4], axis=0, ignore_index=True)

print(df)


       spam                                            content
0         1  urgent! call 09061749602 from landline. your c...
1         1  +449071512431 urgent! this is the 2nd attempt ...
2         1  free for 1st week! no1 nokia tone 4 ur mob eve...
3         1  urgent! call 09066612661 from landline. your c...
4         1  winner!! as a valued network customer you have...
...     ...                                                ...
22345     0   我本人申請的有贊帳號為什麼不能進行微信公眾號授權我本人申請的有贊帳號為什麼不能進行微信公眾號授權
22346     0                   怎麼綁定不了公眾號呢有圖點我有微信公眾號立即設置進入就變成這樣了
22347     0  為什麼訂單已生成卻沒有發貨按鈕選擇我們的蛋糕有贊微商城客人昨天上午拍了個蛋糕我們已在下午送貨...
22348     0                        為什麼我註冊後再次登陸打不開頁面電腦上顯示的是這個內容
22349     0                          有贊支持商品素材導出嗎有贊支持商品素材導出嗎格式的

[22350 rows x 2 columns]


In [5]:
import contractions


# (pattern, replacement) pairs, applied in this order.
# - The apostrophe is matched as ['?]+ because some source files show it as one
#   or more '?' characters (e.g. "don?t", "it???s").
# - Every rule is anchored on word boundaries so that a genuine question mark
#   glued to the next word ("ok?see you", "hi?my name") is left untouched.
# - Specific forms come first so the general suffix rules cannot mangle them.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # specific
        (r"\bi['?]+m\b", 'i am'),
        (r"\blet['?]+s\b", 'let us'),
        (r"\bdon['?]+t\b", 'do not'),
        (r"\bcan['?]+t\b", 'can not'),
        (r"\bwon['?]+t\b", 'will not'),
        # general (suffix must directly follow a word character)
        (r"(?<=\w)['?]+s\b", ' is'),
        (r"(?<=\w)['?]+re\b", ' are'),
        (r"(?<=\w)['?]+ll\b", ' will'),
        (r"(?<=\w)['?]+d\b", ' would'),
        (r"(?<=\w)['?]+ve\b", ' have'),
        (r"(?<=\w)n['?]+t\b", ' not'),
    )
)


def expandContraction(text):
    """Expand English contractions in ``text`` and return the new string.

    ``text`` is first coerced with ``str()``. The regex rules in
    ``_CONTRACTION_RULES`` run first (they also handle apostrophes that appear
    as '?'), and the result is then passed through ``contractions.fix``.
    """
    text = str(text)

    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)

    # library
    return contractions.fix(text)


# for testing only: show two sample messages before and after expansion
for sample_idx in (24, 110):
    print(df.content[sample_idx])
    print(expandContraction(df.content[sample_idx]))


sorry i missed your call let's talk when you have the time. i'm on 07090201529
sorry i missed your call let us talk when you have the time. i am on 07090201529
i luv u soo much u don?t understand how special u r 2 me ring u 2morrow luv u xxx
i love you soo much you do not understand how special you r 2 me ring you 2morrow love you xxx


In [6]:
# re.ASCII keeps \w from matching CJK characters, so Chinese text written right
# next to a link or an address is not swallowed into the placeholder.
# The link class also covers & % # ~ : + so query strings and ports stay part
# of the link instead of leaking into the text as stray tokens.
_URL_RE = re.compile(r'https?://[\w/.?=&%#~:+-]+', re.ASCII)
_EMAIL_RE = re.compile(r'[\w.+]+@[\w.]+\.[a-z]{2,}', re.ASCII)
_CURRENCY_RE = re.compile(r'[$€£¥]')
_NUMBER_RE = re.compile(r'\d+')
# anything other than a-z, A-Z, 0-9 and chinese (U+4E00..U+9FFF)
_NON_TEXT_RE = re.compile(r'[^a-zA-Z0-9\u4E00-\u9FFF]+')


def _cleanText(text):
    """Apply every cleaning step to one message and return the cleaned string."""
    text = expandContraction(text)
    text = _URL_RE.sub(' link ', text)
    text = _EMAIL_RE.sub(' email ', text)
    text = _CURRENCY_RE.sub(' money ', text)
    text = _NUMBER_RE.sub(' number ', text)
    # Each run of other characters (punctuation, line breaks, repeated blanks)
    # becomes one space, so no separate newline/whitespace pass is needed.
    text = _NON_TEXT_RE.sub(' ', text)
    return text.strip()


def cleanData(df):
    """Normalise ``df.content`` in place; returns ``None``.

    Per message, in order: expand contractions, replace http(s) links with
    'link', e-mail addresses with 'email', currency signs with 'money' and
    digit runs with 'number', then reduce everything that is not a-z, A-Z,
    0-9 or chinese to single spaces and trim both ends.
    """
    df.content = df.content.map(_cleanText)


# clean the merged data (df holds set_01, set_02 and set_04 at this point);
# cleanData rewrites df.content in place
cleanData(df)

print(df)


       spam                                            content
0         1  urgent call number from landline your complime...
1         1  number urgent this is the number nd attempt to...
2         1  free for number st week no number nokia tone n...
3         1  urgent call number from landline your compleme...
4         1  winner as a valued network customer you have b...
...     ...                                                ...
22345     0   我本人申請的有贊帳號為什麼不能進行微信公眾號授權我本人申請的有贊帳號為什麼不能進行微信公眾號授權
22346     0                   怎麼綁定不了公眾號呢有圖點我有微信公眾號立即設置進入就變成這樣了
22347     0  為什麼訂單已生成卻沒有發貨按鈕選擇我們的蛋糕有贊微商城客人昨天上午拍了個蛋糕我們已在下午送貨...
22348     0                        為什麼我註冊後再次登陸打不開頁面電腦上顯示的是這個內容
22349     0                          有贊支持商品素材導出嗎有贊支持商品素材導出嗎格式的

[22350 rows x 2 columns]


In [7]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')


def stopWords(df):
    """Remove NLTK English stop words from ``df.content`` in place.

    Each message is split on whitespace, tokens found in
    ``stopwords.words('english')`` are dropped, and the rest is re-joined with
    single spaces. Returns ``None``.
    """
    # a set gives constant-time membership tests; the list returned by NLTK
    # would be scanned once per token
    words = frozenset(stopwords.words('english'))

    df.content = df.content.map(lambda row: ' '.join(word for word in row.split() if word not in words))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from nltk.stem import PorterStemmer

nltk.download('punkt')


def stemming(df):
    """Replace every whitespace-separated token of ``df.content`` by its Porter stem, in place."""
    porter = PorterStemmer()

    def _stem_row(text):
        # stem token by token, then rebuild the message with single spaces
        return ' '.join(map(porter.stem, text.split()))

    df['content'] = df['content'].map(_stem_row)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')


def lemmatization(df):
    """Replace every whitespace-separated token of ``df.content`` by its WordNet lemma, in place."""
    wordnet = WordNetLemmatizer()

    def _lemmatize_row(text):
        # lemmatize token by token, then rebuild the message with single spaces
        return ' '.join(map(wordnet.lemmatize, text.split()))

    df['content'] = df['content'].map(_lemmatize_row)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
from pathlib import Path, PurePath


def transFileToTc(sc_path, force=False):
    """Create a lower-cased, ``scToTc``-converted copy of a text file.

    The copy is written next to the source as ``<stem>_tc<suffix>``; repeated
    lines are dropped, keeping the first occurrence and the original order.

    Args:
        sc_path: path of the source file.
        force: when true, rebuild the copy even if it already exists.

    Returns:
        The absolute path of the ``_tc`` file as a string.
    """
    path = Path(sc_path)
    tc_path = path.parent.absolute() / (path.stem + '_tc' + path.suffix)

    if force or not tc_path.is_file():
        # `with` closes both handles; the previous version left them open
        with open(sc_path, 'r', encoding=UTF_8) as sc_file:
            tc_content = scToTc(sc_file.read()).lower()

        # dict.fromkeys removes duplicate lines while preserving order
        tc_lines = list(dict.fromkeys(tc_content.split('\n')))

        with open(tc_path, 'w', encoding=UTF_8) as tc_file:
            tc_file.write('\n'.join(tc_lines))

    return str(tc_path)


# use it later, for chinese text segmentation
# force=True: the '_tc' copy is rebuilt on every run, even if it already exists
dict_big_tc = transFileToTc('./jieba/dict_big.txt', True)


In [11]:
# enhance dictionary for chinese text segmentation
def enhanceDictionary(tc_path, other_list):
    """Append entries from other dictionary files to the file at ``tc_path``.

    Every file in ``other_list`` is read, converted with ``scToTc`` and
    lower-cased. A line is appended to ``tc_path`` only when its first
    whitespace-separated token is not yet a key, counting both the keys already
    in ``tc_path`` and the keys added earlier during this call. Blank and
    whitespace-only lines are ignored.

    Does nothing when ``tc_path`` is not an existing file. Returns ``None``.
    """
    if not Path(tc_path).is_file():
        return

    with open(tc_path, 'r', encoding=UTF_8) as tc_file:
        tc_content = tc_file.read()

    # keys = first token of every non-blank line; a set gives fast lookups
    known_keys = {line.split()[0] for line in tc_content.split('\n') if line.strip()}

    new_lines = []

    for other_path in other_list:
        with open(other_path, 'r', encoding=UTF_8) as other_file:
            other_content = scToTc(other_file.read()).lower()

        for line in other_content.split('\n'):
            fields = line.split()

            if not fields or fields[0] in known_keys:
                continue

            # remember the key at once so a later line or file cannot add it again
            known_keys.add(fields[0])
            new_lines.append(line)

    if not new_lines:
        return

    # keep the first appended entry from being glued to an unterminated last line
    if tc_content and not tc_content.endswith('\n'):
        tc_content += '\n'

    with open(tc_path, 'w', encoding=UTF_8) as tc_file:
        tc_file.write(tc_content + '\n'.join(new_lines) + '\n')


# merge two more dictionaries into the dict_big_tc file;
# lines whose first token is already a key in that file are skipped
enhanceDictionary(dict_big_tc, ['./jieba/dict_original.txt', './jieba/dict_taiwan.txt'])


In [12]:
import jieba


def segmentation(df):
    """Segment ``df.content`` with jieba, in place.

    The user dictionary at ``dict_big_tc`` is loaded on every call. Each
    message is then cut by jieba, the tokens are joined with spaces, and runs
    of two or more whitespace characters are collapsed to one space.
    """
    jieba.load_userdict(dict_big_tc)
    # jieba.load_userdict('./jieba/dict_custom.txt')

    def _segment_row(text):
        spaced = ' '.join(jieba.cut(text))
        return re.sub(r'[\s]{2,}', ' ', spaced)

    df['content'] = df['content'].map(_segment_row)


In [13]:
# set_03
# https://github.com/Cypher-Z/FBS_SMS_Dataset

set_03_dir = '../dataset/set_03/'
# sorted() makes the row order of df3 independent of the order in which the
# file system lists the folder; sub-folders are skipped
set_03_files = sorted(path for path in Path(set_03_dir).glob('*') if path.is_file())

# since df3 is well processed, special handling for some keywords:
# upper-case placeholder in the files -> token used by cleanData.
# 'CELLPHONE' has to come before 'PHONE', which is a substring of it.
set_03_placeholders = (
    ('URL', ' link '),
    ('HOTLINE', ' number '),
    ('CELLPHONE', ' number '),
    ('PHONE', ' number '),
    ('DIGIT', ' number '),
    ('NAME', ' '),
    ('PLACE', ' '),
)

# collect the per-file frames and concatenate once at the end, instead of
# growing df3 with one concat per file starting from an empty frame
df3_parts = []

for file in set_03_files:
    # every row of set_03 is labelled as spam
    df3_one = pd.read_csv(file, header=None, encoding=UTF_8)
    df3_one[1] = 1

    # define header and reorder columns
    df3_one = (
        df3_one
        .rename(columns={0: 'content', 1: 'spam'})
        .reindex(columns=['spam', 'content'])
    )

    # placeholders are matched while they are still upper case
    for placeholder, token in set_03_placeholders:
        df3_one.content = df3_one.content.str.replace(placeholder, token, regex=False)

    # transform content to lower case, then to tc
    df3_one.content = df3_one.content.str.lower().map(scToTc)

    cleanData(df3_one)

    # keep only rows whose cleaned content still contains whitespace
    df3_one = df3_one[df3_one.content.str.contains(r'[\s]+')]

    df3_parts.append(df3_one)

# an empty folder yields an empty frame, as before
df3 = pd.concat(df3_parts, ignore_index=True) if df3_parts else pd.DataFrame()


In [14]:
import itertools

actions = ['STOP_WORDS', 'STEMMING', 'LEMMATIZATION']

# one output file per on/off combination of the three optional steps (2**3 = 8)
for p in itertools.product([0, 1], repeat=3):
    params = dict(zip(actions, p))

    # the flags stay module-level names because they are part of the file name
    STOP_WORDS, STEMMING, LEMMATIZATION = (params[name] for name in actions)

    # every combination starts again from the cleaned, merged frame
    newDf = df.copy()

    if STOP_WORDS:
        stopWords(newDf)

    if STEMMING:
        stemming(newDf)

    if LEMMATIZATION:
        lemmatization(newDf)

    segmentation(newDf)

    # df3 is appended after the optional steps and the segmentation
    newDf = (
        pd.concat([newDf, df3], ignore_index=True)
        .dropna()
        .drop_duplicates()
    )

    newDf.to_csv(f'../dataset/set_01_02_03_04_{STOP_WORDS}_{STEMMING}_{LEMMATIZATION}_new.csv', header=None, index=False, encoding=UTF_8)

    # NOTE(review): rebinding df here means re-running this cell starts from the
    # already processed frame -- confirm this is what later cells rely on
    if (STOP_WORDS, STEMMING, LEMMATIZATION) == (1, 1, 1):
        df = newDf


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.cache
Loading model cost 1.201 seconds.
Prefix dict has been built successfully.
