In [None]:
import pandas as pd

UTF_8 = 'utf-8'

# set_01 has no header row; name column 0 'spam' and column 1 'content'
df1 = pd.read_csv(
    '../dataset/set_01/set_01.csv',
    header=None,
    encoding=UTF_8,
).rename(columns={0: 'spam', 1: 'content'})

# lower-case the text up front so every later step sees a single casing
df1['content'] = df1['content'].str.lower()

print(df1)


In [None]:
# load only the message text and its label from the set_02 export
df2 = pd.read_csv(
    '../dataset/set_02/SMS collect form (Responses) - Form Responses 1.csv',
    usecols=['Content', 'Spam or Ham'],
    encoding=UTF_8,
)

# same column names and order as df1: label first, then text
df2 = (
    df2
    .rename(columns={'Content': 'content', 'Spam or Ham': 'spam'})
    .reindex(columns=['spam', 'content'])
)

# numeric label: Spam -> 1, Ham -> 0
df2['spam'] = df2['spam'].map({'Spam': 1, 'Ham': 0})

# lower-case the text up front so every later step sees a single casing
df2['content'] = df2['content'].str.lower()

print(df2)


In [None]:
import re
from zhconv import convert


def scToTc(text):
    """Return ``text`` converted by zhconv's ``convert`` with the 'zh-tw' locale."""
    return convert(text, 'zh-tw')


# for testing only: print two sample rows before and after conversion,
# with line breaks flattened so each sample stays on one output line
for sample_label in (26, 146):
    sample_text = df2.content[sample_label]
    print(re.sub(r'[\r\n]', ' ', sample_text))
    print(re.sub(r'[\r\n]', ' ', scToTc(sample_text)))

# convert every message in set_02
df2.content = df2.content.map(scToTc)

print(df2)


In [None]:
# stack set_01 and set_02 into a single frame with a fresh 0..n-1 index
merged_sources = [df1, df2]
df = pd.concat(merged_sources, ignore_index=True)

print(df)


In [None]:
def expandContract(text):
    """Expand common English contractions in ``text``.

    The input is coerced with ``str()`` first, so non-string values are
    accepted and come back as their expanded string form.

    Both ``'`` and ``?`` are accepted as the apostrophe (the data contains
    contractions written like ``don?t``). Because ``?`` is also ordinary
    punctuation, every rule is anchored so it only fires on a real word:

    * the specific rules (``i'm``, ``let's``, ...) must not have an ASCII
      letter directly before or after them, so ``asdfklhli'maksdjhfl`` is
      left alone;
    * the general suffix rules (``'s``, ``'re``, ...) must follow an ASCII
      letter and must not be followed by one, so ``really?so`` is left alone.

    ASCII-letter lookarounds are used instead of ``\\b`` because ``\\b``
    treats CJK characters as word characters and would stop matching when
    a contraction is written directly next to Chinese text.

    Args:
        text: value to expand; converted with ``str()``.

    Returns:
        str: the text with the known contractions expanded.
    """
    text = str(text)

    not_after_letter = r'(?<![a-zA-Z])'
    after_letter = r'(?<=[a-zA-Z])'
    not_before_letter = r'(?![a-zA-Z])'
    apostrophe = r"['?]"

    rules = (
        # specific: whole words with an irregular expansion, tried first
        (not_after_letter + 'i' + apostrophe + 'm' + not_before_letter, 'i am'),
        (not_after_letter + 'let' + apostrophe + 's' + not_before_letter, 'let us'),
        (not_after_letter + 'don' + apostrophe + 't' + not_before_letter, 'do not'),
        (not_after_letter + 'can' + apostrophe + 't' + not_before_letter, 'can not'),
        (not_after_letter + 'won' + apostrophe + 't' + not_before_letter, 'will not'),
        # general: suffixes attached to the end of any word
        (after_letter + apostrophe + 's' + not_before_letter, ' is'),
        (after_letter + apostrophe + 're' + not_before_letter, ' are'),
        (after_letter + apostrophe + 'll' + not_before_letter, ' will'),
        (after_letter + apostrophe + 'd' + not_before_letter, ' would'),
        (after_letter + apostrophe + 've' + not_before_letter, ' have'),
        # the leading 'n' belongs to the contraction itself (did + n't)
        ('n' + apostrophe + 't' + not_before_letter, ' not'),
    )

    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text


# for testing only: print two sample rows before and after expansion
for sample_label in (24, 110):
    print(df.content[sample_label])
    print(expandContract(df.content[sample_label]))


In [None]:
# const for controlling the level of data cleansing
# Each flag switches one preprocessing step on or off.

# read by dataCleansing(): run expandContract() over every row
EXPAND_CONTRACT = True

# read by dataCleansing(): regex substitutions, applied in this order
REPLACE_HYPERLINK = True        # http(s) URLs -> ' link '
REPLACE_EMAIL_ADDRESS = True    # e-mail addresses -> ' email '
REPLACE_CURRENCY_SIGN = True    # $ € £ ¥ -> ' money '
REPLACE_NUMBER = True           # digit runs -> ' number '
REPLACE_SPECIAL_CHAR = True     # anything outside a-z, A-Z, 0-9, U+4E00-U+9FFF -> ' '
REPLACE_NEW_LINE = True         # \r and \n -> ' '
REPLACE_WHITE_SPACE = True      # collapse whitespace runs, trim both ends

# read by the NLTK cells that follow dataCleansing()
REMOVE_STOP_WORDS = True        # drop NLTK English stop words
STEM = True                     # PorterStemmer on every token
LEMMATIZE = True                # WordNetLemmatizer on every token (runs after stemming)


In [None]:
def dataCleansing(df, replaceWhiteSpace=True):
    """Normalise ``df.content`` in place according to the module-level flags.

    Steps run in a fixed order, each one only if its flag is truthy:
    contraction expansion, then the regex substitutions for hyperlinks,
    e-mail addresses, currency signs, numbers, special characters, new
    lines, and finally whitespace collapsing/trimming.

    Args:
        df: frame with a ``content`` column; the column is reassigned.
        replaceWhiteSpace: when False, the whitespace step is skipped even
            if ``REPLACE_WHITE_SPACE`` is on.

    Returns:
        None. ``df`` is modified in place.
    """
    def substitute(pattern, replacement):
        # one full pass over the column per enabled step
        df.content = df.content.map(
            lambda row: re.sub(pattern, replacement, row))

    # NOTE(review): this runs before the hyperlink step and expandContract
    # accepts '?' as an apostrophe, so a URL query string may be altered
    # before the link pattern sees it - confirm this ordering is intended.
    if EXPAND_CONTRACT:
        df.content = df.content.map(expandContract)

    collapse_whitespace = REPLACE_WHITE_SPACE and replaceWhiteSpace

    regex_steps = (
        (REPLACE_HYPERLINK, r'http[s]?:\/\/[\w\/.?=-]+', ' link '),
        (REPLACE_EMAIL_ADDRESS, r'[\w\.+]+@[\w\.]+\.[a-z]{2,}', ' email '),
        (REPLACE_CURRENCY_SIGN, r'[\$€£¥]', ' money '),
        (REPLACE_NUMBER, r'[\d]+', ' number '),
        (REPLACE_SPECIAL_CHAR, r'[^a-zA-Z0-9\u4E00-\u9FFF]+', ' '),
        (REPLACE_NEW_LINE, r'[\r\n]', ' '),
        # squeeze runs of whitespace, then strip leading/trailing whitespace
        (collapse_whitespace, r'[\s]{2,}', ' '),
        (collapse_whitespace, r'^[\s]+|[\s]+$', ''),
    )

    for enabled, pattern, replacement in regex_steps:
        if enabled:
            substitute(pattern, replacement)


# clean the merged frame; dataCleansing reassigns df.content and returns nothing
dataCleansing(df)


In [None]:
import nltk

if REMOVE_STOP_WORDS:
    # Import under an alias so the corpus reader is not overwritten by the
    # word list assigned to `stopwords` below.
    from nltk.corpus import stopwords as stopwords_corpus

    nltk.download('stopwords')

    # `stopwords` stays a plain list, as before; the frozenset is used only
    # for the per-word membership test, which is constant-time instead of a
    # scan over the whole list.
    stopwords = stopwords_corpus.words('english')
    stopword_set = frozenset(stopwords)

    # drop every whitespace-separated token that is an English stop word
    df.content = df.content.map(
        lambda row: ' '.join(
            word for word in row.split() if word not in stopword_set))


In [None]:
if STEM:
    from nltk.stem import PorterStemmer

    nltk.download('punkt')

    stemmer = PorterStemmer()

    def stemRow(row):
        """Stem every whitespace-separated token of ``row`` and re-join with spaces."""
        return ' '.join(map(stemmer.stem, row.split()))

    df.content = df.content.map(stemRow)


In [None]:
if LEMMATIZE:
    from nltk.stem import WordNetLemmatizer

    nltk.download('wordnet')

    lemmatizer = WordNetLemmatizer()

    def lemmatizeRow(row):
        """Lemmatize every whitespace-separated token of ``row`` and re-join with spaces."""
        return ' '.join(map(lemmatizer.lemmatize, row.split()))

    df.content = df.content.map(lemmatizeRow)


In [None]:
from pathlib import Path, PurePath


def transFileToTc(sc_path, force=False):
    """Write a converted, lower-cased, de-duplicated copy of a text file.

    The copy is placed next to the source as ``<stem>_tc<suffix>``. Its
    content is the source passed through ``scToTc`` and lower-cased, with
    repeated lines removed (the first occurrence of each line is kept and
    the original line order is preserved).

    Args:
        sc_path: path of the source file.
        force: regenerate the copy even if it already exists.

    Returns:
        str: absolute path of the ``_tc`` file, whether or not it was
        rewritten by this call.
    """
    source = Path(sc_path)
    tc_path = source.parent.absolute() / (source.stem + '_tc' + source.suffix)

    if force or not tc_path.is_file():
        # `with` closes both handles deterministically, so the output is
        # fully flushed to disk before the caller opens it.
        with open(sc_path, 'r', encoding=UTF_8) as sc_file:
            tc_content = scToTc(sc_file.read()).lower()

        # dict.fromkeys drops repeated lines while keeping first-seen order
        tc_lines = list(dict.fromkeys(tc_content.split('\n')))

        with open(tc_path, 'w', encoding=UTF_8) as tc_file:
            tc_file.write('\n'.join(tc_lines))

    return str(tc_path)


# force=True: the converted dictionary is regenerated on every run, even if it exists
dict_big_tc = transFileToTc('./jieba/dict_big.txt', True)


In [None]:
import jieba

# register the converted dictionary produced by transFileToTc
jieba.load_userdict(dict_big_tc)
# jieba.load_userdict('./jieba/dict_custom.txt')


def segmentRow(row):
    """Join jieba's tokens for ``row`` with single spaces, collapsing whitespace runs."""
    spaced = ' '.join(jieba.cut(row))
    return re.sub(r'[\s]{2,}', ' ', spaced)


df.content = df.content.map(segmentRow)


In [None]:
set_03_dir = '../dataset/set_03/'
# glob() yields entries in an OS-dependent order; sorting keeps the row order
# of the merged frame (and therefore the seeded train/test split) reproducible.
# Directories are skipped because read_csv cannot open them.
set_03_files = sorted(
    path for path in Path(set_03_dir).glob('*') if path.is_file())

# placeholder tokens found in set_03 and what they are replaced with;
# order matters: 'CELLPHONE' must be handled before its substring 'PHONE'
set_03_placeholders = (
    ('URL', ' link '),
    ('HOTLINE', ' number '),
    ('CELLPHONE', ' number '),
    ('PHONE', ' number '),
    ('DIGIT', ' number '),
    ('NAME', ' '),
    ('PLACE', ' '),
)

# collect the per-file frames and concatenate once after the loop, instead
# of re-copying the growing frame on every iteration
set_03_frames = []

for file in set_03_files:
    df3 = pd.read_csv(file, header=None, encoding=UTF_8)
    # every row of set_03 is labelled spam
    df3[1] = 1

    # define header
    df3 = df3.rename(columns={0: 'content', 1: 'spam'})
    # reorder columns
    df3 = df3.reindex(columns=['spam', 'content'])

    # since df3 is well processed, special handling for some keywords
    for placeholder, token in set_03_placeholders:
        df3.content = df3.content.map(
            lambda row, old=placeholder, new=token: row.replace(old, new))

    # transform content to lower case before any further process
    df3.content = df3.content.str.lower()

    # transform to tc
    df3.content = df3.content.map(scToTc)

    dataCleansing(df3)

    # keep only rows whose content contains at least one whitespace character
    df3 = df3[df3.content.str.contains(r'[\s]+')]

    set_03_frames.append(df3)

df = pd.concat([df, *set_03_frames], ignore_index=True)


In [None]:
# drop unusable rows: first those with a missing value, then exact duplicates
df = df.dropna().drop_duplicates()

print(df)


In [None]:
# save the merged frame; index=False leaves the index column out of the file
# NOTE(review): header=None is presumably meant to omit the header row,
# matching how set_01 is read back with header=None - confirm.
df.to_csv('../dataset/set_01_02_03_new.csv',
          header=None, index=False, encoding=UTF_8)


In [None]:
from sklearn.model_selection import train_test_split

# current train/test split; filled in by useVectorizer()
X_train = X_test = y_train = y_test = None


def useVectorizer(vectorizer):
    """Vectorize ``df.content`` and store an 80/20 split in the module globals.

    The vectorizer is fitted on the whole ``content`` column and the result
    is converted to a dense array. ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` are overwritten; shapes and the training matrix are printed.

    Args:
        vectorizer: object providing ``fit_transform`` whose result has
            ``toarray()``.
    """
    global X_train, X_test, y_train, y_test

    features = vectorizer.fit_transform(df.content).toarray()
    labels = df.spam

    for part in (features, labels):
        print(part.shape)

    # fixed random_state so the split is the same on every run
    split = train_test_split(
        features, labels, test_size=0.20, random_state=0)
    X_train, X_test, y_train, y_test = split

    print(X_train)
    print(X_train.shape)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score


def useClassifier(classifier):
    """Fit ``classifier`` on the current training split and print its test accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        classifier: estimator providing ``fit`` and ``predict``.
    """
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)
    print(accuracy_score(y_test, predictions))


def useAllClassifier():
    """Run ``useClassifier`` once for each estimator under comparison.

    One accuracy value is printed per estimator, in the order listed below.
    """
    # NOTE(review): DecisionTreeRegressor and KMeans are not classifiers -
    # confirm that scoring them with accuracy_score is intended.
    candidates = (
        AdaBoostClassifier(n_estimators=100, random_state=0),
        DecisionTreeClassifier(random_state=0),
        DecisionTreeRegressor(random_state=0),
        GaussianNB(),
        GradientBoostingClassifier(
            n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
        KMeans(n_clusters=2, random_state=0),
        KNeighborsClassifier(n_neighbors=3),
        LogisticRegression(random_state=0),
        MultinomialNB(),
        RandomForestClassifier(max_depth=2, random_state=0),
        SGDClassifier(max_iter=1000, tol=1e-3),
    )

    for candidate in candidates:
        useClassifier(candidate)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# bag-of-words counts: the only vectorizer that is actually run
useVectorizer(CountVectorizer())
useAllClassifier()

# not run, will error
if False:
    for extra_vectorizer in (TfidfVectorizer(),
                             HashingVectorizer(n_features=2**4)):
        print()

        useVectorizer(extra_vectorizer)
        useAllClassifier()


In [None]:
# train multinomial naive Bayes on the current split and print its test accuracy
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
