In [None]:
import re
import contractions
import jieba
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from zhconv import convert
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import load_model

# Download the NLTK resources named below, one after another.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)

# Register the entries of the bundled dictionary file with jieba before any
# segmentation is performed.
jieba.load_userdict('./jieba/dict_big.txt')


def scToTc(text):
    """Return ``text`` converted by zhconv using the ``'zh-tw'`` locale.

    Judging by the function name this is the Simplified-to-Traditional
    Chinese step of the pipeline; the actual mapping is entirely delegated
    to ``zhconv.convert``.

    Args:
        text: The string to convert.

    Returns:
        The converted string.
    """
    return convert(text, 'zh-tw')


def expandContraction(text):
    """Expand English contractions in ``text``.

    A fixed, ordered list of regex rewrites is applied first and the result
    is then handed to ``contractions.fix`` for anything the rules missed.

    Notes:
        * Every rule accepts either an apostrophe or a literal ``?`` in the
          apostrophe position (presumably to catch apostrophes that were
          mangled into ``?`` by an encoding round-trip -- confirm).
        * The patterns are lowercase and applied without flags, so they are
          case-sensitive, and they carry no word-boundary anchors, so they
          also match inside longer tokens.

    Args:
        text: The string to rewrite.

    Returns:
        The string with contractions expanded.
    """
    # Order matters: the specific idioms must be rewritten before the generic
    # suffix rules get a chance to split them differently.
    rewrite_rules = (
        # specific
        (r'i[\'?]m', 'i am'),
        (r'let[\'?]s', 'let us'),
        (r'don[\'?]t', 'do not'),
        (r'can[\'?]t', 'can not'),
        (r'won[\'?]t', 'will not'),
        # general
        (r'[\'?]s', ' is'),
        (r'[\'?]re', ' are'),
        (r'[\'?]ll', ' will'),
        (r'[\'?]d', ' would'),
        (r'[\'?]ve', ' have'),
        (r'n[\'?]t', ' not'),
    )
    for pattern, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text)

    # library
    return contractions.fix(text)


def cleanData(text):
    """Normalise one message into space-separated tokens.

    Steps, in order: expand contractions, then replace URLs, e-mail
    addresses, currency signs and digit runs with the placeholder words
    ``link``, ``email``, ``money`` and ``number``; finally turn every run of
    characters outside ``a-z``, ``A-Z``, ``0-9`` and U+4E00-U+9FFF into a
    single space and trim the ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = expandContraction(text)

    # (pattern, replacement) pairs, applied strictly top to bottom.
    substitutions = (
        # hyperlink
        (r'http[s]?:\/\/[\w\/.?=-]+', ' link '),
        # email address
        (r'[\w\.+]+@[\w\.]+\.[a-z]{2,}', ' email '),
        # currency sign
        (r'[\$€£¥]', ' money '),
        # number
        (r'[\d]+', ' number '),
        # special char, other than a-z, A-Z, 0-9 and chinese
        (r'[^a-zA-Z0-9\u4E00-\u9FFF]+', ' '),
        # new line (carriage return and line feed)
        (r'[\r\n]', ' '),
        # repeated white space, then leading/trailing white space
        (r'[\s]{2,}', ' '),
        (r'^[\s]+|[\s]+$', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text


def stopWords(text, words):
    """Remove every whitespace-separated token of ``text`` found in ``words``.

    Args:
        text: The string to filter; it is split on whitespace.
        words: Collection of tokens to drop. Lists, tuples, sets and one-shot
            iterables are all accepted. A plain string keeps the substring
            semantics of the ``in`` operator, as before.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Freeze the collection once: membership becomes O(1) instead of a linear
    # scan per token, and a one-shot iterator is no longer exhausted by the
    # first lookup (which silently let later stop words through).
    if not isinstance(words, (str, set, frozenset)):
        words = frozenset(words)

    return ' '.join(word for word in text.split() if word not in words)


def stemming(text, stemmer):
    """Stem each whitespace-separated token of ``text``.

    Args:
        text: The string to process; it is split on whitespace.
        stemmer: Any object exposing ``stem(word)``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stems = map(stemmer.stem, text.split())

    return ' '.join(stems)


def lemmatization(text, lemmatizer):
    """Lemmatize each whitespace-separated token of ``text``.

    Args:
        text: The string to process; it is split on whitespace.
        lemmatizer: Any object exposing ``lemmatize(word)``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmas)


def segmentation(text):
    """Segment ``text`` with jieba and join the pieces with spaces.

    Args:
        text: The string to segment.

    Returns:
        Every piece yielded by ``jieba.cut``, separated by single spaces.
    """
    pieces = jieba.cut(text)

    return ' '.join(pieces)


In [None]:
# Five sample messages used to exercise the pipeline end to end; the last one
# is a multi-line string.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the kernel session; the preprocessing call further down refers to the list
# by this name.
input = ['[Netflix] : 無法處理你的自動付款。你的帳戶將被禁用。t.co/ntAkhpFWqR',
         '亲爱的CHANKWAN POK先生或女士，欢迎您加入“心享”计划 HPR”，您的会员编码为：111023621183，初始密码为：006653。恭喜您可尊享通过“中旅酒店心享会”公众号推荐朋友加入会员活动，被推荐的新会员还可以领取代金券礼包！详询400-669-0000（大陆地区）、852-36040000（港澳地区）。',
         '《天外》全新資料片登場!開放亞特蘭提斯城!推出英雄升變!英雄能力大解放! mo.alta.hk 查詢EN/取消UN81060822',
         '渣打香港: 您現正使用尾數為0376之信用卡在Pure International HK L進行一項網上交易，金額HKD 2,038.00;你的一次有效密碼為NGY-730596。',
         '''您好！我系橙橙
邀請您加细妹微信：76169639 睇朋友圈保有您喜歡嘅哦！
有國際大牌：L.V、愛馬仕、普拉達、迪奧、聖羅蘭、古奇、芬迪、巴寶莉、阿瑪尼、寶格麗、勞力士等（男女服裝、鞋子、包包、手錶、皮帶、圍巾、首飾等）第三方担保交易，驗貨滿意再签收，購物零風險
我要告訴您：我賣嘅系超A貨，我沒有去欺騙任何人，嗰啲揾我買嘅人，他們都很清楚''']

# Expected label of each sample above, in order: 1, 1, 0, 0, 1
# (the evaluation cell hard-codes the same list as `true`).
# NOTE(review): presumably 1 = spam/scam and 0 = legitimate -- confirm against
# the dataset's labelling convention.


def preprocess(input, remove_stop, stem, lemmatize):
    """Run the text-normalisation pipeline over a batch of messages.

    Each message is lowercased, passed through ``scToTc`` and ``cleanData``,
    optionally filtered / stemmed / lemmatized, segmented with
    ``segmentation`` and finally has repeated whitespace collapsed.

    Args:
        input: Iterable of raw message strings.
        remove_stop: When truthy, drop NLTK's English stop words.
        stem: When truthy, apply ``PorterStemmer`` to every token.
        lemmatize: When truthy, apply ``WordNetLemmatizer`` to every token.

    Returns:
        A list with one processed string per input message, in input order.
    """
    # Build each NLTK helper only when its stage is enabled, so a disabled
    # stage never touches NLTK data. The stop words are frozen into a set
    # once so the per-token membership test is O(1) rather than a list scan.
    words = frozenset(stopwords.words('english')) if remove_stop else None
    stemmer = PorterStemmer() if stem else None
    lemmatizer = WordNetLemmatizer() if lemmatize else None

    output = []

    for text in input:
        text = text.lower()
        text = scToTc(text)
        text = cleanData(text)

        if remove_stop:
            text = stopWords(text, words)

        if stem:
            text = stemming(text, stemmer)

        if lemmatize:
            text = lemmatization(text, lemmatizer)

        text = segmentation(text)
        # Joining the segmenter's pieces with spaces can leave runs of
        # whitespace; squeeze each run down to a single space.
        text = re.sub(r'[\s]{2,}', ' ', text)

        output.append(text)

    return output


# Run the full pipeline on the sample messages with every optional stage on.
output = preprocess(input, remove_stop=True, stem=True, lemmatize=True)

print(output)


In [None]:
def load_data(filename):
    """Read a header-less two-column CSV and return its text column.

    Rows containing any missing value are dropped, the two columns are named
    ``label`` and ``data``, and only ``data`` is returned.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The ``data`` column as a ``pandas.Series``; the index keeps the
        original row positions of the surviving rows.
    """
    frame = pd.read_csv(filename, header=None, encoding='utf-8')
    frame = frame.dropna()
    frame.columns = ['label', 'data']

    return frame['data']


def featureExtraction(input, dataset='../dataset/set_01_02_03_04_0_0_0_new.csv'):
    """Turn preprocessed messages into dense TF-IDF feature vectors.

    A fresh ``TfidfVectorizer`` is fitted on the text column of ``dataset``
    on every call, its vocabulary is printed, and ``input`` is then
    transformed with it.

    Args:
        input: Iterable of preprocessed message strings to vectorise.
        dataset: CSV path (or file-like object) handed to ``load_data`` to
            obtain the fitting corpus. Defaults to the path that was
            previously hard-coded in the body, so existing one-argument
            calls behave as before.

    Returns:
        A dense 2-D array with one row per message in ``input``.

    NOTE(review): the vocabulary is rebuilt from ``dataset`` each time; it
    presumably has to be the corpus the downstream model was trained on --
    confirm before pointing this at a different file.
    """
    X = load_data(dataset)

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X)

    print(vectorizer.get_feature_names_out())

    # transform() only reads its argument, so the defensive copy the original
    # made is unnecessary; dropping it also lets iterables without a .copy()
    # method (e.g. tuples) be passed in.
    return vectorizer.transform(input).toarray()


# NOTE(review): `output` is rebound here from the list of preprocessed strings
# to the feature matrix, so re-running this cell on its own would feed the
# matrix back into featureExtraction.
output = featureExtraction(output)

print(output.shape, output, sep='\n')


In [None]:
# Reference labels for the five sample messages, in input order.
true = [1, 1, 0, 0, 1]

loaded_model = load_model('./best_model.h5')
loaded_model.summary()

pred = loaded_model.predict(output)
print(pred)

# Scores above 0.5 count as class 1 (assumes the model emits one score per
# message -- TODO confirm against the model's output layer).
score = accuracy_score(y_true=true, y_pred=pred > 0.5)
print(score)
