In [None]:
#supposed the input is in df, the data of X_test=['อย่า ลืม ติดตาม ชม รายการ พิเศษ','ด่วน ลด แบบ จัดเต็ม ของแถม มากมาย']
#keras embed method, will update with the tokenizer and weight file

In [49]:
import re
import pandas as pd
from pythainlp.corpus import thai_stopwords
from pythainlp.tokenize import word_tokenize
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences



In [44]:
def load_model(file):
    with open(file, "r") as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)
    model.load_weights("./org_model_weights.h5")
    return model

def preproces_df(csv):
    #preprocessing the data from df 
    df_a = pd.read_csv(csv)
    df_a['split_text'] = df_a.apply(lambda row: word_tokenize(row['tweet_text'],engine="newmm",keep_whitespace=False), axis=1) #use this
    df_a['combined'] = [' '.join(lst) for lst in df_a['split_text']]
    df_a['cleaned'] = df_a['combined'].apply(deEmojify)
    df_a['cleaned'] = df_a['cleaned'].apply(stopwords_rm) 
    df_a['split_cleaned'] = df_a.apply(lambda row: word_tokenize(row['cleaned'],engine="newmm",keep_whitespace=False), axis=1)
    df_a['word_length'] = df_a['split_cleaned'].str.len()
    return df_a

#loaded_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#sub function below---------------------------------------------------------------------------------------------------
def stopwords_rm(text):
    stopwords = set(thai_stopwords())
    stopwords.update(["nan", "-", "_", "", " ", "฿" ,"ค่ะ", "ครับ", "จ้า"])  # Add more stopwords as needed

    # Remove stopwords from the text
    cleaned_text = ' '.join(word for word in text.split() if word not in stopwords)
    return cleaned_text

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

# Load model and weight
def Main(df):
    model = load_model("./org_model_architecture.json")
    df = preproces_df(df)
    print('predicting...')
    MAX_SEQUENCE_LENGTH = 63 #training file 44
    MAX_WORDS = 2500 #2000

    tokenizer = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
    tokenizer.fit_on_texts(df.cleaned.values)

    X_test=[df['cleaned'].iloc[-1]]
    X_test=tokenizer.texts_to_sequences(X_test)
    X_test=pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(X_test)
    org = ['public work (โยธา)', 'municipal office (เทศกิจ)' ,'police department']

    print('result:', predictions)
    print('classs predicted: ', org[int(predictions.argmax(axis=-1))])
    print('confidence %: ', predictions[0][[int(predictions.argmax(axis=-1))]])
    #for checking from ori excel
    #print('real cat is : ',df['org'].iloc[-1])
    if 'org' not in df.index:
        df.loc['org'] = 0 

    df['org'].iloc[-1] = org[int(predictions.argmax(axis=-1))]
    return df


def Main_relevant(df):
    model = load_model("./relevant_model_architecture.json")
    df = preproces_df(df) #df['tweet_text'] เป็นstring ของ tweet 
    print('predicting...')

    MAX_SEQUENCE_LENGTH = 63 #same as training file
    MAX_WORDS = 3000

    tokenizer = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
    tokenizer.fit_on_texts(df.cleaned.values)

    X_test=[df['cleaned'].iloc[-1]]
    X_test=tokenizer.texts_to_sequences(X_test)
    X_test=pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(X_test)
    rev = ['not relevant','relevant']

    print('result:', predictions)
    print('classs predicted: ', rev[int(predictions.argmax(axis=-1))])
    print('confidence %: ', predictions[0][[int(predictions.argmax(axis=-1))]])
    #for checking
    print('real cat is : ',df['rev'].iloc[-1])

    df_drop = df #if not change then it is relevant

    if int(predictions.argmax(axis=-1))== 0:
        print('not relevant')
        df_drop = df[-1] #store the dropped df
        df = df[:-1] #drop the last one
    return df, df_drop


In [45]:
df_updated, df_drop = Main_relevant("./TestNLP_labeled_lstm.csv") #input the df location
if df_drop[-1] == df_updated[-1]: #relevant
    df = Main(df_updated)
    print('relevant, updated')
    print('org is : ', )
else: 
    print('irrelevant, row deleted')
    print('deleted row : ', df_drop)

predicting...
result: [[0.11716956 0.44871703 0.43411338]]
classs predicted:  municipal office (เทศกิจ)
confidence %:  [0.44871703]
real cat is :  nan


In [48]:
df.iloc[-1]

index                                                              1004
username                                                    AitKanphong
tweet_id                                            1701505503111819503
tweet_text            เหล่าสิงห์มอเตอร์ไซด์สายเท่ทุกท่านครับ กทม จับ...
org                                                                 NaN
relevant                                                          False
category                                                        ทางเท้า
query                 ("ทางเท้า" OR "ทางเดิน" OR "ฟุตบาท") AND ("กรุ...
datetime_of_tweet                                    2023-09-12 7:58:19
datetime_of_query                                   2023-11-16 14:52:12
link                  https://twitter.com/AitKanphong/status/1701505...
mentioned_location                                              มีนบุรี
location                                                            NaN
image                 ['https://pbs.twimg.com/media/F5z1289bgAAz

Note

In [None]:
X_test=['อย่า ลืม ติดตาม ชม รายการ พิเศษ','ด่วน ลด แบบ จัดเต็ม ของแถม มากมาย']
X_test=tokenizer.texts_to_sequences(X_test)
X_test=pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
model.predict(X_test)