In [1]:
import pandas as pd
from urllib.parse import unquote
from time import time
# Wall-clock start of the run; the "Execution time" cell further down
# subtracts this from a fresh time() reading.
starttime = time()

# Read data

In [2]:
# Load the news dataset into a DataFrame. Later cells read its
# 'category' and 'news_content' columns.
df = pd.read_json('data-1.json',encoding='utf-8')

# Match similar category labels

In [3]:
# Lookup table used by conv_cate to merge category labels that are spelled
# differently in the data (Thai vs. English, singular vs. plural, slugs).
CAT= {
    # raw category label (key) -> label it is replaced with (value)
    'ข่าวบันเทิง':"entertainment",
    'regional':'local',
    'sport':'sports',
    'ข่าวกีฬา':'sports',
    'ข่าวอาชญากรรม':'crime',
    # NOTE(review): unlike every other entry, this one maps a romanized slug
    # to a Thai label rather than to an English one; confirm this is intended.
    'khrongkhaidaasapparod':'ข่าวโครงข่ายตาสับปะรด',
    'around-the-world-news':'foreign',
    'abroad':'foreign',
    'ข่าวต่างประเทศ':'foreign',
    'ข่าวไลฟ์สไตล์':'lifestyle',
    'analysis-today-politics':'politics',
    'politic':'politics',
    'ข่าวเศรษฐกิจ':'economics',
    'economy':'economics',
    'economic':'economics',
    'uncategorized':'others',
    'auto':'car-vehicle',
    'thai-soccer':'thaifootball',
    'ข่าวพระราชสำนัก':'royalnews',
    'it':'tech'
}

def conv_cate(category):
    """Return the normalized name for a raw category label.

    Labels listed in CAT are replaced by their mapped value; a label that
    is not listed there is handed back unchanged.
    """
    # To start collecting unseen labels in the table, store
    # `CAT[category] = category` for misses before returning.
    return CAT.get(category, category)

# Undo URL encoding and match similar categories

In [4]:
# URL-decode each raw label, then fold it into its normalized category name.
df['category'] = df['category'].apply(lambda raw: conv_cate(unquote(raw)))

# Choose the 9 most frequent categories in the data

In [5]:
import numpy as np
# Count how many rows carry each category label.
category = np.array(df['category'])
tags, freq = np.unique(category, return_counts=True)

# Pair every label with its count and order the pairs by count, largest first.
freqList = sorted(zip(tags, freq), key=lambda pair: pair[1], reverse=True)

# Keep only the labels of the nine categories with the most examples.
topCats = [label for label, _count in freqList[:9]]
topCats

['local',
 'foreign',
 'politics',
 'sports',
 'entertainment',
 'article',
 'economics',
 'crime',
 'eurofootball']

In [6]:
# Restrict the data to rows whose category is one of the top categories,
# then pull out the article bodies (features) and their labels (targets).
top_rows = df[df.category.isin(topCats)]
X = np.array(top_rows['news_content'])
y = np.array(top_rows['category'])

# Clean Text

In [7]:
import re

def text_prepare(text):
    """Drop every character outside the ก-๙ range, keeping curly quotes.

    Everything else is removed, including spaces, Latin letters, ASCII
    digits and punctuation; the surviving characters are returned as one
    string.
    """
    unwanted = re.compile('[^ก-๙‘’“”]')
    return unwanted.sub('', text)

In [8]:
# Run every article body through text_prepare and keep the results as an array.
X = np.array(list(map(text_prepare, X)))

# Split data into train and test sets

In [9]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the samples for testing; the fixed random_state
# keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Tokenize Thai words

In [10]:
from pythainlp.tokenize import word_tokenize

In [11]:
#tokenize function
def Thaitokenize(text):
    """
    Split Thai text into word tokens.

    Delegates to pythainlp's word_tokenize using the 'pyicu' engine and
    returns whatever that call produces.
    """
    # Double quotes are not stripped here; text_prepare's regex presumably
    # removed them already. TODO confirm for callers that skip text_prepare.
    return word_tokenize(text, engine='pyicu')

# Count Vectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: token counts, with tokens produced by Thaitokenize.
vec1 = CountVectorizer(tokenizer=Thaitokenize)
# The vectorizer is fitted on the training split only ...
X_train_bow = vec1.fit_transform(X_train)
# ... and the already-fitted vectorizer is then applied to the test split.
X_test_bow = vec1.transform(X_test)

# TF-IDF Vectorizer

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features using the same tokenizer as the bag-of-words vectorizer.
# max_df=0.9 and min_df=5 are scikit-learn's document-frequency cut-offs
# (upper bound as a proportion, lower bound as a document count).
vec2 = TfidfVectorizer(tokenizer=Thaitokenize, max_df=0.9, min_df = 5)
# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_tfidf = vec2.fit_transform(X_train)
X_test_tfidf = vec2.transform(X_test)
# Debug helpers (disabled): inspect the learned vocabulary and matrix shape.
#print(vec2.get_feature_names())
#print(X_train_tfidf.shape)

# Train models and measure accuracy

In [14]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

def _fit_and_score(n_trees, train_features, test_features):
    """Fit a one-vs-rest random forest and score it on the test split.

    Builds a RandomForestClassifier with `n_trees` estimators (entropy
    criterion, random_state 0), wraps it in OneVsRestClassifier, fits it on
    `train_features` / `y_train`, and predicts `test_features`.

    Returns a tuple of (fitted classifier, predictions, accuracy against y_test).
    """
    forest = RandomForestClassifier(n_estimators=n_trees,
                                    criterion='entropy',
                                    random_state=0)
    model = OneVsRestClassifier(forest).fit(train_features, y_train)
    predictions = model.predict(test_features)
    return model, predictions, accuracy_score(y_test, predictions)


# Test accuracy per forest size, one entry per loop iteration.
bowScores = []
tfidfScores = []

# Sweep the number of trees from 1 to 99 and evaluate both feature sets.
for i in range(1, 100):
    print("Iteration: ", i)

    # Bag-of-words features
    clf1, y_pred_bow, score = _fit_and_score(i, X_train_bow, X_test_bow)
    print("BOW: ", score)
    bowScores.append(score)

    # TF-IDF features
    clf2, y_pred_tfidf, score = _fit_and_score(i, X_train_tfidf, X_test_tfidf)
    print("TFIDF: ", score)
    tfidfScores.append(score)

Iteration:  1
BOW:  0.375
TFIDF:  0.37264150943396224
Iteration:  2
BOW:  0.46226415094339623
TFIDF:  0.5023584905660378
Iteration:  3
BOW:  0.5188679245283019
TFIDF:  0.5047169811320755
Iteration:  4
BOW:  0.5707547169811321
TFIDF:  0.5448113207547169
Iteration:  5
BOW:  0.5849056603773585
TFIDF:  0.5754716981132075
Iteration:  6
BOW:  0.5990566037735849
TFIDF:  0.5990566037735849
Iteration:  7
BOW:  0.6084905660377359
TFIDF:  0.6202830188679245
Iteration:  8
BOW:  0.6391509433962265
TFIDF:  0.6367924528301887
Iteration:  9
BOW:  0.6556603773584906
TFIDF:  0.6320754716981132
Iteration:  10
BOW:  0.6650943396226415
TFIDF:  0.6438679245283019
Iteration:  11
BOW:  0.6698113207547169
TFIDF:  0.6391509433962265
Iteration:  12
BOW:  0.6816037735849056
TFIDF:  0.6438679245283019
Iteration:  13
BOW:  0.6839622641509434
TFIDF:  0.6674528301886793
Iteration:  14
BOW:  0.6981132075471698
TFIDF:  0.6674528301886793
Iteration:  15
BOW:  0.6886792452830188
TFIDF:  0.6745283018867925
Iteration:  16


In [15]:
# Report the seconds elapsed since `starttime` was recorded.
elapsed = time() - starttime
print('Execution time: ', elapsed, 's')

Execution time:  666.8799669742584 s


In [None]:
# Plot bag-of-words test accuracy against the number of trees.
# `plt` is the conventional alias for the matplotlib.pyplot module itself;
# it is not a name that can be imported from inside that module.
import matplotlib.pyplot as plt

# Derive the x-axis from the scores actually recorded, so the plot still works
# when the training loop was stopped before completing all of its iterations.
# For a complete run this is the same range(1, 100) as before.
tree_counts = range(1, len(bowScores) + 1)
plt.plot(tree_counts, bowScores)
plt.xlabel('n_estimators')
plt.ylabel('test accuracy (BOW)')
plt.show()

# Predict category with a single sentence

In [None]:
# Clean the sample headline the same way the training text was cleaned.
test = text_prepare('“อนุทิน” ขอ ผู้ร้อง เปิดชื่อ อักษรย่อ \"พ\" คนหักหัวคิวโรงแรม 40%')

# Vectorize the sample with each fitted vectorizer (BOW, then TFIDF).
test1 = vec1.transform([test])
test2 = vec2.transform([test])

# clf1 / clf2 are whatever classifiers the training loop assigned last.
print(clf1.predict(test1))  # BOW
print(clf2.predict(test2))  # TFIDF