In [22]:
import pandas as pd
from urllib.parse import unquote
from time import time
starttime = time()

# Read data

In [23]:
df = pd.read_json('data-1.json',encoding='utf-8')

# Match similar category labels

In [24]:
CAT= {
    #category: change to this
    'ข่าวบันเทิง':"entertainment",
    'regional':'local',
    'sport':'sports',
    'ข่าวกีฬา':'sports',
    'ข่าวอาชญากรรม':'crime',
    'khrongkhaidaasapparod':'ข่าวโครงข่ายตาสับปะรด',
    'around-the-world-news':'foreign',
    'abroad':'foreign',
    'ข่าวต่างประเทศ':'foreign',
    'ข่าวไลฟ์สไตล์':'lifestyle',
    'analysis-today-politics':'politics',
    'politic':'politics',
    'ข่าวเศรษฐกิจ':'economics',
    'economy':'economics',
    'economic':'economics',
    'uncategorized':'others',
    'auto':'car-vehicle',
    'thai-soccer':'thaifootball',
    'ข่าวพระราชสำนัก':'royalnews',
    'it':'tech'
}

def conv_cate(category):
    result = CAT.get(category,-1)
    if result!=-1:
        return result
    else :
        #in case want to update dictionary
        #CAT[category] = category
        return category

# Undo URL encoding and match similar categories

In [25]:
df['category'] = df['category'].apply(unquote).apply(conv_cate)

# Choose Top 9 categories from data

In [26]:
import numpy as np
category = np.array(df['category'])
tags, freq = np.unique(category,return_counts=True)
freqList = list(zip(tags,freq))
freqList.sort(key=lambda elem:elem[1],reverse=True) #sort by frequency

#get category with lots of data examples
topCats = [category[0] for category in freqList[:9]]
topCats

['local',
 'foreign',
 'politics',
 'sports',
 'entertainment',
 'article',
 'economics',
 'crime',
 'eurofootball']

In [27]:
X = np.array(df[df.category.isin(topCats)]['news_content'])
y = np.array(df[df.category.isin(topCats)]['category'])

# Clean Text

In [28]:
import re

def text_prepare(text):
    text = re.sub('[^ก-๙‘’“”]', '', text)
    return text

In [29]:
X = np.array([text_prepare(body) for body in X])

# Split data into train and test sets

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Tokenize thai words

In [31]:
from pythainlp.tokenize import word_tokenize

In [32]:
#tokenize function
def Thaitokenize(text):
    """
    turn a Thai sentence into Thai words with meaning
    """
    #text = ''.join(text.split('"')) ##handled in regex??
    text = word_tokenize(text , engine = 'pyicu')
    return text

# Count Vectorizer

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
vec1 = CountVectorizer(tokenizer=Thaitokenize)
X_train_bow = vec1.fit_transform(X_train)
X_test_bow = vec1.transform(X_test)

# TFIDF Vectorizer

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec2 = TfidfVectorizer(tokenizer=Thaitokenize, max_df=0.9, min_df = 5)
X_train_tfidf = vec2.fit_transform(X_train)
X_test_tfidf = vec2.transform(X_test)
#print(vectorizer.get_feature_names())
#print(X_train_tfidf.shape)

# Train Model

In [35]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.sparse import csr_matrix

#BOW
clf1 = OneVsRestClassifier(GaussianNB()).fit(X_train_bow.toarray(), y_train)
y_pred_bow = clf1.predict(X_test_bow.toarray())

#Tfidf
clf2 = OneVsRestClassifier(GaussianNB()).fit(X_train_tfidf.toarray(), y_train)
y_pred_tfidf = clf2.predict(X_test_tfidf.toarray())

# Accuracy

In [36]:
from sklearn.metrics import confusion_matrix, accuracy_score

cm1 = confusion_matrix(y_test, y_pred_bow)
print(cm1)
print(accuracy_score(y_test, y_pred_bow))

cm2 = confusion_matrix(y_test, y_pred_tfidf)
print(cm2)
print(accuracy_score(y_test, y_pred_tfidf))

[[ 0  0  0  0  0  2  9  1 12]
 [ 0  1  0  0  0  0  5  0 14]
 [ 0  0  1  0  0  0  5  2 19]
 [ 0  0  0  4  0  0  2  1 21]
 [ 0  0  0  0  1  0  0  0 19]
 [ 0  0  0  0  0 43  2  1 37]
 [ 0  0  0  1  0  0 33  2 56]
 [ 0  0  0  0  0  1  6 26 32]
 [ 0  0  0  0  0  1  1  0 63]]
0.4056603773584906
[[ 0  0  0  1  0  3  5  3 12]
 [ 0  1  0  0  0  0  4  0 15]
 [ 0  0  0  0  0  1  2  2 22]
 [ 0  0  0  1  0  1  7  1 18]
 [ 0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0 45  2  0 36]
 [ 0  0  0  0  0  0 29  3 60]
 [ 0  0  0  0  0  1  4 21 39]
 [ 0  0  0  0  0  0  0  1 64]]
0.37971698113207547


In [37]:
print('Execution time: ',(time()-starttime), 's')

Execution time:  20.38533878326416 s


# Predict category with a single sentence

In [38]:
test = text_prepare('“อนุทิน” ขอ ผู้ร้อง เปิดชื่อ อักษรย่อ \"พ\" คนหักหัวคิวโรงแรม 40%')

#BOW
test1 = vec1.transform([test])
print(clf1.predict(test1))

#TFIDF
test2 = vec2.transform([test])
print(clf2.predict(test2))

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

# Observe relationship of amount of data supplied and correctly predicted

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

In [None]:
plot_confusion_matrix(clf1, X_test_bow, y_test)

In [None]:
plot_confusion_matrix(clf2, X_test_tfidf, y_test)

In [None]:
tags, freq = np.unique(y_train,return_counts=True)
list(zip(tags,freq))