In [1]:
import sklearn
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [2]:
X_train = pickle.load(open('../dataset/processed-data/X_train.pkl', 'rb'))
y_train = pickle.load(open('../dataset/processed-data/y_train.pkl', 'rb'))

X_test = pickle.load(open('../dataset/processed-data/X_test.pkl', 'rb'))
y_test = pickle.load(open('../dataset/processed-data/y_test.pkl', 'rb'))

In [3]:
# Tf-Idf Vectors as features

# Word level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
tfidf_vect.fit(X_train)
X_train_tfidf = tfidf_vect.transform(X_train)
# Assume that we don't have test set before
X_test_tfidf = tfidf_vect.transform(X_test)

# N-gram level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(2, 3))
tfidf_vect_ngram.fit(X_train)
X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
# Assume that we don't have test set before
X_test_tfidf_ngram =  tfidf_vect_ngram.transform(X_test)

# N-gram-char level - we choose max number of words equal to 30000 except all words (100k+ words)
tfidf_vect_ngram_char = TfidfVectorizer(analyzer='char', max_features=30000, ngram_range=(2, 3))
tfidf_vect_ngram_char.fit(X_train)
X_train_tfidf_ngram_char =  tfidf_vect_ngram_char.transform(X_train)
# Assume that we don't have test set before
X_test_tfidf_ngram_char =  tfidf_vect_ngram_char.transform(X_test)

In [4]:
# Singular value decomposition
from sklearn.decomposition import TruncatedSVD

# Word level
svd = TruncatedSVD(n_components=300, random_state=42)
svd.fit(X_train_tfidf)

X_train_tfidf_svd = svd.transform(X_train_tfidf)
X_test_tfidf_svd = svd.transform(X_test_tfidf)

# N-gram level
svd_ngram = TruncatedSVD(n_components=300, random_state=42)
svd_ngram.fit(X_train_tfidf_ngram)

X_train_tfidf_ngram_svd = svd_ngram.transform(X_train_tfidf_ngram)
X_test_tfidf_ngram_svd = svd_ngram.transform(X_test_tfidf_ngram)

# N-gram Char level
svd_ngram_char = TruncatedSVD(n_components=300, random_state=42)
svd_ngram_char.fit(X_train_tfidf_ngram_char)

X_train_tfidf_ngram_char_svd = svd_ngram_char.transform(X_train_tfidf_ngram_char)
X_test_tfidf_ngram_char_svd = svd_ngram_char.transform(X_test_tfidf_ngram_char)

In [4]:
encoder = LabelEncoder()
y_train_n = encoder.fit_transform(y_train)
y_test_n = encoder.fit_transform(y_test)

encoder.classes_

array(['doi-song', 'du-lich', 'giai-tri', 'giao-duc', 'khoa-hoc',
       'kinh-doanh', 'phap-luat', 'suc-khoe', 'the-gioi', 'the-thao',
       'thoi-su'], dtype='<U10')

In [5]:
def train(classifier, X_train, y_train, X_test, y_test, is_neuralnet=False, n_epochs=3):       
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
    
    if is_neuralnet:
        classifier.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=n_epochs, batch_size=512)
        val_predictions = classifier.predict(X_val)
        test_predictions = classifier.predict(X_test)
        val_predictions = val_predictions.argmax(axis=-1)
        test_predictions = test_predictions.argmax(axis=-1)
    else:
        classifier.fit(X_train, y_train)
        train_predictions = classifier.predict(X_train)
        val_predictions = classifier.predict(X_val)
        test_predictions = classifier.predict(X_test)
    
    print("Train accuracy: ", accuracy_score(train_predictions, y_train))
    print("Validation accuracy: ", accuracy_score(val_predictions, y_val))
    print("Test accuracy: ", accuracy_score(test_predictions, y_test))

## Naive Bayes Classifier
- Multinomal Naive Bayes
- Bernoulli Naive Bayes

In [6]:
train(MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf, y_test)

Train accuracy:  0.9326607226654153
Validation accuracy:  0.8924050632911392
Test accuracy:  0.8788177339901477


In [8]:
train(BernoulliNB(), X_train_tfidf_svd, y_train, X_test_tfidf_svd, y_test)

Train accuracy:  0.8137445281901063
Validation accuracy:  0.8080568720379147
Test accuracy:  0.8149008397355726


## Linear Classifier
- Logistic Regression

In [9]:
train(LogisticRegression(), X_train_tfidf, y_train, X_test_tfidf, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train accuracy:  0.9557976500016456
Validation accuracy:  0.9161729857819905
Test accuracy:  0.9236892779862228


## Support Vector Machine (SVM) Classifier

In [10]:
train(SVC(), X_train_tfidf_svd, y_train, X_test_tfidf_svd, y_test)

Train accuracy:  0.9573116545436593
Validation accuracy:  0.9203199052132701
Test accuracy:  0.9186865979790761


## Random Forest Classifier 

In [11]:
train(RandomForestClassifier(), X_train_tfidf_svd, y_train, X_test_tfidf_svd, y_test)

Train accuracy:  0.9986834743112925
Validation accuracy:  0.8809241706161137
Test accuracy:  0.8841045798344351


## Gradient Boosting Classifier

In [12]:
train(GradientBoostingClassifier(), X_train_tfidf_svd, y_train, X_test_tfidf_svd, y_test)

KeyboardInterrupt: 