In [1]:
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [2]:
# Training split: documents 0-49 of every topic, grouped topic by topic in
# the order giaoduc, chinhtri, phapluat, suckhoe.
filename_train = [
    prefix + str(doc_id) + ".txt"
    for prefix in (
        "Dataset/giaoduc",
        "Dataset/chinhtri",
        "Dataset/phapluat",
        "Dataset/suckhoe",
    )
    for doc_id in range(50)
]

# Held-out split: documents 50-59 of the first three topics, and 50-69 of
# "suckhoe" (twice as many files as the others), in the same topic order.
filename_test = [
    prefix + str(doc_id) + ".txt"
    for prefix, stop in (
        ("Dataset/giaoduc", 60),
        ("Dataset/chinhtri", 60),
        ("Dataset/phapluat", 60),
        ("Dataset/suckhoe", 70),
    )
    for doc_id in range(50, stop)
]

In [3]:
# Read every document fully into memory, one string per file, keeping the
# order of the filename lists so that positions line up with the labels.
# Each file is opened in a `with` block so its handle is closed right after
# reading instead of being left for the garbage collector.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm the encoding of the dataset files and pin it if needed.
X_train = []
for filename in filename_train:
    with open(filename, 'r') as text_file:
        X_train.append(text_file.read())

X_test = []
for filename in filename_test:
    with open(filename, 'r') as text_file:
        X_test.append(text_file.read())

In [4]:
# Integer class ids, assigned purely by position in the document lists:
# 200 training labels in four consecutive runs of 50 (ids 0, 1, 2, 3).
Y_train = [position // 50 for position in range(200)]

# 50 test labels: runs of 10 for ids 0, 1 and 2, then every remaining
# position (30-49) gets id 3.
Y_test = [min(position // 10, 3) for position in range(50)]

In [5]:
# Bag-of-words exploration: learn the vocabulary from the training texts and
# turn them into a document-term count matrix with default settings.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
# Cell result: the matrix shape, i.e. (number of documents, vocabulary size).
X_train_counts.shape

(200, 4464)

In [6]:
# Rescale the raw counts with IDF weighting switched off (use_idf=False),
# fitting and transforming the training count matrix in a single call.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
# Cell result: shape of the transformed matrix.
X_train_tf.shape

(200, 4464)

In [7]:
# TF-IDF exploration: go from the raw training texts to TF-IDF features in
# one step, using the vectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Cell result: the matrix shape, i.e. (number of documents, vocabulary size).
X_train_tfidf.shape

(200, 4464)

In [8]:
# Baseline model: TF-IDF features fed into a linear SVM, wrapped in a single
# Pipeline so the vectorizer is always fitted on exactly the data the
# classifier is trained on (this matters for the cross-validation and grid
# search that reuse `text_clf`).
# random_state is pinned because LinearSVC uses a random number generator
# while fitting; without a fixed seed, reruns of the notebook may not
# reproduce the same scores.
text_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=0)),
])
text_clf.fit(X_train, Y_train)

# Predicted class ids for the held-out documents.
predicted = text_clf.predict(X_test)

In [9]:
# Per-class precision / recall / F1 of the current `predicted` labels
# against the held-out ground truth.
print(metrics.classification_report(y_true=Y_test, y_pred=predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        10
          1       1.00      1.00      1.00        10
          2       0.77      1.00      0.87        10
          3       1.00      0.85      0.92        20

avg / total       0.95      0.94      0.94        50



In [10]:
# 10-fold cross-validation of the baseline pipeline over all 250 documents
# (training and held-out lists concatenated).
scores = cross_val_score(
    estimator=text_clf,
    X=X_train + X_test,
    y=Y_train + Y_test,
    cv=10,
)
print(scores)
# Mean fold score, with two standard deviations as the spread.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.96 0.96 1.   1.   0.96 1.   0.96 1.   0.92 0.96]
Accuracy: 0.97 (+/- 0.05)


In [11]:
# Hyper-parameter grid for the pipeline; the `<step>__<param>` keys address
# the 'vect' and 'clf' steps of `text_clf`.  2 x 2 x 4 = 16 combinations.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__use_idf=(True, False),
    clf__C=(1.0, 0.1, 0.01, 0.001),
)

# Exhaustive search over the grid on the training split, using all
# available cores (n_jobs=-1).  `fit` returns the search object itself.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(X_train, Y_train)

# Take the best pipeline found and score it on the held-out documents.
clf = gs_clf.best_estimator_
predicted = clf.predict(X_test)

In [12]:
# Per-class precision / recall / F1 of the current `predicted` labels
# against the held-out ground truth.
print(metrics.classification_report(y_true=Y_test, y_pred=predicted))

             precision    recall  f1-score   support

          0       0.83      1.00      0.91        10
          1       1.00      1.00      1.00        10
          2       0.77      1.00      0.87        10
          3       1.00      0.75      0.86        20

avg / total       0.92      0.90      0.90        50



In [13]:
# 5-fold cross-validation of the whole grid search over all 250 documents:
# the search is re-run inside every outer fold (nested cross-validation).
scores = cross_val_score(
    estimator=gs_clf,
    X=X_train + X_test,
    y=Y_train + Y_test,
    cv=5,
)
print(scores)
# Mean fold score, with two standard deviations as the spread.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.98 1.   0.98 0.98 0.94]
Accuracy: 0.98 (+/- 0.04)


In [7]:
# Sanity check: show the Python type of the first loaded training document.
print(type(X_train[0]))

<class 'str'>
