In [1]:
from nbsvm import NBSVM
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Prepare Data From 20newsgroups

In [2]:
cats = ['alt.atheism',
        'comp.graphics',
        'comp.os.ms-windows.misc',
        'talk.politics.misc',
        'rec.sport.hockey',
        'sci.crypt',
        'sci.electronics',
        'sci.med',
        'sci.space',
        'soc.religion.christian',]

newsgroups = fetch_20newsgroups(categories=cats, subset='all')


### Raw Text Data

In [3]:
newsgroups.data[:2]

['From: ednclark@kraken.itc.gu.edu.au (Jeffrey Clark)\nSubject: Re: Societally acceptable behavior\nNntp-Posting-Host: kraken.itc.gu.edu.au\nOrganization: ITC, Griffith University, Brisbane, Australia\nLines: 49\n\ncobb@alexia.lis.uiuc.edu (Mike Cobb) writes:\n\n>Merely a question for the basis of morality\n\n>Moral/Ethical behavior = _Societally_ _acceptable_ _behavior_.\n\n>1)Who is society\n\nSociety is the collection of individuals which will fall under self-defined\nrules.  In terms of UN decisions all the sets of peoples who are represented\nat the UN are considered part of that society. If we then look at US federal\nlaws provided by representatives of purely US citizens then the society for\nthat case would be the citizens of the US and so on.\n\n>2)How do "they" define what is acceptable?\n\n"Acceptable" are those behaviours which are either legislated for the\nsociety by representatives of that society or those behaviours which are\nnon-verbally and, in effect, non-consciousl

### Label should be  0 ~ 9 because we request 10 topics

In [4]:
newsgroups.target

array([0, 6, 7, ..., 1, 4, 5], dtype=int64)

In [5]:
from collections import Counter
Counter(newsgroups.target)

Counter({0: 799,
         6: 990,
         7: 987,
         9: 775,
         8: 997,
         1: 973,
         5: 984,
         3: 999,
         4: 991,
         2: 985})

# Transform text to vector
Here we use tfidf vectorzier and split it to train set and test set

In [6]:
text_vectors = TfidfVectorizer(ngram_range=(1, 2)).fit_transform(newsgroups.data)

x_train, x_test, y_train, y_test =  train_test_split(
                                        text_vectors,
                                        newsgroups.target,
                                        test_size=0.2,
                                        random_state=9487)

print(f"x_train.shape:{x_train.shape}")
print(f"x_test.shape:{x_test.shape}")
print(f"y_train.shape:{y_train.shape}")
print(f"y_test.shape:{y_test.shape}")

x_train.shape:(7584, 1052830)
x_test.shape:(1896, 1052830)
y_train.shape:(7584,)
y_test.shape:(1896,)


In [7]:
x_train, y_train

(<7584x1052830 sparse matrix of type '<class 'numpy.float64'>'
 	with 3356839 stored elements in Compressed Sparse Row format>,
 array([1, 9, 1, ..., 5, 4, 9], dtype=int64))

# Train & Test

### Training

In [8]:
clf = NBSVM(class_num=len(cats))
clf.fit(x_train, y_train)

2019/05/17 02:11:15 [INFO] Shape of Training Set: (7584, 1052830)
2019/05/17 02:13:02 [INFO] Training comsuming time: 107.102134 s
2019/05/17 02:13:13 [INFO] Train set acc: 0.9988132911392406


<NBSVM: classes=10>

### Training by multi-process

In [9]:
clf = NBSVM(class_num=len(cats), n_jobs=-1)
clf.fit(x_train, y_train)

2019/05/17 02:13:13 [INFO] Shape of Training Set: (7584, 1052830)
2019/05/17 02:14:10 [INFO] Training comsuming time: 57.527142 s
2019/05/17 02:14:21 [INFO] Train set acc: 0.9988132911392406


<NBSVM: classes=10>

### Testing

In [10]:
print("Test Acc:", clf.evaluate(x_test, y_test))

Test Acc: 0.9556962025316456


### Get Probablities of each class

In [11]:
print("Predict Probabilites for first 3 training-sample:\n", clf.predict_prob(x_train[:3]))

print(f'-\nLabel of first 3 training-sample: {y_train[:3]}')

Predict Probabilites for first 3 training-sample:
 [[0.01399269 0.71623784 0.0250064  0.01469968 0.01539825 0.03450357
  0.02821238 0.02011785 0.01565684 0.01549794]
 [0.02142355 0.01692018 0.01417318 0.01663919 0.02041968 0.02307865
  0.0277924  0.02171772 0.0371468  0.92540225]
 [0.01287614 0.93184883 0.07596478 0.01515449 0.01586291 0.04773813
  0.0259994  0.0168499  0.02104456 0.01509836]]
-
Label of first 3 training-sample: [1 9 1]


### Predict Label

In [12]:
clf.predict(x_train[:3])

array([1, 9, 1], dtype=int64)

# Save & Load

In [13]:
clf

<NBSVM: classes=10>

### Save

In [14]:
save_path = './model/nbsvm.demo.model'
clf.save(save_path)

2019/05/17 02:14:27 [INFO] save models to ./model/nbsvm.demo.model, md5=d032c5a2ea5909cf8fe6c4c983528ccf


### Load and Validate

In [15]:
new_clf = NBSVM()
new_clf.load(save_path)
new_clf

2019/05/17 02:14:27 [INFO] Load models from :./model/nbsvm.demo.model, md5=d032c5a2ea5909cf8fe6c4c983528ccf


<NBSVM: classes=10>

In [16]:
new_clf.evaluate(x_test, y_test) == clf.evaluate(x_test, y_test)

True

In [17]:
new_clf.predict(x_train[:3])

array([1, 9, 1], dtype=int64)