In [1]:
from sklearn.datasets import fetch_20newsgroups 
from sklearn.naive_bayes import MultinomialNB 

# these are used to process the data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [8]:
# Goal: sort messages into newsgroup classes based on the words they contain
# and the probabilities calculated from those words.

In [10]:
# The four newsgroup categories this notebook works with
newsgroup_names = [
    'comp.graphics',
    'rec.sport.hockey',
    'sci.electronics',
    'sci.space',
]

# Load only those categories (the first run downloads the dataset);
# shuffling with a fixed random_state keeps the document order repeatable
newsgroups = fetch_20newsgroups(
    categories=newsgroup_names,
    shuffle=True,
    random_state=265,
)

# Show which fields the returned object exposes
newsgroups.keys()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [11]:
# Create both text-processing steps up front
word_vector = CountVectorizer()
term_freq_transformer = TfidfTransformer()

# Step 1: learn the vocabulary and turn every message into per-word counts
word_vector_counts = word_vector.fit_transform(newsgroups.data)

# Step 2: re-weight those raw counts with tf-idf
term_freq = term_freq_transformer.fit_transform(word_vector_counts)

In [12]:
# Fit a multinomial Naive Bayes classifier:
# tf-idf features in, newsgroup label indices out
model = MultinomialNB().fit(
    X=term_freq,
    y=newsgroups.target,
)

In [13]:
# Classify a handful of made-up sentences with the trained model
fake_docs = [
    'That GPU has amazing performance with a lot of shaders',
    'The player had a wicked slap shot',
    'I spent all day yesterday soldering banks of capacitors',
    'Today I have to solder a bank of capacitors',
    'NASA has rovers on Mars',
]

# Only transform here (no re-fitting): the new sentences must be encoded
# with the vocabulary and weights learned from the newsgroup data
fake_counts = word_vector.transform(fake_docs)
fake_term_freq = term_freq_transformer.transform(fake_counts)

# One predicted class index per sentence, shown with its newsgroup name
predicted = model.predict(fake_term_freq)
print('Predictions:')
for sentence, label_index in zip(fake_docs, predicted):
    label = newsgroups.target_names[label_index]
    print('\t{0} => {1}'.format(sentence, label))

# One row of class probabilities per sentence, printed as fixed-width columns
probabilities = model.predict_proba(fake_term_freq)
print('Probabilities:')
header = ''.join('{:17}'.format(group_name) for group_name in newsgroups.target_names)
print(header)
for row in probabilities:
    print(''.join('{:<17.8}'.format(value) for value in row))

Predictions:
	That GPU has amazing performance with a lot of shaders => comp.graphics
	The player had a wicked slap shot => rec.sport.hockey
	I spent all day yesterday soldering banks of capacitors => sci.space
	Today I have to solder a bank of capacitors => sci.electronics
	NASA has rovers on Mars => sci.space
Probabilities:
comp.graphics    rec.sport.hockey sci.electronics  sci.space        
0.29466149       0.22895149       0.24926344       0.22712357       
0.12948055       0.51155698       0.18248712       0.17647535       
0.18604814       0.24117771       0.27540452       0.29736963       
0.21285086       0.21081302       0.3486507        0.22768541       
0.079185633      0.066225915      0.10236622       0.75222223       


In [14]:
# Exercise Option Standard Difficulty

# The Naive Bayes model found some key words in each of the phrases 
# and classified each message into one of the categories. For the 
# first message, I'm guessing that "GPU" and "shaders" 
# were the key words. 

# Below are my own messages that fall within the given categories. 

In [18]:
# Second round: my own sentences, one or two per newsgroup
fake_docs = [
    'That graphics card is outdated by two years',
    'That player is such a bender',
    'I found the voltage of the closed circuit',
    'What is the resistance in amps?',
    'Elon Musk plans to launch his car to Mars',
]

# Encode with the already-fitted vectorizer and tf-idf transformer
fake_counts = word_vector.transform(fake_docs)
fake_term_freq = term_freq_transformer.transform(fake_counts)

# Predicted newsgroup for each sentence
predicted = model.predict(fake_term_freq)
print('Predictions:')
for sentence, label_index in zip(fake_docs, predicted):
    label = newsgroups.target_names[label_index]
    print('\t{0} => {1}'.format(sentence, label))

# Per-class probabilities, one row per sentence under a header of class names
probabilities = model.predict_proba(fake_term_freq)
print('Probabilities:')
header = ''.join('{:17}'.format(group_name) for group_name in newsgroups.target_names)
print(header)
for row in probabilities:
    print(''.join('{:<17.8}'.format(value) for value in row))

Predictions:
	That graphics card is outdated by two years => comp.graphics
	That player is such a bender => rec.sport.hockey
	I found the voltage of the closed circuit => sci.electronics
	What is the resistance in amps? => sci.electronics
	Elon Musk plans to launch his car to Mars => sci.space
Probabilities:
comp.graphics    rec.sport.hockey sci.electronics  sci.space        
0.4689949        0.15663217       0.20373913       0.1706338        
0.14659586       0.42207164       0.21543725       0.21589525       
0.1132552        0.081102297      0.67513708       0.13050542       
0.13857652       0.15302762       0.53160645       0.17678941       
0.076140732      0.13378088       0.16820211       0.62187628       


In [19]:
# I tried to use pretty obvious keywords such as "graphics", but for 
# some reason, the probability for that specific example was less than
# 50 percent. I also searched for hockey terms to create the second 
# example, but the model did not seem as confident as it did with the 
# electronics examples (68 and 53 percent, respectively). I'll try to 
# recreate this test with more obvious keywords. 

In [20]:
# Third round: longer sentences packed with more topic-specific keywords.
# The sentences are kept verbatim (including the missing verb in the third
# one) because the recorded output below was produced from exactly this text.
fake_docs = [
    'I need a better CPU to render those high resolution graphics',
    'The centerman hit the puck straight down the rink',
    'I the voltage of the closed circuit and struggled to calculate the capacitance',
    'What is the resistivity in amps of this circuit as a whole?',
    'SpaceX created a self-landing spacecraft to optimize the reusability of their vehicles',
]

# Encode with the already-fitted vectorizer and tf-idf transformer
fake_counts = word_vector.transform(fake_docs)
fake_term_freq = term_freq_transformer.transform(fake_counts)

# Small text table: predicted newsgroup per sentence ...
predicted = model.predict(fake_term_freq)
print('Predictions:')
for sentence, label_index in zip(fake_docs, predicted):
    label = newsgroups.target_names[label_index]
    print('\t{0} => {1}'.format(sentence, label))

# ... followed by the per-class probabilities behind each prediction
probabilities = model.predict_proba(fake_term_freq)
print('Probabilities:')
header = ''.join('{:17}'.format(group_name) for group_name in newsgroups.target_names)
print(header)
for row in probabilities:
    print(''.join('{:<17.8}'.format(value) for value in row))

Predictions:
	I need a better CPU to render those high resolution graphics => comp.graphics
	The centerman hit the puck straight down the rink => rec.sport.hockey
	I the voltage of the closed circuit and struggled to calculate the capacitance => sci.electronics
	What is the resistivity in amps of this circuit as a whole? => sci.electronics
	SpaceX created a self-landing spacecraft to optimize the reusability of their vehicles => sci.space
Probabilities:
comp.graphics    rec.sport.hockey sci.electronics  sci.space        
0.48973754       0.098827902      0.25206809       0.15936647       
0.10771924       0.54864512       0.16801594       0.1756197        
0.16336696       0.15749811       0.50509739       0.17403754       
0.12219006       0.15400576       0.55876542       0.16503876       
0.16365548       0.16076378       0.17377995       0.5018008        


In [21]:
# Now, most of the probabilities hover around only 50 percent.

In [22]:
# Final round: single-word documents that echo the class names themselves
fake_docs = [
    'Graphics',
    'Hockey',
    'Electronics',
    'Electronics',
    'Space',
]

# Encode with the already-fitted vectorizer and tf-idf transformer
fake_counts = word_vector.transform(fake_docs)
fake_term_freq = term_freq_transformer.transform(fake_counts)

# Small text table: predicted newsgroup per document ...
predicted = model.predict(fake_term_freq)
print('Predictions:')
for sentence, label_index in zip(fake_docs, predicted):
    label = newsgroups.target_names[label_index]
    print('\t{0} => {1}'.format(sentence, label))

# ... followed by the per-class probabilities behind each prediction
probabilities = model.predict_proba(fake_term_freq)
print('Probabilities:')
header = ''.join('{:17}'.format(group_name) for group_name in newsgroups.target_names)
print(header)
for row in probabilities:
    print(''.join('{:<17.8}'.format(value) for value in row))

Predictions:
	Graphics => comp.graphics
	Hockey => rec.sport.hockey
	Electronics => sci.electronics
	Electronics => sci.electronics
	Space => sci.space
Probabilities:
comp.graphics    rec.sport.hockey sci.electronics  sci.space        
0.83097207       0.050320841      0.060914262      0.057792827      
0.045469352      0.86411871       0.045513981      0.044897959      
0.11569539       0.10114255       0.63993689       0.14322517       
0.11569539       0.10114255       0.63993689       0.14322517       
0.058143513      0.037732506      0.046317148      0.85780683       


In [23]:
# Even when typing in only the words that match the class names, the 
# probabilities increased only to roughly 64 to 86 percent. 