In [1]:
#@title MIT License
#
# Copyright (c) 2020 Balázs Pintér
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score, KFold, ShuffleSplit, GridSearchCV, RandomizedSearchCV
import nltk
from nltk import word_tokenize
import pandas as pd

# Getting a corpus

In [3]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))

## Obtain labeled examples

In [4]:
num_of_features = 5000
num_of_examples = 10000

In [5]:
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=num_of_features)

In [6]:
corpus = dataset.data[:num_of_examples]

In [7]:
print(corpus[100])


I've been a very intent NREN spectator of the NREN for years.  As a 
commercial IP software vendor, it really is my professional opinion that the 
NREN, at this point, is irrelevant to private sector networking.  If it had 
been deployed five years ago, it would have been a major development.  Now,
however, it's just an upgrade to the NSFnet, and an attempt to revive the 
lagging use of the national supercomputer centers.  You could cut out the 
NSFnet completely, and the Internet would continue chugging along without a 
hiccup (aside from a few universities).

Long-haul networking and Internet connectivity have long since ceased to be 
under federal sponsorship or regulation, at least in the USA.  The success of 
the CIX (Commercial Internet Exchange) is a prime example of this.  While our 
dear VP has been promoting his "data superhighway," the private sector has 
been building it, without the NSFnet's restrictions.

To illustrate, a connection from the machine on my desk to the mac

In [8]:
bows = count_vectorizer.fit_transform(corpus)
bows

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 442850 stored elements and shape (10000, 5000)>

In [9]:
vocabulary = count_vectorizer.get_feature_names_out()
print(vocabulary[100])

1eq


In [10]:
print(bows[100])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 51 stored elements and shape (1, 5000)>
  Coords	Values
  (0, 2502)	1
  (0, 1195)	1
  (0, 560)	1
  (0, 4700)	2
  (0, 4983)	2
  (0, 555)	1
  (0, 2746)	2
  (0, 4179)	1
  (0, 1096)	3
  (0, 1338)	1
  (0, 3425)	1
  (0, 2400)	3
  (0, 4661)	1
  (0, 4698)	1
  (0, 2765)	1
  (0, 402)	1
  (0, 1720)	1
  (0, 3395)	1
  (0, 2700)	2
  (0, 1164)	1
  (0, 3514)	1
  (0, 837)	1
  (0, 4750)	1
  (0, 1725)	1
  (0, 3686)	1
  :	:
  (0, 3490)	2
  (0, 3028)	1
  (0, 2425)	1
  (0, 4746)	1
  (0, 1314)	1
  (0, 1451)	1
  (0, 1124)	1
  (0, 605)	1
  (0, 2198)	2
  (0, 2387)	1
  (0, 2419)	2
  (0, 3176)	1
  (0, 4009)	2
  (0, 3058)	2
  (0, 4692)	1
  (0, 945)	1
  (0, 1833)	1
  (0, 3739)	1
  (0, 4353)	1
  (0, 3476)	1
  (0, 1360)	1
  (0, 3822)	2
  (0, 1431)	1
  (0, 2416)	1
  (0, 1992)	1


In [11]:
corpus[100]

'\nI\'ve been a very intent NREN spectator of the NREN for years.  As a \ncommercial IP software vendor, it really is my professional opinion that the \nNREN, at this point, is irrelevant to private sector networking.  If it had \nbeen deployed five years ago, it would have been a major development.  Now,\nhowever, it\'s just an upgrade to the NSFnet, and an attempt to revive the \nlagging use of the national supercomputer centers.  You could cut out the \nNSFnet completely, and the Internet would continue chugging along without a \nhiccup (aside from a few universities).\n\nLong-haul networking and Internet connectivity have long since ceased to be \nunder federal sponsorship or regulation, at least in the USA.  The success of \nthe CIX (Commercial Internet Exchange) is a prime example of this.  While our \ndear VP has been promoting his "data superhighway," the private sector has \nbeen building it, without the NSFnet\'s restrictions.\n\nTo illustrate, a connection from the machine o

In [12]:
bows[100][bows[100].nonzero()]

matrix([[1, 1, 1, 2, 2, 1, 2, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 1,
         1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1,
         1, 1, 1, 1, 1, 2, 1, 1, 1]])

In [13]:
print([vocabulary[ind] for ind in bows[100].nonzero()[1]])

['just', 'continue', 'aside', 'use', 'years', 'article', 'machine', 'software', 'commercial', 'data', 'posted', 'internet', 'uk', 'usa', 'major', 'ago', 'example', 'point', 'long', 'connection', 'professional', 'building', 'vendor', 'exchange', 'really', 'appropriate', 'private', 'national', 'irrelevant', 've', 'cut', 'development', 'completely', 'attempt', 'hits', 'intent', 'ip', 'opinion', 'sector', 'networking', 'upgrade', 'centers', 'federal', 'regulation', 'success', 'prime', 'dear', 'restrictions', 'desk', 'involves', 'gateway']


In [14]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [15]:
targets = dataset.target[:num_of_examples]

In [16]:
targets[:10]

array([17,  0, 17, 11, 10, 15,  4, 17, 13, 12])

In [17]:
set(targets)

{np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6),
 np.int64(7),
 np.int64(8),
 np.int64(9),
 np.int64(10),
 np.int64(11),
 np.int64(12),
 np.int64(13),
 np.int64(14),
 np.int64(15),
 np.int64(16),
 np.int64(17),
 np.int64(18),
 np.int64(19)}

## Produce training and test data

TODO: Create and use a OneHotEncoder or the to_categorical function

TODO: Split the test set using train_test_split

Tip: You need to convert sparse matrixes to np arrays to use them as input to the network (.to_array())

In [18]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

2024-11-19 14:16:10.874277: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-19 14:16:10.874955: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-19 14:16:10.877212: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-19 14:16:10.882960: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732025770.891723   32158 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732025770.89

In [19]:
# onehotencoder
depth = max(targets)
one_hot = tf.one_hot(targets, depth).numpy()

2024-11-19 14:16:11.952508: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [20]:
one_hot.shape

(10000, 19)

In [21]:
type(bows)

scipy.sparse._csr.csr_matrix

In [34]:
#traintestsplit
X_train, X_test, y_train, y_test = train_test_split(
  bows.todense() , one_hot , random_state=104,test_size=0.25, shuffle=True)

## Classify and look at results

TODO: Create, compile, fit the model

TODO: Evaluate the model on the test set

In [35]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(5000,)), 
    tf.keras.layers.Dense(256, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(128, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(19, activation='softmax')
])

In [36]:
y_train = np.array(y_train)
X_train = np.array(X_train)


assert X_train.size > 0, "X_train is empty!"
assert y_train.size > 0, "y_train is empty!"

print(y_train.shape)
print(X_train.shape)

(7500, 19)
(7500, 5000)


In [40]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)


Epoch 1/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.3742 - loss: 702.7924 - val_accuracy: 0.4360 - val_loss: 379.2150
Epoch 2/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.5328 - loss: 253.4762 - val_accuracy: 0.5547 - val_loss: 113.0280
Epoch 3/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.6779 - loss: 71.8092 - val_accuracy: 0.6200 - val_loss: 48.5627
Epoch 4/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.7690 - loss: 21.0207 - val_accuracy: 0.6640 - val_loss: 22.6173
Epoch 5/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.8173 - loss: 6.1022 - val_accuracy: 0.6960 - val_loss: 16.1676
Epoch 6/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8387 - loss: 3.4693 - val_accuracy: 0.6853 - val_loss: 17.5953
Epoch 7/

<keras.src.callbacks.history.History at 0x745579ea9310>