In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Location of the input data, relative to the notebook's working directory.
csv_in = './newsgroups.csv'

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [None]:
# Read the comma-separated input file; its first row supplies the column names.
df = pd.read_csv(csv_in, skiprows=0, delimiter=',',
                 header=0, encoding='latin-1')
print(df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
display(df.head())

In [None]:
# Keep only rows that have both a document and a label: blank CSV fields are
# parsed as NaN by read_csv, and CountVectorizer rejects NaN documents with a
# ValueError. When no values are missing this selects exactly the same rows.
df_labeled = df.dropna(subset=["text", "category"])

# X holds the raw documents, y the class label of each document.
X = df_labeled["text"]
y = df_labeled["category"]

In [None]:
# Hold out 25% of the rows for evaluation; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5
)

# Sanity check: features and labels must have matching lengths in each part.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Learn the vocabulary from the training documents only, so that no information
# from the test documents reaches the features. fit() returns the vectorizer.
vectorizer = CountVectorizer().fit(X_train)

# Array of the learned tokens, one entry per feature column.
vocab = vectorizer.get_feature_names_out()
print('Vocabulary size:', len(vocab))
print(vocab[:10])  # debug

In [None]:
# Convert both document sets to bag-of-words count matrices using the
# vocabulary learned above (training set first, then test set).
X_train_bow, X_test_bow = (
    vectorizer.transform(docs) for docs in (X_train, X_test)
)

In [None]:
# Multinomial Naive Bayes on the token counts; alpha=1.0 is the smoothing
# parameter passed to the estimator. fit() returns the fitted estimator.
model = MultinomialNB(alpha=1.0).fit(X_train_bow, y_train)

# Class labels seen during fitting.
print(model.classes_)

In [None]:
# Mean accuracy on the data the model was fitted on (for comparison with the
# held-out accuracy computed later).
train_score = model.score(X=X_train_bow, y=y_train)
print('Train accuracy:', train_score)

In [None]:
# Predict a label for every held-out document.
y_test_pred = model.predict(X_test_bow)

# Put predictions next to the true labels. y_test still carries the row labels
# of the original frame, so its index is reset to 0..n-1 before the frame is
# built; the prediction array is then matched to it by position.
df_pred = pd.DataFrame({
    'pred': y_test_pred,
    'true': y_test.reset_index(drop=True),
})
display(df_pred.head())

In [None]:
# Cross-tabulate the results: rows are predicted labels, columns are true
# labels, and each cell counts the test documents with that combination.
ctab = pd.crosstab(index=df_pred['pred'], columns=df_pred['true'])
display(ctab)

In [None]:
# Mean accuracy on the held-out documents.
test_score = model.score(X=X_test_bow, y=y_test)
print('Test accuracy:', test_score)