In [1]:
import string

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load data
# Read the news-aggregator CSV (comma-separated, pandas' default delimiter)
# into a DataFrame, then show the distinct category codes it contains.
filename = 'uci-news-aggregator.csv'
df = pd.read_csv(filename)
df['CATEGORY'].unique()

array(['b', 't', 'e', 'm'], dtype=object)

In [3]:
# Preprocess data
# Encode the category letters as integer class labels. Values that are not
# keys of the mapping are passed through unchanged, so running this cell a
# second time keeps the already-encoded labels instead of turning every one
# of them into NaN (which is what a plain dict-based Series.map does for
# values missing from the dict).
category_codes = {'b': 1, 't': 2, 'e': 3, 'm': 4}
df['CATEGORY'] = df['CATEGORY'].map(lambda code: category_codes.get(code, code))

# Normalise the headlines: lower-case them and strip ASCII punctuation.
# The translation table is built once instead of once per row, and the
# vectorised .str methods replace the per-row lambda. Both steps give the
# same result when applied again, so this part is safe to re-run as well.
punctuation_table = str.maketrans('', '', string.punctuation)
df['TITLE'] = df['TITLE'].str.lower().str.translate(punctuation_table)

print(df['TITLE'])

0         fed official says weak data caused by weather ...
1         feds charles plosser sees high bar for change ...
2         us open stocks fall after fed official hints a...
3         fed risks falling behind the curve charles plo...
4          feds plosser nasty weather has curbed job growth
                                ...                        
422414    surgeons to remove 4yearolds rib to rebuild da...
422415    boy to have surgery on esophagus after battery...
422416    child who swallowed battery to have reconstruc...
422417    phoenix boy undergoes surgery to repair throat...
422418    phoenix boy undergoes surgery to repair throat...
Name: TITLE, Length: 422419, dtype: object


In [4]:
# Split into train and test data sets
# Headlines are the features and the category labels the targets. The fixed
# random_state makes the shuffle (and therefore the split) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.TITLE,
    df.CATEGORY,
    random_state=1,
)
print("Training dataset: ", len(X_train))
print("Test dataset: ", len(X_test))

Training dataset:  316814
Test dataset:  105605


In [5]:
# Extract features
#Apply bag of words processing to the dataset
count_vector = CountVectorizer(stop_words = 'english')
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [6]:
# Train Multinomial Naive Bayes classifier
# MultinomialNB is imported in the notebook's import cell together with the
# other scikit-learn names, so this cell only builds and fits the model on
# the bag-of-words counts of the training headlines.
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, Y_train)

In [7]:
# Predict a category label for every encoded test headline and display the
# resulting array as the cell output.
predictions = naive_bayes.predict(testing_data)
predictions

array([1, 1, 3, ..., 1, 1, 4], dtype=int64)

In [8]:
# Evaluate the predictions against the held-out labels.
# Accuracy takes no averaging option; recall, precision and F1 are reported
# with scikit-learn's 'weighted' averaging across the classes.
print("Accuracy score: ", accuracy_score(Y_test, predictions))
for caption, scorer in (
    ("Recall score: ", recall_score),
    ("Precision score: ", precision_score),
    ("F1 score: ", f1_score),
):
    print(caption, scorer(Y_test, predictions, average='weighted'))

Accuracy score:  0.9283367264807537
Recall score:  0.9283367264807537
Precision score:  0.9284057938650963
F1 score:  0.9283611479662869
