# Import the Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation notices from pandas and
# scikit-learn — consider narrowing it to the specific warning being hidden.
warnings.filterwarnings('ignore')

# Load the Data

In [2]:
# Read the news dataset from a CSV file located relative to the working directory.
bbc = pd.read_csv(filepath_or_buffer='BBCNews.csv')

In [3]:
# Preview the first five rows of the loaded frame.
bbc.head(n=5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
# (number of rows, number of columns)
bbc.shape

(2225, 2)

In [5]:
# Column dtypes and non-null counts — a quick check for missing values.
bbc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


# Feature Extraction and Label Encoding

In [6]:
# Model input is the raw 'text' column; the label to predict is 'category'.
features, target = bbc['text'], bbc['category']

In [7]:
# Turn the raw text into a TF-IDF weighted document-term matrix.
#
# * The matrix is built from bbc['text'] rather than from `features` itself, so
#   the cell can be re-run without feeding an already-vectorized matrix back
#   into the vectorizer.
# * The SciPy sparse result is kept as-is. train_test_split and MultinomialNB
#   both accept sparse input, whereas .todense() returns a numpy.matrix, which
#   scikit-learn >= 1.2 rejects with a TypeError and which needs far more memory.
#
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so held-out documents influence the vocabulary and IDF
# weights. Fitting on the training portion only (e.g. with a Pipeline) would
# give a cleaner estimate of generalization.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(bbc['text'])

In [8]:
# Distinct category labels present in the data.
target.unique()

array(['tech', 'business', 'sport', 'entertainment', 'politics'],
      dtype=object)

In [9]:
# Encode the category names as integer class ids (alphabetical order).
# The mapping is applied to the original bbc['category'] column rather than to
# `target` itself, so re-running this cell does not map the already-encoded
# integers to NaN.
category_ids = {'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}
target = bbc['category'].map(category_ids)

# Series.map yields NaN for any label that is missing from the dict; fail
# loudly here instead of handing NaN labels to the classifier.
assert target.notna().all(), 'unmapped category label found'

# Split the Data

In [10]:
# Keep 80% of the samples for training and hold out the rest for evaluation;
# the fixed random_state makes the shuffle, and hence the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.8,
    random_state=17,
)

# Select and Train the Model

In [11]:
# Multinomial naive Bayes classifier, constructed with scikit-learn's default hyperparameters.
model = MultinomialNB()

In [12]:
# Fit on the training split. fit() returns the estimator itself, which is why
# its repr is shown as the cell output.
model.fit(x_train, y_train)

MultinomialNB()

# Evaluate the Model

In [13]:
# Score the held-out split: overall accuracy first, then the confusion matrix
# (rows are true classes, columns are predicted classes, ordered by class id).
prediction = model.predict(x_test)
test_accuracy = accuracy_score(y_test, prediction)
test_confusion = confusion_matrix(y_test, prediction)
print(test_accuracy)
print(test_confusion)

0.9550561797752809
[[101   0   3   0   3]
 [  1  68   1   3   1]
 [  1   0  87   2   0]
 [  1   0   0  92   0]
 [  0   1   2   1  77]]
