In [141]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [142]:
# Read the news dataset from a CSV path relative to the working directory.
filename = 'datasets/news_dataset.csv'
df = pd.read_csv(filename)
# Display the distinct labels found in the Category column.
df['Category'].unique()

array(['world', 'business', 'politics', 'health', 'entertainment',
       'sport'], dtype=object)

In [143]:
# Remove rows containing any missing value, then remove exact duplicate rows.
# The result is rebound to `df` instead of mutating with inplace=True: the
# resulting frame is the same, but the step reads as a single chain and no
# longer modifies the object returned by read_csv behind the reader's back.
df = df.dropna().drop_duplicates()

In [144]:
# Cache of stop-word sets keyed by language, filled on first use so the NLTK
# corpus is read once instead of once per cleaned row.
_STOPWORDS_BY_LANGUAGE = {}


def _load_stopwords(language='english'):
    """Return the NLTK stop-word list for ``language`` as a frozenset, loading it only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def clean_text(text, stop_words=None):
    """Normalise a piece of text for bag-of-words modelling.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text is
    lower-cased and split on whitespace, stop words are dropped, and the
    remaining tokens are joined back with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Lower-case words to remove. When omitted, the NLTK English stop-word
        list is used (loaded once and cached).

    Returns
    -------
    str
        The cleaned, space-separated text; empty if no token survives.
    """
    if stop_words is None:
        stop_words = _load_stopwords('english')
    # Digits, punctuation and any non-ASCII letters all become separators here.
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(token for token in tokens if token not in stop_words)

# Run every headline through clean_text and store the result back in Title.
df['Title'] = df['Title'].map(clean_text)

In [145]:
# Select the relevant columns (Title and Category). The explicit .copy() makes
# the column assignments below write to an independent frame rather than to a
# slice of the previous one, which is what triggers SettingWithCopyWarning.
df = df[['Title','Category']].copy()

# Explicit category -> id mapping; it fixes the label order (world first, sport last).
category_ids = df.Category.map({'world':1, 'business':2, 'politics':3, 'health':4,'entertainment': 5, 'sport': 6})
# Series.map yields NaN for any value missing from the dict. Fail loudly rather
# than train on NaN labels; this also catches an accidental second run of this
# cell, when Category already holds integers instead of names.
if category_ids.isna().any():
    unmapped = df.loc[category_ids.isna(), 'Category'].unique().tolist()
    raise ValueError("Unmapped Category values: {}".format(unmapped))
df['Category'] = category_ids

# Lower-case the titles and strip ASCII punctuation. The translation table is
# built once instead of once per row.
punctuation_table = str.maketrans('','', string.punctuation)
df['Title'] = df.Title.map(lambda x: x.lower().translate(punctuation_table))

# Re-index the ids produced above (1..6) to consecutive labels starting at 0.
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])

# Split the data into training and testing sets (80/20, fixed seed).
x_train,x_test,y_train, y_test = train_test_split(df['Title'], df['Category'], test_size=0.2, random_state = 42)
print("Training dataset: ", x_train.shape[0])
print("Test dataset: ", x_test.shape[0])

Training dataset:  1294
Test dataset:  324


In [146]:

# Bag-of-words vectoriser using scikit-learn's built-in English stop-word list.
count_vector = CountVectorizer(stop_words="english")

# Learn the vocabulary from the training titles only, then encode both splits
# with that same vocabulary so the test set never influences the features.
count_vector.fit(x_train)
x_training = count_vector.transform(x_train)
x_testing = count_vector.transform(x_test)

In [147]:
# Initialize the Decision Tree classifier. A fixed random_state makes the fitted
# tree, and every metric computed from it, reproducible across runs; the value
# matches the seed already passed to train_test_split.
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the vectorised training titles.
clf.fit(x_training,y_train)

In [148]:
# Predict an encoded category label for every vectorised test title.
y_pred = clf.predict(x_testing)
# Print the predicted labels as returned by the classifier.
print(y_pred)

[0 0 3 5 5 3 5 0 0 4 1 4 5 0 0 0 0 0 4 0 0 0 4 0 5 4 0 0 3 0 4 0 1 1 0 0 4
 3 3 0 0 0 0 1 4 3 4 2 0 0 5 5 0 4 5 0 3 0 0 5 1 2 1 5 1 3 4 4 2 5 0 0 4 2
 5 2 0 1 1 5 0 0 0 0 3 0 0 4 1 0 0 5 5 4 5 4 1 4 4 5 1 4 4 0 0 1 3 0 4 0 5
 1 1 1 1 1 0 0 1 0 1 3 4 4 1 1 1 3 0 3 0 5 0 3 0 0 0 2 3 1 5 0 4 1 3 3 1 5
 1 0 3 1 2 5 0 0 5 0 4 1 1 0 0 4 0 5 4 0 3 3 3 0 5 1 5 0 0 3 0 5 2 0 3 0 4
 1 5 4 0 5 2 4 5 5 3 4 5 2 3 0 2 0 0 5 1 1 0 0 4 0 0 0 0 4 4 5 2 1 0 0 0 0
 0 2 1 0 0 0 1 1 0 4 4 1 4 3 1 3 3 1 3 4 0 3 2 5 5 1 0 3 1 0 3 1 5 1 0 5 1
 2 1 0 1 1 0 0 0 0 1 4 0 5 3 5 1 5 0 0 0 3 2 5 0 4 1 4 1 5 0 0 0 0 5 0 1 0
 3 2 0 5 4 3 0 5 0 0 0 2 0 0 4 4 3 1 0 5 4 0 5 0 3 0 1 0]


In [149]:
# Fraction of test titles whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 with macro averaging: each is computed per class
# and the per-class values are averaged with equal weight.
precision, recall, F1_score = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

In [150]:
# Report the evaluation metrics, one per line.
for label, value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1_Score: ", F1_score),
):
    print(label, value)

Accuracy:  0.6790123456790124
Precision:  0.7190522261536194
Recall:  0.6824961171862349
F1_Score:  0.6886636834487168
