In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Read the news-aggregator CSV into a DataFrame, then show the distinct
# values of its CATEGORY column (the labels that get encoded further down).
filename = 'uci-news-aggregator.csv'
df = pd.read_csv(filename, sep=",")
df["CATEGORY"].unique()

array(['b', 't', 'e', 'm'], dtype=object)

In [4]:
import string

# Integer id assigned to each single-letter code in the CATEGORY column.
CATEGORY_CODES = {'b': 1, 't': 2, 'e': 3, 'm': 4}
# Translation table that deletes every ASCII punctuation character.
PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Series.map() silently turns any value missing from the mapping into NaN.
# That happens for an unexpected category and also when this cell is re-run
# on labels that were already encoded, so validate before touching `df`.
unknown_categories = set(df['CATEGORY'].unique()) - CATEGORY_CODES.keys()
if unknown_categories:
    raise ValueError(
        "CATEGORY contains values outside CATEGORY_CODES: "
        f"{unknown_categories!r}. If this cell was already executed, "
        "re-run the data-loading cell first."
    )

# Keep only the relevant columns (TITLE as the feature, CATEGORY as the label).
# .assign() builds a new frame instead of writing into a slice of the
# original, which avoids pandas' SettingWithCopyWarning.
df = df[['TITLE', 'CATEGORY']].assign(
    # Encode the category letters as integers.
    CATEGORY=lambda frame: frame['CATEGORY'].map(CATEGORY_CODES),
    # Normalise titles: lower-case, then strip punctuation.
    TITLE=lambda frame: frame['TITLE'].str.lower().str.translate(PUNCT_TABLE),
)
print(df['TITLE'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the
# partition identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df['TITLE'],
    df['CATEGORY'],
    test_size=0.2,
    random_state=42,
)
# Report how many titles ended up on each side of the split.
print("Training dataset: ", len(x_train))
print("Test dataset: ", len(x_test))

0         fed official says weak data caused by weather ...
1         feds charles plosser sees high bar for change ...
2         us open stocks fall after fed official hints a...
3         fed risks falling behind the curve charles plo...
4          feds plosser nasty weather has curbed job growth
                                ...                        
422414    surgeons to remove 4yearolds rib to rebuild da...
422415    boy to have surgery on esophagus after battery...
422416    child who swallowed battery to have reconstruc...
422417    phoenix boy undergoes surgery to repair throat...
422418    phoenix boy undergoes surgery to repair throat...
Name: TITLE, Length: 422419, dtype: object
Training dataset:  337935
Test dataset:  84484


In [5]:

# Bag-of-words encoder that drops scikit-learn's built-in English stop words.
count_vector = CountVectorizer(stop_words="english")

# The vocabulary is learned from the training titles only; the test titles
# are then encoded against that same vocabulary.
x_training = count_vector.fit_transform(x_train)
x_testing = count_vector.transform(x_test)

In [None]:
# Initialize the Decision Tree classifier.
# random_state is fixed because scikit-learn randomly permutes the features
# at every split, so ties between equally good splits can be broken
# differently from run to run; without a seed the fitted tree (and every
# metric computed from it) is not reproducible. 42 matches the seed used
# for the train/test split.
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the vectorised training titles and their labels.
clf.fit(x_training, y_train)

In [None]:
# Predict a category label for every row of the vectorised test titles
# using the fitted classifier; y_pred is compared against y_test below.
y_pred = clf.predict(x_testing)


In [None]:
# Fraction of test titles whose predicted category matches the true one.
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 are each computed per class and then averaged
# with equal weight per class (average='macro').
precision, recall, F1_score = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

In [None]:
# Print each evaluation metric on its own line, label first.
for metric_label, metric_value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1_Score: ", F1_score),
):
    print(metric_label, metric_value)