In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import spacy
from sklearn.naive_bayes import MultinomialNB

# for NLTK and spaCy preprocessing
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /Users/arthur/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arthur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/arthur/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Preprocessing

In [None]:
# read json file into dataframe
df = pd.read_json('/Users/arthur/My Drive/Education/Master/Copenhagen Business School/2. Semester/Natural Language Processing and Text Analytics/Assignments/Final Assignment/News_Category_Dataset_v3.json', lines=True)

#### Inspecting the Data:

In [None]:
print(df.shape)
df.head()

(209527, 6)


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [None]:
print("Unique categories:",df['category'].nunique())
print(df['category'].unique())
df['category'].value_counts()

Unique categories: 42
['U.S. NEWS' 'COMEDY' 'PARENTING' 'WORLD NEWS' 'CULTURE & ARTS' 'TECH'
 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WEIRD NEWS' 'ENVIRONMENT'
 'EDUCATION' 'CRIME' 'SCIENCE' 'WELLNESS' 'BUSINESS' 'STYLE & BEAUTY'
 'FOOD & DRINK' 'MEDIA' 'QUEER VOICES' 'HOME & LIVING' 'WOMEN'
 'BLACK VOICES' 'TRAVEL' 'MONEY' 'RELIGION' 'LATINO VOICES' 'IMPACT'
 'WEDDINGS' 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'THE WORLDPOST' 'GOOD NEWS' 'WORLDPOST' 'FIFTY' 'ARTS'
 'DIVORCE']


POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [None]:
df.describe()

  df.describe()


Unnamed: 0,link,headline,category,short_description,authors,date
count,209527,209527,209527,209527.0,209527.0,209527
unique,209486,207996,42,187022.0,29169.0,3890
top,https://www.huffingtonpost.comhttps://www.wash...,Sunday Roundup,POLITICS,,,2014-03-25 00:00:00
freq,2,90,35602,19712.0,37418.0,100
first,,,,,,2012-01-28 00:00:00
last,,,,,,2022-09-23 00:00:00


In [None]:
df.isnull().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [None]:
# checking for empty headlines, description and categories
empty_headlines = df[df['headline'] == '']['headline'].value_counts()
empty_description = df[df['short_description'] == '']['short_description'].value_counts()
empty_category = df[df['category'] == '']['category'].value_counts()

print(f'Empty Count: {empty_headlines} \n'
      f'Empty Count: {empty_description}\n'
      f'Empty Count: {empty_category}')

Empty Count:     6
Name: headline, dtype: int64 
Empty Count:     19712
Name: short_description, dtype: int64
Empty Count: Series([], Name: category, dtype: int64)


#### Data Preprocessing:

In [None]:
# Dropping Columns that are not necessary for our analysis
# Dropping Rows with empty strings in the headline

columns_to_drop = ['authors', 'link', 'date']
rows_to_drop = df[(df['headline'] == '') | (df['short_description'] == '') | (df['category'] == '')].index

filtered_df = df.drop(columns=columns_to_drop)
filtered_df.drop(rows_to_drop, inplace=True)
# drop rows with duplicate values for short_description and headline
print(filtered_df.duplicated(subset=['short_description','headline']).sum())
filtered_df.drop_duplicates(subset=['short_description','headline'],keep='last',inplace=True)

388


In [None]:
print(f'Shape of filtered DataFrame: {filtered_df.shape}')
print(f'Number of rows dropped: {len(df)-len(filtered_df)} rows out of {len(df)} dropped')
print(f'Percentage of rows dropped: {((len(df)-len(filtered_df))/len(df)*100):.2f}%') 

Shape of filtered DataFrame: (189426, 3)
Number of rows dropped: 20101 rows out of 209527 dropped
Percentage of rows dropped: 9.59%


In [None]:
# Function to merge 'headline' and 'short_description' with " : " separator
def merge_columns(row):
    if pd.isna(row['short_description']):
        return row['headline']
    else:
        return f"{row['headline']}: {row['short_description']}"

# Apply the function to create a new column
filtered_df['news'] = filtered_df.apply(merge_columns, axis=1)

In [None]:
filtered_df = filtered_df[['news', 'category']]
filtered_df.head()

Unnamed: 0,news,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [None]:
# determine which categories should be dropped, after a lot of consideration and inspection
categories_to_drop = [
    'IMPACT',
    'RELIGION', 
    'MEDIA',
    'SCIENCE', 
    'CRIME',
    'HOME & LIVING', 
    'WOMEN', 
    'WEIRD NEWS', 
    'FIFTY', 
    'GOOD NEWS', 
    'ARTS & CULTURE',
    'DIVORCE',
    'WEDDINGS',
    'QUEER VOICES',
    'BLACK VOICES',
    'LATINO VOICES',
    'U.S. NEWS',
    'COLLEGE',
    'EDUCATION',
]

# merging remaining categories together if they are similar in content
category_mapping1 = {
    'POLITICS': 'POLITICS',
    'WELLNESS': 'WELLNESS',
    'HEALTHY LIVING': 'WELLNESS',
    'ENTERTAINMENT': 'ART & ENTERTAINMENT',
    'COMEDY': 'ART & ENTERTAINMENT',
    'ARTS': 'ART & ENTERTAINMENT',
    'CULTURE & ARTS': 'ART & ENTERTAINMENT',
    'TRAVEL': 'TRAVEL',
    'STYLE & BEAUTY': 'STYLE & BEAUTY',
    'STYLE': 'STYLE & BEAUTY',
    'PARENTING': 'PARENTING',
    'PARENTS': 'PARENTING',
    'FOOD & DRINK': 'GASTRONOMY',
    'TASTE': 'GASTRONOMY',
    'BUSINESS': 'BUSINESS & TECH',
    'MONEY': 'BUSINESS & TECH',
    'TECH': 'BUSINESS & TECH',
    'SPORTS': 'SPORTS',
    'THE WORLDPOST': 'WORLD NEWS',
    'WORLD NEWS': 'WORLD NEWS',
    'WORLDPOST': 'WORLD NEWS',
    'GREEN': 'ENVIRONMENT',
    'ENVIRONMENT': 'ENVIRONMENT',
}

filtered_df = filtered_df[~filtered_df['category'].isin(categories_to_drop)]
filtered_df['category'] = filtered_df['category'].map(category_mapping1)

In [None]:
print(filtered_df.shape)
print("Unique categories:",filtered_df['category'].nunique())
print(filtered_df['category'].unique())
filtered_df['category'].value_counts()

(143201, 2)
Unique categories: 11
['ART & ENTERTAINMENT' 'PARENTING' 'WORLD NEWS' 'BUSINESS & TECH' 'SPORTS'
 'POLITICS' 'ENVIRONMENT' 'WELLNESS' 'STYLE & BEAUTY' 'GASTRONOMY'
 'TRAVEL']


POLITICS               32425
WELLNESS               23202
ART & ENTERTAINMENT    21339
PARENTING              12278
STYLE & BEAUTY         11229
TRAVEL                  9418
BUSINESS & TECH         8939
GASTRONOMY              8271
WORLD NEWS              8201
SPORTS                  4414
ENVIRONMENT             3485
Name: category, dtype: int64

In [None]:
print(f'Percentage of rows dropped: {((len(df)-len(filtered_df))/len(df)*100):.2f}%') 

Percentage of rows dropped: 31.66%


In [None]:
# Exporting as csv:
filtered_df.to_csv('/Users/arthur/Downloads/news.csv', index=False)

## NLTK (performs worse so use spacy)

In [2]:
df = filtered_df.copy()

In [3]:
# lowercase
df['news'] = df['news'].str.lower()

# Remove punctuation
df['news'] = df['news'].apply(lambda text: re.sub(r'[^\w\s]', '', text))

# Tokenization
df['news'] = df['news'].apply(word_tokenize)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['news'] = df['news'].apply(lambda text: [word for word in text if word not in stop_words])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['news'] = df['news'].apply(lambda text: [lemmatizer.lemmatize(word) for word in text])

# join tokens back to string format for vectorization
df['news'] = df['news'].apply(lambda text: ' '.join(text))

print(df)

                                                     news             category
0       23 funniest tweet cat dog week sept 1723 dog d...  ART & ENTERTAINMENT
1       funniest tweet parent week sept 1723 accidenta...            PARENTING
2       puerto ricans desperate water hurricane fionas...           WORLD NEWS
3       new documentary capture complexity child immig...  ART & ENTERTAINMENT
4       biden un call russian war affront body charter...           WORLD NEWS
...                                                   ...                  ...
143196  rim ceo thorsten heins significant plan blackb...      BUSINESS & TECH
143197  maria sharapova stunned victoria azarenka aust...               SPORTS
143198  giant patriot jet colt among improbable super ...               SPORTS
143199  aldon smith arrested 49ers linebacker busted d...               SPORTS
143200  dwight howard rip teammate magic loss hornet f...               SPORTS

[143201 rows x 2 columns]


In [4]:
# Using CountVectorizer
vectorizer = CountVectorizer()
X_count = vectorizer.fit_transform(df['news'])
y = df['category']

# Using TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['news'])

### CountVectorizer

In [5]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X_count, y, test_size=0.2, random_state=42)

# train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# predict labels
y_pred = model.predict(X_test)

# evaluate
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7749729408889354
Classification Report:
                     precision    recall  f1-score   support

ART & ENTERTAINMENT       0.75      0.78      0.76      4320
    BUSINESS & TECH       0.64      0.60      0.62      1705
        ENVIRONMENT       0.61      0.49      0.55       707
         GASTRONOMY       0.81      0.81      0.81      1643
          PARENTING       0.75      0.72      0.74      2479
           POLITICS       0.83      0.86      0.84      6602
             SPORTS       0.77      0.73      0.75       848
     STYLE & BEAUTY       0.85      0.81      0.83      2240
             TRAVEL       0.79      0.75      0.77      1897
           WELLNESS       0.77      0.80      0.79      4599
         WORLD NEWS       0.73      0.68      0.70      1601

           accuracy                           0.77     28641
          macro avg       0.75      0.73      0.74     28641
       weighted avg       0.77      0.77      0.77     28641



### Tfidf Vectorizer

In [6]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# predict labels
y_pred = model.predict(X_test)

# print metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7847142208721762
Classification Report:
                     precision    recall  f1-score   support

ART & ENTERTAINMENT       0.72      0.81      0.76      4320
    BUSINESS & TECH       0.70      0.59      0.64      1705
        ENVIRONMENT       0.70      0.43      0.53       707
         GASTRONOMY       0.83      0.80      0.82      1643
          PARENTING       0.80      0.73      0.76      2479
           POLITICS       0.83      0.88      0.85      6602
             SPORTS       0.79      0.65      0.71       848
     STYLE & BEAUTY       0.86      0.80      0.83      2240
             TRAVEL       0.80      0.76      0.78      1897
           WELLNESS       0.76      0.84      0.80      4599
         WORLD NEWS       0.77      0.68      0.72      1601

           accuracy                           0.78     28641
          macro avg       0.78      0.72      0.75     28641
       weighted avg       0.78      0.78      0.78     28641



## Spacy

In [7]:
df = filtered_df.copy()

In [8]:
# function to preprocess text with spaCy
def preprocess_text(text):
    doc = nlp(text.lower())  # Convert to lowercase and process with spaCy
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)

# apply preprocessing to df
df['news'] = df['news'].apply(preprocess_text)

In [9]:
print(df)

                                                     news             category
0       23 funniest tweet cat dog week sept 17 23 dog ...  ART & ENTERTAINMENT
1       funniest tweet parent week sept 17 23 accident...            PARENTING
2       puerto rican desperate water hurricane fiona r...           WORLD NEWS
3       new documentary capture complexity child immig...  ART & ENTERTAINMENT
4       biden un russian war affront body charter whit...           WORLD NEWS
...                                                   ...                  ...
143196  rim ceo thorsten hein significant plan blackbe...      BUSINESS & TECH
143197  maria sharapova stun victoria azarenka austral...               SPORTS
143198  giant patriot jet colt   improbable super bowl...               SPORTS
143199  aldon smith arrest 49er linebacker bust dui co...               SPORTS
143200  dwight howard rips teammate magic loss hornet ...               SPORTS

[143201 rows x 2 columns]


In [10]:
# Using CountVectorizer
vectorizer = CountVectorizer()
X_count = vectorizer.fit_transform(df['news'])
y = df['category']

# Using TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['news'])

### CountVectorizer

In [11]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_count, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7787437589469641
Classification Report:
                     precision    recall  f1-score   support

ART & ENTERTAINMENT       0.75      0.78      0.77      4320
    BUSINESS & TECH       0.64      0.62      0.63      1705
        ENVIRONMENT       0.62      0.50      0.55       707
         GASTRONOMY       0.80      0.81      0.81      1643
          PARENTING       0.76      0.72      0.74      2479
           POLITICS       0.83      0.86      0.85      6602
             SPORTS       0.76      0.72      0.74       848
     STYLE & BEAUTY       0.85      0.81      0.83      2240
             TRAVEL       0.79      0.75      0.77      1897
           WELLNESS       0.77      0.81      0.79      4599
         WORLD NEWS       0.74      0.69      0.72      1601

           accuracy                           0.78     28641
          macro avg       0.76      0.74      0.75     28641
       weighted avg       0.78      0.78      0.78     28641



### Tfidf Vectorizer

In [28]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.7881358891100171
Classification Report:
                     precision    recall  f1-score   support

ART & ENTERTAINMENT       0.74      0.80      0.77      4320
    BUSINESS & TECH       0.70      0.61      0.65      1705
        ENVIRONMENT       0.68      0.44      0.54       707
         GASTRONOMY       0.82      0.82      0.82      1643
          PARENTING       0.79      0.72      0.75      2479
           POLITICS       0.83      0.88      0.86      6602
             SPORTS       0.79      0.67      0.73       848
     STYLE & BEAUTY       0.87      0.81      0.83      2240
             TRAVEL       0.80      0.77      0.78      1897
           WELLNESS       0.76      0.85      0.80      4599
         WORLD NEWS       0.78      0.69      0.73      1601

           accuracy                           0.79     28641
          macro avg       0.78      0.73      0.75     28641
       weighted avg       0.79      0.79      0.79     28641



### GridSearch

In [13]:
"""# Labels (categories)
labels = df['category']

# split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# Define the logistic regression model
model = LogisticRegression(max_iter=1000)

# set parameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 500, 1000, 2000]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=5, n_jobs = -1)

# start GriedSearch
grid_search.fit(X_train, y_train)

# save best model
best_model = grid_search.best_estimator_

# predict on the test set with the best model
y_pred = best_model.predict(X_test)

# print metrics
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy:.2f}')"""

"# Labels (categories)\nlabels = df['category']\n\n# Split the data\nX_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)\n\n# Define the logistic regression model\nmodel = LogisticRegression(max_iter=1000)\n\n# Define the hyperparameters grid\nparam_grid = {\n    'C': [1],\n    'penalty': ['l2'],\n    'solver': ['liblinear', 'lbfgs'],\n    'max_iter': [100, 500, 1000, 2000]\n}\n\n# Initialize GridSearchCV\ngrid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=5, n_jobs = -1)\n\n# Fit GridSearchCV\ngrid_search.fit(X_train, y_train)\n\n# Best model\nbest_model = grid_search.best_estimator_\n\n# Predict on the test set with the best model\ny_pred = best_model.predict(X_test)\n\n# Evaluate the model\naccuracy = accuracy_score(y_test, y_pred)\nprint(f'Best Parameters: {grid_search.best_params_}')\nprint(f'Accuracy: {accuracy:.2f}')"

### Multinomial Naive Bayes with GridSearch

In [26]:
# set up Multinomila Naive Bayes
model = MultinomialNB()

# set parameter grid (not much to tune)
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'fit_prior': [True, False],
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

# start the GridSearch
grid_search.fit(X_train, y_train)

# save best model
best_model = grid_search.best_estimator_

# predict on the test set with the best model
y_pred = best_model.predict(X_test)

# print metrics
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy}')

# print the classification report
report = classification_report(y_test, y_pred)
print(report)

Best Parameters: {'alpha': 0.1, 'fit_prior': False}
Accuracy: 0.7703990782444747
                     precision    recall  f1-score   support

ART & ENTERTAINMENT       0.80      0.73      0.76      4320
    BUSINESS & TECH       0.61      0.68      0.64      1705
        ENVIRONMENT       0.54      0.62      0.57       707
         GASTRONOMY       0.76      0.85      0.80      1643
          PARENTING       0.68      0.71      0.70      2479
           POLITICS       0.88      0.80      0.84      6602
             SPORTS       0.71      0.84      0.77       848
     STYLE & BEAUTY       0.82      0.82      0.82      2240
             TRAVEL       0.73      0.79      0.76      1897
           WELLNESS       0.81      0.77      0.79      4599
         WORLD NEWS       0.67      0.79      0.73      1601

           accuracy                           0.77     28641
          macro avg       0.73      0.76      0.74     28641
       weighted avg       0.78      0.77      0.77     28641

