In [3]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer(ngram_range=(1, 2)) # params: ngram_range minimum, maximum ngrams
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [4]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [5]:
import spacy

# Load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm")

In [7]:
def preprocess(text):
  doc = nlp(text)

  filtered_tokens = []
  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    filtered_tokens.append(token.lemma_)
  return " ".join(filtered_tokens)

preprocess("Loki is eating pizza")

'Loki eat pizza'

In [8]:
corpus_processed = [
    preprocess(text) for text in corpus
]

corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [9]:
v = CountVectorizer(ngram_range=(1, 2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [11]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

In [12]:
v.transform(["Rod eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]])

In [13]:
from google.colab import drive
drive.mount("/gdrive")

Mounted at /gdrive


In [16]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/kaggle"
%cd "/content/drive/MyDrive/kaggle"

/content/drive/MyDrive/kaggle


In [17]:
!kaggle datasets download -d rmisra/news-category-dataset

Downloading news-category-dataset.zip to /content/drive/MyDrive/kaggle
 68% 18.0M/26.5M [00:00<00:00, 94.4MB/s]
100% 26.5M/26.5M [00:00<00:00, 108MB/s] 


In [18]:
!unzip \*.zip && rm *.zip

Archive:  news-category-dataset.zip
  inflating: News_Category_Dataset_v3.json  


In [20]:
import pandas as pd

df = pd.read_json("/content/drive/MyDrive/kaggle/News_Category_Dataset_v3.json", lines=True)

df.shape

(209527, 6)

In [21]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [23]:
df.category.value_counts()

POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [25]:
min_samples = 1014
categories = df.category.value_counts().index.tolist()
dfs_by_category = {}  # Dictionary to store dataframes by category

for category in categories:
    df_category = df[df.category == category].sample(min_samples, random_state=2022)

    dfs_by_category[category] = df_category

concatenated_df = pd.concat(dfs_by_category.values(), ignore_index=True)


In [28]:
concatenated_df.category.value_counts()

POLITICS          1014
MONEY             1014
WEIRD NEWS        1014
GREEN             1014
WORLDPOST         1014
RELIGION          1014
STYLE             1014
SCIENCE           1014
TECH              1014
TASTE             1014
ARTS              1014
WELLNESS          1014
ENVIRONMENT       1014
FIFTY             1014
GOOD NEWS         1014
U.S. NEWS         1014
ARTS & CULTURE    1014
COLLEGE           1014
LATINO VOICES     1014
CULTURE & ARTS    1014
MEDIA             1014
WORLD NEWS        1014
DIVORCE           1014
IMPACT            1014
ENTERTAINMENT     1014
TRAVEL            1014
STYLE & BEAUTY    1014
PARENTING         1014
HEALTHY LIVING    1014
QUEER VOICES      1014
FOOD & DRINK      1014
BUSINESS          1014
COMEDY            1014
SPORTS            1014
BLACK VOICES      1014
HOME & LIVING     1014
PARENTS           1014
THE WORLDPOST     1014
WEDDINGS          1014
WOMEN             1014
CRIME             1014
EDUCATION         1014
Name: category, dtype: int64

In [32]:
from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is your DataFrame and 'category' is the column to encode
label_encoder = LabelEncoder()

# Fit and transform the 'category' column into numerical labels
concatenated_df['category_encoded'] = label_encoder.fit_transform(concatenated_df['category'])

# Display the updated DataFrame with the encoded column
print(concatenated_df)


                                                    link  \
0      https://www.huffingtonpost.com/entry/trump-hea...   
1      https://www.huffingtonpost.com/entry/dreamers-...   
2      https://www.huffingtonpost.com/entry/democrats...   
3      https://www.huffingtonpost.com/entry/jcc-lette...   
4      https://www.huffingtonpost.com/entry/donald-tr...   
...                                                  ...   
42583  https://www.huffingtonpost.com/entry/the-globa...   
42584  https://www.huffingtonpost.com/entry/reflectio...   
42585  https://www.huffingtonpost.com/entry/for-the-f...   
42586  https://www.huffingtonpost.com/entry/charter-s...   
42587  https://www.huffingtonpost.com/entry/3-tips-fo...   

                                                headline   category  \
0      The Coverage Of Trump’s Big Dumb Body Is Fat W...   POLITICS   
1           Dreamers Are People, Not Political Footballs   POLITICS   
2      Democrats Must Elect Bernie Sanders Senate Min...   POLITIC

In [35]:
concatenated_df.head(-5)

Unnamed: 0,link,headline,category,short_description,authors,date,category_encoded
0,https://www.huffingtonpost.com/entry/trump-hea...,The Coverage Of Trump’s Big Dumb Body Is Fat W...,POLITICS,"The president, it evidently needs to be said, ...",Travis Waldron,2018-01-18,24
1,https://www.huffingtonpost.com/entry/dreamers-...,"Dreamers Are People, Not Political Footballs",POLITICS,People should not be reduced to pawns used by ...,"Center for Community Change Action, Contributo...",2017-09-09,24
2,https://www.huffingtonpost.com/entry/democrats...,Democrats Must Elect Bernie Sanders Senate Min...,POLITICS,The Senate will meet this Wednesday to elect i...,"Linda Milazzo, ContributorParticipatory journa...",2016-11-13,24
3,https://www.huffingtonpost.com/entry/jcc-lette...,Jewish Leaders Frustrated By Lack Of Progress ...,POLITICS,"In a letter to Attorney General Jeff Sessions,...",Matt Ferner,2017-03-08,24
4,https://www.huffingtonpost.com/entry/donald-tr...,Donald Trump Roasted For Painfully Awkward Att...,POLITICS,"Well, that didn't seem to go as planned.",Rebecca Shapiro,2018-04-24,24
...,...,...,...,...,...,...,...
42578,https://www.huffingtonpost.com/entry/kentucky-...,Teachers Swarm Kentucky Capitol To Protest Pen...,EDUCATION,"“We feel demoralized, we feel assaulted, and w...",Travis Waldron,2018-04-02,9
42579,https://www.huffingtonpost.com/entry/parkland-...,Parkland School District Votes Against Arming ...,EDUCATION,"""I have not met one teacher or one student who...",Hayley Miller,2018-04-11,9
42580,https://www.huffingtonpost.com/entry/creating-...,Creating a Learning Environment With MOOCs,EDUCATION,"The question, in the end, is not whether or no...","Christine Nasserghodsi, ContributorDirector of...",2014-08-16,9
42581,https://www.huffingtonpost.com/entry/my-classr...,"My Classroom, Not My Island",EDUCATION,"As a woman of color and a life-long Chicagoan,...","TNTP, ContributorReimagine teaching",2014-05-27,9


In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    concatenated_df.short_description,
    concatenated_df.category_encoded,
    test_size=0.2,
    random_state=42,
    stratify=concatenated_df.category_encoded
)

In [44]:
print(X_train.shape)
X_train.head(5)

(34070,)


5080     A truth and a blessing. Enjoy a sampling of th...
20742    Far too much of the research discussed in the ...
12712    Flip through our many other Style Evolutions. ...
10509    When you hear that Justin Timberlake's going t...
35652                                                     
Name: short_description, dtype: object

In [45]:
print(y_test.shape)
y_test.head(5)

(8518,)


2549     10
29124    27
8605     13
18998     6
26407    26
Name: category_encoded, dtype: int64

In [46]:
y_test.value_counts()

10    203
24    203
14    203
34    203
0     203
2     203
15    203
33    203
1     203
32    203
9     203
22    203
23    203
17    203
31    203
30    203
7     203
40    203
27    203
18    203
36    203
21    203
6     203
38    203
37    203
12    203
28    203
41    203
11    203
5     203
20    203
3     203
16    203
35    203
29    202
39    202
19    202
8     202
4     202
26    202
13    202
25    202
Name: category_encoded, dtype: int64

In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("Multi NB", MultinomialNB()),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.21      0.08      0.11       203
           1       0.00      0.00      0.00       203
           2       0.41      0.09      0.15       203
           3       0.16      0.07      0.10       203
           4       0.07      0.35      0.11       202
           5       0.58      0.05      0.10       203
           6       0.52      0.14      0.22       203
           7       0.35      0.43      0.39       203
           8       0.38      0.47      0.42       202
           9       0.30      0.53      0.38       203
          10       0.39      0.04      0.08       203
          11       0.30      0.40      0.34       203
          12       0.07      0.63      0.13       203
          13       0.41      0.15      0.22       202
          14       0.00      0.00      0.00       203
          15       0.15      0.02      0.03       203
          16       0.25      0.01      0.03       203
          17       0.61    

In [49]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ("vectorizer_bow", CountVectorizer()),
    ("Multi NB", MultinomialNB()),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.26      0.14      0.18       203
           1       0.29      0.01      0.02       203
           2       0.36      0.12      0.18       203
           3       0.22      0.11      0.14       203
           4       0.07      0.37      0.12       202
           5       0.42      0.05      0.09       203
           6       0.49      0.21      0.30       203
           7       0.32      0.39      0.35       203
           8       0.37      0.46      0.41       202
           9       0.34      0.57      0.42       203
          10       0.37      0.05      0.09       203
          11       0.28      0.39      0.33       203
          12       0.08      0.57      0.14       203
          13       0.39      0.22      0.28       202
          14       0.17      0.00      0.01       203
          15       0.35      0.08      0.13       203
          16       0.20      0.02      0.04       203
          17       0.47    