In [27]:
from pathlib import Path

import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split

In [28]:
# The dataset file holds one JSON record per line, hence lines=True.
df = pd.read_json(path_or_buf='News_Category_Dataset_v3.json', lines=True)

In [30]:
# Names of the seven most frequent categories, most common first.
df['category'].value_counts().head(7).index

Index(['POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY',
       'PARENTING', 'HEALTHY LIVING'],
      dtype='object')

In [31]:
# Every distinct category value present in the raw data.
df['category'].unique()

array(['U.S. NEWS', 'COMEDY', 'PARENTING', 'WORLD NEWS', 'CULTURE & ARTS',
       'TECH', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WEIRD NEWS',
       'ENVIRONMENT', 'EDUCATION', 'CRIME', 'SCIENCE', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'MEDIA',
       'QUEER VOICES', 'HOME & LIVING', 'WOMEN', 'BLACK VOICES', 'TRAVEL',
       'MONEY', 'RELIGION', 'LATINO VOICES', 'IMPACT', 'WEDDINGS',
       'COLLEGE', 'PARENTS', 'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE',
       'HEALTHY LIVING', 'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST',
       'FIFTY', 'ARTS', 'DIVORCE'], dtype=object)

In [32]:
# Categories kept for classification. The list is hard-coded (rather than derived
# from the data) so that the integer code assigned to each label stays stable.
categories = [
    'POLITICS',
    'WELLNESS',
    'ENTERTAINMENT',
    'TRAVEL',
    'STYLE & BEAUTY',
    'PARENTING',
    'HEALTHY LIVING',
]

# Keep only rows in the selected categories and renumber the index from 0.
df = df.loc[df['category'].isin(categories)].reset_index(drop=True)

# Two-way lookup between category names and integer class codes; a label's code
# is its position in `categories`.
label_to_code = {label: code for code, label in enumerate(categories)}
code_to_label = {code: label for label, code in label_to_code.items()}

label_to_code, code_to_label

({'POLITICS': 0,
  'WELLNESS': 1,
  'ENTERTAINMENT': 2,
  'TRAVEL': 3,
  'STYLE & BEAUTY': 4,
  'PARENTING': 5,
  'HEALTHY LIVING': 6},
 {0: 'POLITICS',
  1: 'WELLNESS',
  2: 'ENTERTAINMENT',
  3: 'TRAVEL',
  4: 'STYLE & BEAUTY',
  5: 'PARENTING',
  6: 'HEALTHY LIVING'})

In [33]:
# Encode each row's category name as its integer class code.
df = df.assign(label=df['category'].map(label_to_code))

In [8]:
# Character length of every headline, followed by how many headlines share each length.
df = df.assign(headline_length=df['headline'].str.len())
df['headline_length'].value_counts()

65     1818
67     1798
66     1740
68     1712
64     1712
       ... 
139       1
154       1
138       1
150       1
148       1
Name: headline_length, Length: 154, dtype: int64

In [34]:
# From here on only the model input (headline) and target (label) are needed.
df = df.loc[:, ['headline', 'label']]
df

Unnamed: 0,headline,label
0,The Funniest Tweets From Parents This Week (Se...,5
1,Golden Globes Returning To NBC In January Afte...,2
2,Biden Says U.S. Forces Would Defend Taiwan If ...,0
3,‘Beautiful And Sad At The Same Time’: Ukrainia...,0
4,James Cameron Says He 'Clashed' With Studio Be...,2
...,...,...
106103,This Is Only the Beginning: Surprising Advice ...,1
106104,Cheryl Tiegs In A Sauna: A Look Back,4
106105,Teen Responsibility Doesn't Happen -- It's Nur...,5
106106,"Sundance, Ice-T, and Shades of the American Ra...",2


In [35]:
# Hold out 30% of the rows, then cut that hold-out in half to obtain the dev and
# test sets. random_state is fixed so the partition is reproducible.
# NOTE(review): neither call passes `stratify`, so class proportions may differ
# between the splits -- confirm that this is acceptable.
train, dev_test = train_test_split(df, test_size=0.3, random_state=0)
dev, test = train_test_split(dev_test, test_size=0.5, random_state=0)

# The shuffled splits keep their original row labels; renumber each from 0.
train, dev, test = (
    split.reset_index(drop=True) for split in (train, dev, test)
)

In [41]:
# Write each split to data/<name>.json as a JSON array of row records.
output_dir = Path('data')
# DataFrame.to_json does not create missing parent directories, so make sure the
# target directory exists; otherwise a fresh checkout fails on this cell.
output_dir.mkdir(parents=True, exist_ok=True)

for split_name, split_df in (('train', train), ('dev', dev), ('test', test)):
    split_df.to_json(path_or_buf=output_dir / f'{split_name}.json', orient='records')