Downloads the [news aggregator data](https://archive.ics.uci.edu/dataset/359/news+aggregator)
and generates a subsample of data (50K) that is more balanced among the 4 classes. Original dataset has about 422K rows.

In [1]:
import io
import requests
from zipfile import ZipFile
from pathlib import Path

In [2]:
import pandas as pd

In [3]:
ulr_zipfile = 'https://archive.ics.uci.edu/static/public/359/news+aggregator.zip'

response = requests.get(ulr_zipfile)

In [4]:
response.raise_for_status() # Check if the request was successful

In [5]:
type(response.content)

bytes

In [6]:
out_dir = Path('news_aggregator_dataset')
with ZipFile(io.BytesIO(response.content)) as zip_file:
    zip_file.extractall(out_dir)

In [7]:
csv_path = out_dir / 'newsCorpora.csv'

col_names = ['id', 'title', 'url', 'publisher',
             'category', 'story', 'hostname', 'timestamp']

df = pd.read_csv(csv_path, sep='\t', names=col_names)
df = df[['title', 'category']]
df.shape

(422419, 2)

In [8]:
df.sample(10)

Unnamed: 0,title,category
334533,Potential SummerSlam Spoiler Removed from Amaz...,b
309266,The Buzz about Bees,t
302344,Chrysler Probed by NHTSA Over Ignitions in Jee...,t
389977,Puerto Rico declares chikungunya epidemic,m
68461,Possible To Use Office For iPad Without Office...,t
414714,"Jurassic Park 1993 Is Trending Online, Here's Why",e
287041,Kanye West's 'New Testament' Documentary Surfa...,e
186186,"Apps: Google Maps 3.0, Intake, Republique…",t
180411,"Adidas Profit Misses Estimates, Shares Have Fa...",b
416164,The best Irish Ice Bucket Challenge video yet,e


In [9]:
df['category'].value_counts(normalize=True).round(2)

Unnamed: 0_level_0,proportion
category,Unnamed: 1_level_1
e,0.36
b,0.27
t,0.26
m,0.11


In [10]:
df.query('category == "m"').sample(5)

Unnamed: 0,title,category
400052,Surgeon General's 'Call to Action' To Prevent ...,m
179691,Blood of Young Mice Could Be Key to Reverse Aging,m
99782,Obama Nominates Budget Director To Replace Seb...,m
378971,Do your friends feel like family? Science is b...,m
158314,Labor Department cuts levels of allowable coal...,m


In [11]:
df['word_cnt'] = df['title'].apply(lambda txt: len(txt.split()))

df['word_cnt'].value_counts().sort_values(ascending=False).head(10)

Unnamed: 0_level_0,count
word_cnt,Unnamed: 1_level_1
9,58184
8,57890
10,52096
7,49180
11,44528
12,36386
6,34865
13,26363
5,19805
14,16638


In [12]:
df['word_cnt'].describe().round(1)

Unnamed: 0,word_cnt
count,422419.0
mean,9.3
std,6.4
min,1.0
25%,7.0
50%,9.0
75%,11.0
max,2045.0


In [13]:
df.query('word_cnt == 20')

Unnamed: 0,title,category,word_cnt
79285,Do we have the time - or the stomach - to eat ...,m,20
119035,'I did not kill my wife - I am not a murderer'...,e,20
137002,"For the first time, fans can bid on a visit to...",e,20
273480,"If you have an iPhone, iPad or a Mac, Apple ju...",t,20
358897,One glass of wine or a beer at the age of 14 c...,m,20
385758,Dawn of the Planet of the Apes and Rise of the...,e,20


In [14]:
df.query('word_cnt > 100')

Unnamed: 0,title,category,word_cnt
69473,Love & Hip-Hop' Star Benzino Shot By Nephew En...,e,1364
93797,Peaches has died. We are beyond pain.\thttp://...,e,2045
110870,Hunger Games' top winner at MTV Movie Awards\t...,e,727
112847,"A few years ago, scientists calculated that be...",t,1837
194882,I never thought I'd be in love” says Angelina ...,e,1816
210277,The Best Reactions To The Supposed Video of So...,e,201
279114,The Fault In Our Stars' to release in India on...,e,287
391520,"More Japanese Xbox One Games In Development, A...",t,1030


In [15]:
df.query('5 < word_cnt < 14').shape

(359492, 3)

In [16]:
df.query('5 < word_cnt < 14')['category'].value_counts(normalize=True).round(2)

Unnamed: 0_level_0,proportion
category,Unnamed: 1_level_1
e,0.35
b,0.28
t,0.26
m,0.11


In [17]:
foo = df.query('5 < word_cnt < 14')['category'].value_counts(normalize=True).round(2)
weights = foo.to_dict()
weights

{'e': 0.35, 'b': 0.28, 't': 0.26, 'm': 0.11}

In [18]:
inverse_weights = {k: int(1/v) for k, v in weights.items()}
total = sum(inverse_weights.values())

inverse_weights, total

({'e': 2, 'b': 3, 't': 3, 'm': 9}, 17)

In [19]:
normalized_weights = {k: round(v/total, 2) for k, v in inverse_weights.items()}
normalized_weights

{'e': 0.12, 'b': 0.18, 't': 0.18, 'm': 0.53}

In [20]:
dx = df.query('5 < word_cnt < 14')

dx['weights'] = dx['category'].map(normalized_weights)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dx['weights'] = dx['category'].map(normalized_weights)


In [21]:
dx.sample(10)

Unnamed: 0,title,category,word_cnt,weights
74615,Benzino Shot During Mother's Funeral Processio...,e,9,0.12
151360,Networks cry foul in TV Internet case,e,7,0.12
303952,Björk Is Getting Her Own Massive Art Exhibition,e,8,0.12
62453,Bad Words misspells 'protagonist' with a less-...,e,8,0.12
100440,'Jet lag app' to help travellers adjust body c...,m,9,0.53
91513,Japan Pharma Takeda To Fight $6B US Jury Damag...,b,10,0.18
59763,"Ebola kills 66 in Guinea, with 5 cases in Conakry",m,10,0.53
619,Central Europe Turns to US for Natural Gas,b,8,0.18
313237,Lana Del Rey responds to Kurt Cobain controversy,e,8,0.12
238899,"T-Mobile Beats AT&T, Verizon to VoLTE",t,6,0.18


In [22]:
dx['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
e,126166
b,100728
t,93029
m,39569


In [23]:
samples = dx.sample(50000, weights='weights', random_state=19)
samples['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
b,13253
m,13232
t,12097
e,11418


In [24]:
samples[['title', 'category']].to_csv('headlines_sample.csv', index=False)

In [25]:
data = pd.read_csv('headlines_sample.csv')
data.head(3)

Unnamed: 0,title,category
0,WHO: 7 milion dead due to air pollution,m
1,Diane Sawyer steps away in ABC anchor shuffle,e
2,US denies knowledge of 'Heartbleed' bug,t


In [26]:
data['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
b,13253
m,13232
t,12097
e,11418
