In [1]:
import pandas as pd  

In [2]:
path = 'NewsCategorizer.csv'
# the converters argument will let Python read the ability column as a list, not a string
news = pd.read_csv(path)

In [46]:
from stop_words import get_stop_words

def headline_features(headline):
    # stopwords list is all lowercase so we need to match
    headline = headline.lower()
    words = headline.split(' ')
    keywords = [w for w in words if w not in get_stop_words('english')]
    while len(keywords) < 3:
        keywords.append('None')
    return {'first_keyword': keywords[0], 'second_keyword': keywords[1], 'third_keyword': keywords[2]}

headline_features('High Tech Works When It Enables High Touch')

{'first_keyword': 'high', 'second_keyword': 'tech', 'third_keyword': 'works'}

In [47]:
# this dataset happens to already contain key words, but for new data (ie news not from this dataset) we'll most likely
# have a headline and not a list of keywords
news[:10]

Unnamed: 0,category,headline,links,short_description,keywords
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods
5,WELLNESS,Bad Love Advice: 9 Lessons To Unlearn,https://www.huffingtonpost.com/entry/bad-love-...,"By Carey Moss for YouBeauty.com Love rom-coms,...",bad-love-advice-from-movies
6,WELLNESS,The Happiest (And Unhappiest) States In The U.S.,https://www.huffingtonpost.com/entry/happiest-...,The nation in general scored a 66.2 in 2011 on...,happiest-state-well-being-united-states-gallup
7,WELLNESS,Seaweed: The Green Superfood You're Not Eating...,https://www.huffingtonpost.com/entry/superfood...,It's also worth remembering that if the water ...,superfood-seaweed-health-benefits
8,WELLNESS,Addicted to Food?,https://www.huffingtonpost.com/entry/food-addi...,"If you look at our culture's eating behavior, ...",food-addiction
9,WELLNESS,High Tech Works When It Enables High Touch,https://www.huffingtonpost.com/entry/high-tech...,"François-Marie Arouet, 18th century French aut...",high-tech-works-when-it-e


In [76]:
# 10 categories, so our chances of guessing a headline correctly purely by chance is 10% <-- we want to do better!
news.category.unique()

array(['WELLNESS', 'POLITICS', 'ENTERTAINMENT', 'TRAVEL',
       'STYLE & BEAUTY', 'PARENTING', 'FOOD & DRINK', 'WORLD NEWS',
       'BUSINESS', 'SPORTS'], dtype=object)

In [60]:
zipped_features = zip(news.headline, news.category)

In [49]:
featuresets = [(headline_features(headline), category) for headline, category in zipped_features ]
featuresets

[({'first_keyword': '143', 'second_keyword': 'miles', 'third_keyword': '35'},
  'WELLNESS'),
 ({'first_keyword': 'talking',
   'second_keyword': 'yourself:',
   'third_keyword': 'crazy'},
  'WELLNESS'),
 ({'first_keyword': 'crenezumab:',
   'second_keyword': 'trial',
   'third_keyword': 'will'},
  'WELLNESS'),
 ({'first_keyword': 'oh,',
   'second_keyword': 'difference',
   'third_keyword': 'made'},
  'WELLNESS'),
 ({'first_keyword': 'green',
   'second_keyword': 'superfoods',
   'third_keyword': 'None'},
  'WELLNESS'),
 ({'first_keyword': 'bad',
   'second_keyword': 'love',
   'third_keyword': 'advice:'},
  'WELLNESS'),
 ({'first_keyword': 'happiest',
   'second_keyword': '(and',
   'third_keyword': 'unhappiest)'},
  'WELLNESS'),
 ({'first_keyword': 'seaweed:',
   'second_keyword': 'green',
   'third_keyword': 'superfood'},
  'WELLNESS'),
 ({'first_keyword': 'addicted',
   'second_keyword': 'food?',
   'third_keyword': 'None'},
  'WELLNESS'),
 ({'first_keyword': 'high',
   'second_key

In [50]:
import random
random.shuffle(featuresets)

In [51]:
import nltk
import math

print(len(featuresets))
split_num = math.floor(len(featuresets)*.8)
print(split_num)

# split feature sets into training and test sets (here we'll try 80% train, 20% test)
train_set, test_set = featuresets[:split_num], featuresets[split_num+1:]

50000
40000


In [52]:
# build a classifier based on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [53]:
classifier.classify(headline_features("Clevery Warns Tory Rebels: Don't Dump Truss"))

'WORLD NEWS'

In [54]:
classifier.classify(headline_features("Journalist’s Takedown Of Government Excuses Is Brilliant"))

'WORLD NEWS'

In [55]:
classifier.classify(headline_features("Do You Get More Anxious Or Sad In Autumn? There's A Reason For That"))

'PARENTING'

In [56]:
print(nltk.classify.accuracy(classifier, test_set))

0.6075607560756076


In [57]:
classifier.show_most_informative_features(12)

Most Informative Features
           first_keyword = 'best'         FOOD & : POLITI =    101.1 : 1.0
           first_keyword = 'recipe'       FOOD & : WELLNE =     90.8 : 1.0
          second_keyword = 'day:'         FOOD & : TRAVEL =     90.1 : 1.0
          second_keyword = 'recipes'      FOOD & : TRAVEL =     74.0 : 1.0
           third_keyword = 'style'        STYLE  : WELLNE =     72.6 : 1.0
           first_keyword = 'nfl'          SPORTS : WELLNE =     66.7 : 1.0
          second_keyword = 'travel'       TRAVEL : BUSINE =     66.3 : 1.0
           third_keyword = 'fashion'      STYLE  : WORLD  =     60.2 : 1.0
          second_keyword = 'business'     BUSINE : STYLE  =     56.8 : 1.0
           first_keyword = 'make'         FOOD & : BUSINE =     55.9 : 1.0
          second_keyword = 'ways'         WELLNE : SPORTS =     55.4 : 1.0
          second_keyword = 'best'         TRAVEL : WORLD  =     55.3 : 1.0


In [64]:
# let's improve this, let's try adding more keywords to our classifier, and remove number
# and using nltk stopwords
from nltk.corpus import stopwords

def headline_features2(headline):
    headline = headline.lower()
    words = headline.split(' ')
    s_words = stopwords.words('english')
    keywords = [w for w in words if w not in s_words and not w.isdigit()]
    while len(keywords) < 5:
        keywords.append('None')
    return {'first_keyword': keywords[0], 'second_keyword': keywords[1], 'third_keyword': keywords[2], 'fourth_keyword': keywords[3], 'fifth_keyword': keywords[4]}

headline_features2('High Tech Works When It Enables High Touch')

{'first_keyword': 'high',
 'second_keyword': 'tech',
 'third_keyword': 'works',
 'fourth_keyword': 'enables',
 'fifth_keyword': 'high'}

In [65]:
zipped_features = zip(news.headline, news.category)
featuresets = [(headline_features2(headline), category) for headline, category in zipped_features ]
featuresets

[({'first_keyword': 'miles',
   'second_keyword': 'days:',
   'third_keyword': 'lessons',
   'fourth_keyword': 'learned',
   'fifth_keyword': 'None'},
  'WELLNESS'),
 ({'first_keyword': 'talking',
   'second_keyword': 'yourself:',
   'third_keyword': 'crazy',
   'fourth_keyword': 'crazy',
   'fifth_keyword': 'helpful?'},
  'WELLNESS'),
 ({'first_keyword': 'crenezumab:',
   'second_keyword': 'trial',
   'third_keyword': 'gauge',
   'fourth_keyword': 'whether',
   'fifth_keyword': "alzheimer's"},
  'WELLNESS'),
 ({'first_keyword': 'oh,',
   'second_keyword': 'difference',
   'third_keyword': 'made',
   'fourth_keyword': 'None',
   'fifth_keyword': 'None'},
  'WELLNESS'),
 ({'first_keyword': 'green',
   'second_keyword': 'superfoods',
   'third_keyword': 'None',
   'fourth_keyword': 'None',
   'fifth_keyword': 'None'},
  'WELLNESS'),
 ({'first_keyword': 'bad',
   'second_keyword': 'love',
   'third_keyword': 'advice:',
   'fourth_keyword': 'lessons',
   'fifth_keyword': 'unlearn'},
  'WEL

In [66]:
import random

random.shuffle(featuresets)

In [67]:
import nltk
import math

split_num = math.floor(len(featuresets)*.8)

# split feature sets into training and test sets (here we'll try 80% train, 20% test)
train_set, test_set = featuresets[:split_num], featuresets[split_num+1:]

In [68]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [80]:
classifier.classify(headline_features("Alex Jones Will Likely Be Broke 'For The Rest Of His Life,' Ex-Prosecutor Says"))

'SPORTS'

In [70]:
print(nltk.classify.accuracy(classifier, test_set))

0.6576657665766577


In [71]:
classifier.show_most_informative_features(12)

Most Informative Features
           fifth_keyword = '(photos)'     TRAVEL : ENTERT =    121.6 : 1.0
           third_keyword = 'recipes'      FOOD & : WELLNE =     84.5 : 1.0
           first_keyword = 'recipe'       FOOD & : BUSINE =     83.2 : 1.0
          second_keyword = 'day:'         FOOD & : ENTERT =     83.0 : 1.0
          fourth_keyword = '(photos)'     TRAVEL : PARENT =     79.9 : 1.0
          second_keyword = 'recipes'      FOOD & : STYLE  =     73.6 : 1.0
           third_keyword = 'style'        STYLE  : WELLNE =     71.9 : 1.0
          second_keyword = 'travel'       TRAVEL : WORLD  =     68.7 : 1.0
           first_keyword = 'nfl'          SPORTS : WELLNE =     64.2 : 1.0
           third_keyword = 'fashion'      STYLE  : WORLD  =     59.1 : 1.0
          second_keyword = 'business'     BUSINE : WORLD  =     56.0 : 1.0
           first_keyword = 'ways'         WELLNE : WORLD  =     54.3 : 1.0


In [None]:
# how else can we improve our classifier?