In [None]:
import pandas as pd  

In [None]:
path = 'NewsCategorizer.csv'
# Load the news dataset into a DataFrame. No `converters` argument is passed,
# so every column is parsed with pandas' default type inference.
news = pd.read_csv(path)

In [None]:
from stop_words import get_stop_words

def headline_features(headline):
    """Turn a headline into a feature dict of its first three keywords.

    The headline is lower-cased (the stop-word list is all lowercase) and
    split on whitespace; punctuation stays attached to the tokens. Tokens
    that appear in the English stop-word list are discarded.

    Parameters
    ----------
    headline : str
        Raw headline text.

    Returns
    -------
    dict
        Keys 'first_keyword', 'second_keyword' and 'third_keyword', holding
        the first three remaining tokens in order. Slots with no token are
        filled with the placeholder string 'None'.
    """
    # Fetch the stop-word list once per call and use a set for fast lookups,
    # rather than calling get_stop_words() again for every single token.
    stop_words = set(get_stop_words('english'))
    keywords = [w for w in headline.lower().split() if w not in stop_words]
    # Pad so all three feature slots always exist (a negative count adds nothing).
    keywords += ['None'] * (3 - len(keywords))
    return {'first_keyword': keywords[0], 'second_keyword': keywords[1], 'third_keyword': keywords[2]}

# Quick sanity check: run the extractor on one sample headline and display the feature dict.
headline_features('High Tech Works When It Enables High Touch')

In [None]:
# The dataset already ships with keywords, but new data (i.e. news that is not
# from this dataset) will most likely give us only a headline, not a keyword list.
news.head(10)

In [None]:
# With 10 categories, guessing a headline's category purely by chance is right
# about 10% of the time <-- we want to do better!
news['category'].unique()

In [None]:
# Pair every headline with its category label. The pairs are materialised in a
# list because a bare zip() is a one-shot iterator: once a later cell has looped
# over it, re-running that cell would silently see no data at all.
zipped_features = list(zip(news.headline, news.category))

In [None]:
# Build (feature-dict, category) pairs, the input format nltk classifiers train on.
featuresets = [(headline_features(headline), category) for headline, category in zipped_features]
# Preview the first few entries only; displaying the whole list floods the notebook output.
featuresets[:10]

In [None]:
import random
# Shuffle in place with a dedicated, fixed-seed generator so that a fresh
# Restart & Run All yields the same train/test split (and the same accuracy).
random.Random(42).shuffle(featuresets)

In [None]:
import nltk
import math

print(len(featuresets))
split_num = math.floor(len(featuresets)*.8)
print(split_num)

# Split the feature sets into training and test sets (80% train, 20% test).
# The test slice starts exactly at split_num so that no example is dropped.
train_set, test_set = featuresets[:split_num], featuresets[split_num:]
assert len(train_set) + len(test_set) == len(featuresets)

In [None]:
# Preview the first few training examples only; the full list is too long to display.
train_set[:10]

In [None]:
# Train a Naive Bayes classifier on the training set.
# train_set is a list of (features, label) tuples, where features is the
# dictionary produced by the feature extractor and label is the category.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Spot-check: predict the category of a hand-typed headline.
classifier.classify(headline_features("Clevery Warns Tory Rebels: Don't Dump Truss"))

In [None]:
# Spot-check: predict the category of a second hand-typed headline.
classifier.classify(headline_features("Journalist’s Takedown Of Government Excuses Is Brilliant"))

In [None]:
# Spot-check: predict the category of a third hand-typed headline.
classifier.classify(headline_features("Do You Get More Anxious Or Sad In Autumn? There's A Reason For That"))

In [None]:
# Fraction of held-out test examples whose predicted category matches the true one.
print(nltk.classify.accuracy(classifier, test_set))

In [None]:
# Print the 12 (feature, value) pairs the classifier ranks as most informative.
classifier.show_most_informative_features(12)

In [None]:
# Let's improve this: try giving the classifier more keywords, dropping purely
# numeric tokens, and using nltk's stopwords instead.
from nltk.corpus import stopwords

def headline_features2(headline):
    """Turn a headline into a feature dict of its first five keywords.

    The headline is lower-cased and split on whitespace; punctuation stays
    attached to the tokens. Tokens found in nltk's English stop-word list,
    and tokens made up entirely of digits, are discarded.

    Parameters
    ----------
    headline : str
        Raw headline text.

    Returns
    -------
    dict
        Keys 'first_keyword' through 'fifth_keyword', holding the first five
        remaining tokens in order. Slots with no token are filled with the
        placeholder string 'None'.
    """
    # A set gives constant-time membership tests; the stop-word list would
    # otherwise be scanned linearly for every token of every headline.
    s_words = set(stopwords.words('english'))
    keywords = [w for w in headline.lower().split() if w not in s_words and not w.isdigit()]
    # Pad so all five feature slots always exist (a negative count adds nothing).
    keywords += ['None'] * (5 - len(keywords))
    return {'first_keyword': keywords[0], 'second_keyword': keywords[1], 'third_keyword': keywords[2], 'fourth_keyword': keywords[3], 'fifth_keyword': keywords[4]}

# Quick sanity check: run the five-keyword extractor on one sample headline.
headline_features2('High Tech Works When It Enables High Touch')

In [None]:
# Rebuild the (feature-dict, category) pairs with the five-keyword extractor.
# The pairs are kept in a list so zipped_features is not left as an exhausted
# one-shot iterator for any cell that is re-run afterwards.
zipped_features = list(zip(news.headline, news.category))
featuresets = [(headline_features2(headline), category) for headline, category in zipped_features]
# Preview the first few entries only; displaying the whole list floods the notebook output.
featuresets[:10]

In [None]:
import random

# Shuffle in place with a dedicated, fixed-seed generator so that a fresh
# Restart & Run All yields the same train/test split (and the same accuracy).
random.Random(42).shuffle(featuresets)

In [None]:
import nltk
import math

split_num = math.floor(len(featuresets)*.8)

# Split the feature sets into training and test sets (80% train, 20% test).
# The test slice starts exactly at split_num so that no example is dropped.
train_set, test_set = featuresets[:split_num], featuresets[split_num:]
assert len(train_set) + len(test_set) == len(featuresets)

In [None]:
# Retrain the Naive Bayes classifier on the current train_set, replacing the previous `classifier`.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Spot-check a hand-typed headline. The features must come from headline_features2,
# the same extractor this classifier was trained with; otherwise the fourth and
# fifth keyword features are missing from the query.
classifier.classify(headline_features2("Alex Jones Will Likely Be Broke 'For The Rest Of His Life,' Ex-Prosecutor Says"))

In [None]:
# Fraction of held-out test examples whose predicted category matches the true one.
print(nltk.classify.accuracy(classifier, test_set))

In [None]:
# Print the 12 (feature, value) pairs the classifier ranks as most informative.
classifier.show_most_informative_features(12)

In [None]:
# how else can we improve our classifier?