In [None]:
# import pandas and rename it as pd
import pandas as pd

In [None]:
# relative path to the csv file that holds the headlines and their categories
path = 'NewsCategorizer.csv'
# read the csv file into a pandas DataFrame; later cells use its
# `headline` and `category` columns
news = pd.read_csv(path)

In [None]:
# show the DataFrame as the cell's output to inspect what was loaded
news

In [None]:
# list each distinct value found in the `category` column
news['category'].unique()

In [None]:
from stop_words import get_stop_words

# here's the template for how we want to structure our features
# {'first_keyword': slkdjfsl, 'second_keyword': lsdjfls, 'third_keyword': slkdjf}
# Build the stop-word lookup once. A frozenset gives constant-time membership
# tests, instead of fetching and scanning the stop-word list for every word.
_ENGLISH_STOP_WORDS = frozenset(get_stop_words('english'))


def create_features(headline):
    """Turn a headline into a dictionary of its first three keywords.

    The headline is lowercased and split on whitespace; every word that is
    an English stop word is dropped. The first three remaining words become
    the feature values. Punctuation is not stripped, so a token such as
    "mp," is kept as-is.

    Args:
        headline: the headline text (must provide ``.lower()``, i.e. a str).

    Returns:
        A dict with the keys 'first_keyword', 'second_keyword' and
        'third_keyword'. When fewer than three keywords remain, the missing
        slots hold the string 'None' (a placeholder string, not the None
        object).
    """
    # normalize case so the stop-word comparison is case-insensitive
    words = headline.lower().split()
    keywords = [word for word in words if word not in _ENGLISH_STOP_WORDS]
    # pad short keyword lists up to three entries; a non-positive multiplier
    # yields an empty list, so longer lists are left untouched
    keywords += ['None'] * (3 - len(keywords))
    return {
        'first_keyword': keywords[0],
        'second_keyword': keywords[1],
        'third_keyword': keywords[2],
    }

In [None]:
# bring together the category (label) and the headline (features)
# zip() returns a single-use iterator, so store the pairs in a list; otherwise
# re-running a cell that loops over them would silently see no pairs at all
zipped_feature_and_labels = list(zip(news.headline, news.category))

In [None]:
# build a list of (feature_dict, category) tuples, one per headline:
# the keyword features extracted from the headline, paired with its label
feature_sets = [
    (create_features(headline), category)
    for headline, category in zipped_feature_and_labels
]
# preview only the first few entries; echoing the whole list would flood the
# notebook output with one tuple per headline
feature_sets[:5]

In [None]:
# always shuffle your featureset before classifying!
import random

# seed the generator so the shuffle order is the same on every
# Restart & Run All, which keeps anything computed from it reproducible
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
# shuffles the list in place
random.shuffle(feature_sets)

In [None]:
import math

# index where the data is cut: everything before it is the first 80%
split_num = math.floor(.8 * len(feature_sets))

# the head of the list (80%) trains the model, the tail (20%) is kept for testing
training_set, testing_set = feature_sets[:split_num], feature_sets[split_num:]

In [None]:
import nltk
# train an NLTK Naive Bayes classifier on the training split,
# a list of (feature_dict, category) tuples
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [None]:
# classify a sample headline: extract its keyword features first,
# then ask the classifier for the predicted category
sample_features = create_features("Keir Starmer Dashes Jeremy Corbyn's Hopes Of Returning As Labour MP")
classifier.classify(sample_features)

In [None]:
# score the classifier against the held-out testing set, then print the result
accuracy = nltk.classify.accuracy(classifier, testing_set)
print(accuracy)

In [None]:
# print the classifier's 12 most informative features
classifier.show_most_informative_features(n=12)