In [18]:
# set up and dependencies
import re

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline




In [None]:
# Location of the headlines dataset, relative to the notebook.
path = "files/headlines.csv"

# Lift the row limit so Series/DataFrame displays are never truncated.
pd.options.display.max_rows = None

In [5]:
# Load the CSV at `path` into the DataFrame that the following cells explore.
# NOTE(review): the recorded preview shows an 'Unnamed: 0' column, presumably
# an index written out when the file was saved - confirm whether to drop it.
df = pd.read_csv(path)

## Some light EDA

In [6]:
# Preview the first ten rows to see which columns are available.
df.head(10)

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
0,Farhad Manjoo picks four products from 2014 th...,article,"Standouts in Tech: Drones, Virtual Reality, In...","[{'name': 'organizations', 'value': 'Oculus VR...",LOTS of cool new technology products come out ...,Business,2015-01-01,Technology,Farhad Manjoo picks four products from 2014 th...,News,824
1,Representative Steve Scalise’s effort to expla...,article,Much of David Duke’s ’91 Campaign Is Now in Lo...,"[{'name': 'persons', 'value': 'Alford, Jeremy'...","BATON ROUGE, La. — David Duke seems a figure f...",National,2015-01-01,U.S.,Representative Steve Scalise’s effort to expla...,News,1293
2,Minimum wage increases go into effect in 20 st...,article,"States’ Minimum Wages Rise, Helping Millions o...","[{'name': 'subject', 'value': 'Minimum Wage', ...","For some low-wage workers, everyday tasks like...",Business,2015-01-01,Business Day,Minimum wage increases go into effect in 20 st...,News,1017
3,A new job title — chief of laboratory safety —...,article,New C.D.C. Job Overseeing Laboratory Safety,"[{'name': 'persons', 'value': 'McNeil, Donald ...",A new job title — chief of laboratory safety —...,National,2015-01-01,Health,A new job title — chief of laboratory safety —...,Brief,129
4,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",article,Massachusetts: New Effort to Move Bombings Trial,"[{'name': 'subject', 'value': 'Boston Marathon...","Lawyers for Dzhokhar Tsarnaev, the defendant i...",National,2015-01-01,U.S.,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",Brief,145
5,As United States combat operations formally en...,article,"Mission Ends in Afghanistan, but Sacrifices Ar...","[{'name': 'subject', 'value': 'Afghanistan War...","KILLEEN, Tex. — Sgt. First Class Ramon Morris,...",National,2015-01-01,U.S.,As United States combat operations formally en...,News,1425
6,One of the foremost challenges is finding a wa...,article,"Five Surprising Economic Trends in 2014, and W...","[{'name': 'subject', 'value': 'Economic Condit...",It was a confounding year in global financial ...,Business,2015-01-01,The Upshot,One of the foremost challenges is finding a wa...,News,1269
7,"By adopting tabletlike, touch-screen systems, ...",article,Carmakers Take a Hint From Tablets,"[{'name': 'subject', 'value': 'Automobile Safe...","Reaching an apogee of confusion, car dashboard...",Business,2015-01-01,Automobiles,"By adopting tabletlike, touch-screen systems, ...",News,1022
8,"Six months after its introduction, Kindle Unli...",article,Writers Are Mixed Over Amazon Unlimited,"[{'name': 'subject', 'value': 'Amazon Kindle',...","Amazon is disruptive, which means it is divisi...",Business,2015-01-01,Technology,"Six months after its introduction, Kindle Unli...",News,1005
9,Patricia Arquette says being a single mother a...,article,Unashamedly Maturing Into Her Role,"[{'name': 'subject', 'value': 'Movies', 'rank'...","LOS ANGELES — When Patricia Arquette was 19, s...",Arts&Leisure,2015-01-01,Movies,Patricia Arquette says being a single mother a...,News,1478


In [20]:
# Distinct values of document_type.
df['document_type'].unique()

array(['article'], dtype=object)

In [12]:
# Distinct values of type_of_material.
df['type_of_material'].unique()

array(['News', 'Brief', 'Obituary (Obit)', 'Question', 'Schedule',
       'Review', 'List', 'Letter', 'Interview', 'News Analysis', 'Text',
       'Web Log', 'Economic Analysis', 'Op-Ed', 'Editorial',
       'Special Report', 'recipe', 'briefing', 'Newsletter', 'Series',
       'An Analysis'], dtype=object)

In [7]:
# Distinct values of news_desk.
df['news_desk'].unique()

array(['Business', 'National', 'Arts&Leisure', 'Science', 'Society',
       'Politics', 'Media', 'Climate'], dtype=object)

In [8]:
# Distinct values of section_name.
df['section_name'].unique()

array(['Technology', 'U.S.', 'Business Day', 'Health', 'The Upshot',
       'Automobiles', 'Movies', 'Theater', 'Your Money', 'Arts',
       'Science', 'Fashion & Style', 'Education', 'Real Estate', 'Food',
       'World', 'Style', 'New York', 'Books', 'Universal', 'Job Market',
       'Obituaries', 'Magazine', 'Sports', 'NYT Now', 'Opinion',
       'Times Insider', 'Giving', 'Sunday Review', 'Well', 'Watching',
       'Climate', 'Travel', 'Smarter Living', 'Reader Center'],
      dtype=object)

In [9]:
# Number of rows per news_desk value, most frequent first.
df['news_desk'].value_counts()

Business        20271
National        16054
Science          4021
Society          3984
Arts&Leisure     3350
Politics         2779
Climate           107
Media               6
Name: news_desk, dtype: int64

In [10]:
# Number of rows per section_name value, most frequent first.
df['section_name'].value_counts()

U.S.               17653
Business Day       13883
Technology          4181
Fashion & Style     3959
Health              2298
Science             2172
Arts                2035
Your Money           843
Movies               807
Theater              557
The Upshot           529
Well                 363
Real Estate          353
Automobiles          273
Climate              179
World                121
Education             69
Style                 59
New York              42
Watching              37
Smarter Living        34
Books                 25
Reader Center         16
Food                  16
Obituaries            15
Giving                13
Times Insider         12
Magazine               8
NYT Now                8
Job Market             4
Sports                 2
Travel                 2
Universal              2
Sunday Review          1
Opinion                1
Name: section_name, dtype: int64

# Bigrams

### Prepare the data for scikit-learn

In [None]:
# The loaded frame has no 'text' or 'class' columns (see the df.head() preview),
# so name the real columns explicitly instead of the template placeholders.
# NOTE(review): classifying `headline` into `news_desk` is the assumed task;
# switch TARGET_COLUMN to 'section_name' if that was the intended label.
TEXT_COLUMN = 'headline'
TARGET_COLUMN = 'news_desk'

# Drop rows missing either field: the vectorizer rejects NaN documents and the
# classifier cannot train on NaN labels.
labelled = df.dropna(subset=[TEXT_COLUMN, TARGET_COLUMN])

X = labelled[TEXT_COLUMN].values
y = labelled[TARGET_COLUMN].values

# A fixed random_state keeps the split identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

### Categorical Bigram Naive Bayes (no categorical option is available yet, so try other methods first)

In [19]:
%%time

# Bag-of-words features over unigrams and bigrams, with English stop words
# removed and each term recorded as present/absent rather than counted.
ngram_vectorizer = CountVectorizer(stop_words='english', binary=True, ngram_range=(1, 2))

# Feed the vectorized text straight into a multinomial Naive Bayes classifier.
bigram_naive_bayes = make_pipeline(ngram_vectorizer, MultinomialNB())
bigram_naive_bayes.fit(X_train, y_train)

# Overall accuracy on the held-out split, followed by the per-class report.
test_accuracy = bigram_naive_bayes.score(X_test, y_test)
print(f'Accuracy: {test_accuracy} \n')

test_predictions = bigram_naive_bayes.predict(X_test)
print(classification_report(y_test, test_predictions))

## Extra: word_count prediction based on news_desk and section_name, distinguishing by type_of_material