# News Categorizers Scratch work

In [159]:
# Imports 
import string
import numpy as np
import pandas as pd

import nltk
# nltk.download('punkt')
from nltk.stem.porter import *

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB

In [161]:
# Read in data
data = pd.read_json('data/data-json.json', lines=True)
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [116]:
# Create category dictionary linking unique categories 
category_dict = {cat: i for i, cat in enumerate(data.category.unique())}
category_dict

{'CRIME': 0,
 'ENTERTAINMENT': 1,
 'WORLD NEWS': 2,
 'IMPACT': 3,
 'POLITICS': 4,
 'WEIRD NEWS': 5,
 'BLACK VOICES': 6,
 'WOMEN': 7,
 'COMEDY': 8,
 'QUEER VOICES': 9,
 'SPORTS': 10,
 'BUSINESS': 11,
 'TRAVEL': 12,
 'MEDIA': 13,
 'TECH': 14,
 'RELIGION': 15,
 'SCIENCE': 16,
 'LATINO VOICES': 17,
 'EDUCATION': 18,
 'COLLEGE': 19,
 'PARENTS': 20,
 'ARTS & CULTURE': 21,
 'STYLE': 22,
 'GREEN': 23,
 'TASTE': 24,
 'HEALTHY LIVING': 25,
 'THE WORLDPOST': 26,
 'GOOD NEWS': 27,
 'WORLDPOST': 28,
 'FIFTY': 29,
 'ARTS': 30,
 'WELLNESS': 31,
 'PARENTING': 32,
 'HOME & LIVING': 33,
 'STYLE & BEAUTY': 34,
 'DIVORCE': 35,
 'WEDDINGS': 36,
 'FOOD & DRINK': 37,
 'MONEY': 38,
 'ENVIRONMENT': 39,
 'CULTURE & ARTS': 40}

In [173]:
# Create target attribute which has the corresponding category id established above
data['target'] = data['category'].map(category_dict)
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date,target
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,0
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,1
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,1
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,1
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,1


In [174]:
# Combine both the headline and the short description 
data['combined'] = data.headline + '. ' + data.short_description 

## Initial model - tfidf vectorizer and 

In [150]:
X, y = data.combined, data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train)

160672

In [151]:
vect = CountVectorizer()
X_train_counts = vect.fit_transform(X_train)
X_train_counts.shape

(160672, 78909)

In [153]:
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_counts)
X_train_tfidf.shape

(160672, 78909)

In [155]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [160]:
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

predicted = clf.predict(X_test_tfidf)
np.mean(predicted == y_test)

0.38441047600079664

In [170]:
def test_sentence(sentence, vect, tfidf, clf):
    sent_counts = vect.transform([sentence])
    sent_tfidf = tfidf.transform(sent_counts)
    
    return list(category_dict.keys())[list(category_dict.values()).index(clf.predict(sent_tfidf))]
    

In [172]:
sen = "Apple MacBook Pro 16-inch M1 Max review: New silicon meets retro ports. It's the best of the old and new in this long-awaited, professional-grade Apple silicon MacBook."

test_sentence(sen, vect, tfidf, clf)


'ENTERTAINMENT'

## Bags of words

In [86]:
data['combined'] = data.headline + '. ' + data.short_description 

In [88]:
# Assign fixed integer id to each word occurring in the document of the training set
count_vect = CountVectorizer()
test = count_vect.fit_transform(data.combined)

In [89]:
test

<200840x86670 sparse matrix of type '<class 'numpy.int64'>'
	with 4974671 stored elements in Compressed Sparse Row format>

## Initial text preprocessing

In [85]:
analyzer = SentimentIntensityAnalyzer()

analyzer.polarity_scores(data.combined[0])['compound']

-0.8047

In [73]:
def determine_sentiment(score):
    if score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'

In [75]:
# Combine the headline with the short description of the article --> likely will improve performance
data['combined'] = data.headline + '. ' + data.short_description 

analyzer = SentimentIntensityAnalyzer()

data['sentiment_score'] = [analyzer.polarity_scores(sentence)['compound'] for sentence in data.combined]
data['sentiment'] = data.sentiment_score.apply(determine_sentiment)

In [77]:
data[['combined', 'sentiment_score', 'sentiment']]

Unnamed: 0,combined,sentiment_score,sentiment
0,There Were 2 Mass Shootings In Texas Last Week...,-0.8047,negative
1,Will Smith Joins Diplo And Nicky Jam For The 2...,0.0000,neutral
2,Hugh Grant Marries For The First Time At Age 5...,0.3612,positive
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,0.3412,positive
4,Julianna Margulies Uses Donald Trump Poop Bags...,0.0000,neutral
...,...,...,...
200848,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,0.5106,positive
200849,Maria Sharapova Stunned By Victoria Azarenka I...,0.2732,positive
200850,"Giants Over Patriots, Jets Over Colts Among M...",0.7635,positive
200851,Aldon Smith Arrested: 49ers Linebacker Busted ...,-0.4767,negative


In [82]:
def preprocessing(text):
    stop_words = sklearn.feature_extraction.text.ENGLISH_STOP_WORDS
    
    text.lower()
    text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', text)
    tokens = nltk.word_tokenize(text)
    
    tokens = [tok for tok in tokens if len(tok) > 2]  # remove short words that don't add much in the way of meaning
    tokens = [tok for tok in tokens if tok not in stop_words]
    
    p_stemmer = PorterStemmer()
    
    return [p_stemmer.stem(word) for word in tokens]
    

In [83]:
data['tokenized'] = data['combined'].apply(preprocessing)

In [84]:
data.tokenized[0]

['there',
 'were',
 'mass',
 'shoot',
 'texa',
 'last',
 'week',
 'but',
 'onli',
 'she',
 'left',
 'husband',
 'kill',
 'children',
 'just',
 'day',
 'america']

## EDA

In [18]:
data.drop_duplicates(inplace=True)  # just in case there are duplicates

In [19]:
len(data)

200840

In [20]:
data.nunique()

category                 41
headline             199344
authors               27993
link                 200812
short_description    178353
date                   2309
dtype: int64