# svm scikit-learn tutorial - clickbait headline classifier
https://www.codementor.io/garethdwyer/introduction-to-machine-learning-with-python-s-scikit-learn-czha398p1

i spiced it up with trying on live buzzfeed articles

In [1]:
# imports & params
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import numpy as np

split_ratio = .8

In [2]:
# read data
with open('clickbait.txt') as f:
    lines = f.read().strip().split('\n')
    lines = [line.split('\t') for line in lines]
    
headlines, labels = zip(*lines)

print len(headlines)
print len(labels)
headlines[:5]

10000
10000


("Egypt's top envoy in Iraq confirmed killed",
 'Carter: Race relations in Palestine are worse than apartheid',
 'After Years Of Dutiful Service, The Shiba Who Ran A Tobacco Shop Retires',
 'In Books on Two Powerbrokers, Hints of the Future',
 'These Horrifyingly Satisfying Photos Of "Baby Foot" Will Haunt You')

In [3]:
labels[:5]

('0', '0', '1', '0', '1')

In [4]:
# split to train & test
headlines_a = np.array(headlines)
labels_a = np.array(labels)
random_split = (np.random.rand(len(headlines)) < split_ratio)

train_headlines = headlines_a[random_split]
test_headlines = headlines_a[~random_split]

train_labels = labels_a[random_split]
test_labels = labels_a[~random_split]

print 'train lengths: \t', len(train_headlines), len(train_labels)
print 'test lengths: \t', len(test_headlines), len(test_labels)

train lengths: 	8082 8082
test lengths: 	1918 1918


In [5]:
# initialize vectorizer & classifier
vectorizer = TfidfVectorizer()
svm = LinearSVC()

In [6]:
# transform text to vectors
train_vectors = vectorizer.fit_transform(train_headlines)
test_vectors = vectorizer.transform(test_headlines)

train_vectors.todense()[:5]

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [7]:
# train classifier
svm.fit(train_vectors, train_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [8]:
# predict on test
predictions = svm.predict(test_vectors)

predictions[:5]

array(['0', '1', '0', '0', '0'],
      dtype='|S1')

In [9]:
# peak at test headlines...
print test_labels[:5]
test_headlines[:5]

['0' '1' '0' '1' '0']


array(['In Books on Two Powerbrokers, Hints of the Future',
       'Try And Guess What Chips These Were Before We Smashed Them',
       'Johnson & Johnson to Acquire Cougar Biotechnology',
       'Shaggy Returns With The Ultimate Diss Track, "GFY"',
       'Warsaw court requests testimony from Thatcher and Gorbachev'],
      dtype='|S127')

In [10]:
accuracy_score(test_labels, predictions)

0.96402502606882168

### live buzzfeed articles

In [11]:
# now try on live buzzfeed articles
import requests
import json
import pandas as pd

with open('news_api_key.txt') as f:
    news_api_key = f.read()

url = ('https://newsapi.org/v2/top-headlines?'\
       'sources=buzzfeed&'\
       'apiKey=' + news_api_key)

j = requests.get(url).json()
j.keys()

[u'status', u'articles']

In [12]:
# read into dataframe, but we only need a list of the titles at the moment
df = pd.read_json(json.dumps(j['articles']))
live_headlines = list(df.title)
live_headlines

[u"Rate These Fast Food Chains And We'll Reveal Your Dominant Personality Trait",
 u'How Normal Are Your Thanksgiving Food Opinions?',
 u'34 Absolutely Stunning Winter Coats Under $100',
 u'This Guy Kept Prank-Calling Phone Scammers Until They Begged Him To Stop',
 u'16 Simple Ways To Deal With The Cold, Dark Winter Without Losing Your Damn Mind',
 u'Call An Ambulance Because These Photos Of Serena Williams At Her Wedding Nearly Killed Me',
 u"Build A Thanksgiving Dinner And We'll Reveal If You've Been Naughty Or Nice This Year",
 u'Cops Pulled Over This Driver And Found A Phone And A Tablet Mounted On His Steering Wheel',
 u'Can We Guess Your Age Based On Your Favorite Candy?',
 u"17 Thanksgiving Horror Stories That'll Make Even Your Turkey Get Up From The Table"]

In [13]:
# predict
live_vectors = vectorizer.transform(live_headlines)
live_predictions = svm.predict(live_vectors)
live_predictions

array(['1', '1', '1', '1', '1', '1', '1', '1', '1', '1'],
      dtype='|S1')