In [200]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import pickle
from sklearn.pipeline import Pipeline

## Load data and split into train/test

In [201]:
#Load and check data
heds = pd.read_csv("combined_headlines.csv")
print heds.shape
print heds.describe()
heds.head()

(29480, 2)
                                                 Headline    HBR
count                                               29480  29480
unique                                              28918      2
top     The article requested cannot be found! Please ...     No
freq                                                    7  15000


Unnamed: 0,Headline,HBR
0,What So Many People Don’t Get About the U.S. W...,Yes
1,Why Do So Many Incompetent Men Become Leaders?,Yes
2,How to Write a Cover Letter,Yes
3,"The Most Important Leadership Competencies, Ac...",Yes
4,What Is Disruptive Innovation?,Yes


In [202]:
#Split data into train/test
train = heds.sample(frac=0.7, random_state=1)
test = heds.loc[~heds.index.isin(train.index)]

print "Train shape:"
print train.shape
print "Test shape"
print test.shape

Train shape:
(20636, 2)
Test shape
(8844, 2)


## Vectorize and train model

In [203]:
#Vectorize data

#Initialize countvectorizer and fit to headlines
vectorizer = CountVectorizer(analyzer='word',
                             stop_words = 'english',
                             ngram_range=(1,2),
                             max_features=1000)

train_counts = vectorizer.fit_transform(train['Headline'])
test_counts = vectorizer.transform(test['Headline'])

#Initialize tfidf transformer and fit to counts
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
print train_tfidf.shape

test_tfidf = tfidf_transformer.transform(test_counts)
print test_tfidf.shape

(20636, 1000)
(8844, 1000)


In [204]:
#Train and evaluate linear model

#Train logistic regression model on training set
logit = linear_model.LogisticRegression(penalty='l1',C=1)
logit = logit.fit(train_tfidf,train['HBR'])

#Evaluate training performance
logit_train_results = logit.predict(train_tfidf)
train_output = pd.DataFrame(data={"Headline":train['Headline'],
                                 "HBR":train["HBR"],
                                 "Prediction":logit_train_results})
print train_output.head(30)

print "\nThe accuracy of the model on the training set is:"
print accuracy_score(train_output['HBR'],train_output['Prediction'])

       HBR                                           Headline Prediction
25663   No  BWW Asks: Who Would You Cast in FOX's Live Bro...         No
24051   No    Megan Fox Finds Being a Working Mom 'Consuming'         No
5113   Yes  Grit predicts who will accomplish challenging ...        Yes
28759   No  Dozens of Avicii concertgoers rushed to hospit...         No
12109  Yes  We all have the potential to be curious, given...        Yes
14018  Yes      It's the little things that persuade. Listen:        Yes
2512   Yes  How to Raise Sensitive Issues During a Virtual...        Yes
13782  Yes  Do you really know how to best interact with y...        Yes
27697   No  Euro Near Three-Month Low as Volatility Surges...         No
17286   No  Avicii hospitalized; Deadmau5 will fill in at ...         No
28875   No                                   R,G and all that         No
2447   Yes              More Universities Need to Teach Sales        Yes
14952   No                      Well, beam me up Se

In [205]:
#Make predictions against test data and assess performance
logit_test_results = logit.predict(test_tfidf)
test_output = pd.DataFrame(data={"Headline":test['Headline'],
                                 "HBR":test["HBR"],
                                 "Prediction":logit_test_results})
print test_output.head(30)

print "\nThe accuracy of the model on the test set is:"
print accuracy_score(test_output['HBR'],test_output['Prediction'])

    HBR                                           Headline Prediction
1   Yes     Why Do So Many Incompetent Men Become Leaders?        Yes
9   Yes              The Best-Performing CEOs in the World        Yes
10  Yes                  How to Give a Killer Presentation         No
14  Yes  Resilience Is About How You Recharge, Not How ...        Yes
15  Yes                          How Netflix Reinvented HR        Yes
18  Yes           Why the Lean Start-Up Changes Everything        Yes
20  Yes  What Is Organizational Culture? And Why Should...        Yes
25  Yes  High-Performing Teams Need Psychological Safet...        Yes
27  Yes    Why Some People Get Burned Out and Others Don’t        Yes
29  Yes  Uber Can’t Be Fixed — It’s Time for Regulators...        Yes
30  Yes             How to Design a Winning Business Model        Yes
31  Yes         Why You Should Have (at Least) Two Careers        Yes
41  Yes  Proof That Positive Work Cultures Are More Pro...        Yes
42  Yes             

## Save the trained model

In [206]:
with open('hbr_logit.pkl','wb') as f:
    pickle.dump(logit,f)

In [207]:
#Turn text into dataframe to test
text = "How to manage a company"   
text = [text]
df = pd.DataFrame(text,columns =['text'])    
    
#Vectorize text
tk_features = vectorizer.transform(df['text'])
tk_features = tfidf_transformer.transform(tk_features)
    
#Return prediction
result = logit.predict(tk_features)
print result[0]

Yes


## Redo the same process, but using a pipeline

In [208]:
#Pipeline makes it easier to vectorize new data later on -- to save the vectorizer along with the model

vect = CountVectorizer(analyzer='word',
                             stop_words = 'english',
                             ngram_range=(1,2),
                             max_features=1000)

#Define pipeline with countvectorizer, tfidftransformer, and logit model
test_pipe = Pipeline([
     ('vectorizer', vect),
     ('tfidf', TfidfTransformer()),
     ('logit', linear_model.LogisticRegression(penalty='l1',C=1))
 ])

#Fit logistic regression model with training data
test_pipe.fit(train['Headline'], train["HBR"]) 

#Make predictions against test data and assess performance
predictions = test_pipe.predict(test['Headline'])
test_output = pd.DataFrame(data={"Headline":test['Headline'],
                                 "HBR":test["HBR"],
                                 "Prediction":predictions})
print test_output.head(30)

print "\nThe accuracy of the model on the test set is:"
print accuracy_score(test_output['HBR'],test_output['Prediction'])


    HBR                                           Headline Prediction
1   Yes     Why Do So Many Incompetent Men Become Leaders?        Yes
9   Yes              The Best-Performing CEOs in the World        Yes
10  Yes                  How to Give a Killer Presentation         No
14  Yes  Resilience Is About How You Recharge, Not How ...        Yes
15  Yes                          How Netflix Reinvented HR        Yes
18  Yes           Why the Lean Start-Up Changes Everything        Yes
20  Yes  What Is Organizational Culture? And Why Should...        Yes
25  Yes  High-Performing Teams Need Psychological Safet...        Yes
27  Yes    Why Some People Get Burned Out and Others Don’t        Yes
29  Yes  Uber Can’t Be Fixed — It’s Time for Regulators...        Yes
30  Yes             How to Design a Winning Business Model        Yes
31  Yes         Why You Should Have (at Least) Two Careers        Yes
41  Yes  Proof That Positive Work Cultures Are More Pro...        Yes
42  Yes             

In [209]:
#Print the features with the lowest and highest coefficients
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = logit.coef_[0].argsort()
print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:100]]))
print('Largest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:-101:-1]]))


Smallest Coefs: 
[u'stocks' u'2014' u'heartbleed' u'announces' u'galaxy' u'miley' u'gm'
 u'comcast' u'mers' u'tv' u'movie' u'says' u'report' u'fed' u'ebola'
 u'cancer' u'android' u'debut' u'video' u'box office' u'blood' u'trailer'
 u'easter' u'million' u'kim' u'dies' u'ecb' u'bieber' u'update'
 u'kardashian' u'film' u'samsung' u'ipad' u'chris' u'facebook' u'xbox'
 u'beyonce' u'awards' u'king' u'yellen' u'disease' u'euro' u'fda' u'mad'
 u'microsoft' u'july' u'smartphone' u'sony' u'thrones' u'google' u'htc'
 u'noah' u'west' u'release' u'launches' u'windows' u'recap' u'robin'
 u'recalls' u'adds' u'nasa' u'ford' u'usd' u'apple' u'selena' u'review'
 u'mars' u'virus' u'lohan' u'transformers' u'jay' u'earth' u'plane'
 u'wedding' u'star' u'bank' u'ice' u'talks' u'reportedly' u'cast'
 u'mother' u'ban' u'gas' u'photo' u'teen' u'ryan' u'need know' u'twitter'
 u'buys' u'revealed' u'march' u'app' u'man' u'hbo' u'allergan' u'arrested'
 u'uk' u'court' u'drug' u'scientists']

Largest Coefs: 
[u'trib' 

In [210]:
#Test new text (defined above) with fitted model

print type(text)
print type(test['Headline'])

print test_pipe.predict(pd.Series(text))

<type 'list'>
<class 'pandas.core.series.Series'>
['Yes']


In [211]:
#Save pipeline for later

with open('hbr_pipeline.pkl','wb') as f:
    pickle.dump(test_pipe,f)

# Repeat pipeline building and training with business heds only

In [212]:
#Load and check data
heds = pd.read_csv("combined_biz_headlines.csv")
print heds.shape
print heds.describe()
heds.head()

(29480, 2)
                   Headline    HBR
count                 29480  29480
unique                28854      2
top     Posted by Imaduddin     No
freq                     10  15000


Unnamed: 0,Headline,HBR
0,What So Many People Don’t Get About the U.S. W...,Yes
1,Why Do So Many Incompetent Men Become Leaders?,Yes
2,How to Write a Cover Letter,Yes
3,"The Most Important Leadership Competencies, Ac...",Yes
4,What Is Disruptive Innovation?,Yes


In [213]:
#Split data into train/test
train = heds.sample(frac=0.7, random_state=1)
test = heds.loc[~heds.index.isin(train.index)]

print "Train shape:"
print train.shape
print "Test shape"
print test.shape

Train shape:
(20636, 2)
Test shape
(8844, 2)


In [214]:
#Pipeline makes it easier to vectorize new data later on -- to save the vectorizer along with the model

vect = CountVectorizer(analyzer='word',
                             stop_words = 'english',
                             ngram_range=(1,2),
                             max_features=1000)

#Define pipeline with countvectorizer, tfidftransformer, and logit model
test_pipe = Pipeline([
     ('vectorizer', vect),
     ('tfidf', TfidfTransformer()),
     ('logit', linear_model.LogisticRegression(penalty='l1',C=1))
 ])

#Fit logistic regression model with training data
test_pipe.fit(train['Headline'], train["HBR"]) 

#Make predictions against test data and assess performance
predictions = test_pipe.predict(test['Headline'])
test_output = pd.DataFrame(data={"Headline":test['Headline'],
                                 "HBR":test["HBR"],
                                 "Prediction":predictions})
print test_output.head(30)

print "\nThe accuracy of the model on the test set is:"
print accuracy_score(test_output['HBR'],test_output['Prediction'])


    HBR                                           Headline Prediction
1   Yes     Why Do So Many Incompetent Men Become Leaders?        Yes
9   Yes              The Best-Performing CEOs in the World        Yes
10  Yes                  How to Give a Killer Presentation         No
14  Yes  Resilience Is About How You Recharge, Not How ...        Yes
15  Yes                          How Netflix Reinvented HR        Yes
18  Yes           Why the Lean Start-Up Changes Everything        Yes
20  Yes  What Is Organizational Culture? And Why Should...        Yes
25  Yes  High-Performing Teams Need Psychological Safet...        Yes
27  Yes    Why Some People Get Burned Out and Others Don’t        Yes
29  Yes  Uber Can’t Be Fixed — It’s Time for Regulators...         No
30  Yes             How to Design a Winning Business Model        Yes
31  Yes         Why You Should Have (at Least) Two Careers        Yes
41  Yes  Proof That Positive Work Cultures Are More Pro...        Yes
42  Yes             

In [215]:
#Print the features with the lowest and highest coefficients
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = logit.coef_[0].argsort()
print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:100]]))
print('Largest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:-101:-1]]))


Smallest Coefs: 
[u'stories' u'17' u'government' u'america' u'finally' u'mean' u'forex'
 u'class' u'mart' u'turkey' u'money' u'says' u'record' u'execution' u'dow'
 u'burger' u'amazon' u'cultures' u've' u'bnp' u'bid' u'trading' u'don'
 u'media' u'investors' u'dear' u'dow jones' u'behavior' u'unemployment'
 u'interview' u'experience' u'samsung' u'improve' u'change' u'episode'
 u'wrong' u'barclays' u'ask' u'irs' u'yellen' u'demand' u'emerging' u'ex'
 u'long' u'mcdonald' u'innovation' u'smart' u'south' u'time' u'futures'
 u'health care' u'net' u'way' u'recall' u'jobless claims' u'weekly'
 u'reaches' u'risks' u'rates' u'acquire' u'months' u'fears'
 u'unemployment rate' u'analysis' u'self' u'retail' u'makes' u'vietnam'
 u'learned' u'training' u'industry' u'doing' u'personal' u'waste'
 u'startup' u'avoid' u'hike' u'talk' u'recovery' u'candy' u'mobile'
 u'average' u'firms' u'people' u'technologies' u'safety' u'motors' u'turn'
 u'british' u'resilience' u'major' u'amid' u'lot' u'goals' u'ahead'


In [216]:
#Test new text (defined above) with fitted model

print type(text)
print type(test['Headline'])

print test_pipe.predict(pd.Series(text))

<type 'list'>
<class 'pandas.core.series.Series'>
['Yes']


In [217]:
#Save pipeline for later

with open('hbr_biz_pipeline.pkl','wb') as f:
    pickle.dump(test_pipe,f)