In [30]:
import pandas as pd
import numpy as np

from nltk import word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk

## Let's explore our headlines

In [31]:
#Load headlines and see a few
data = pd.read_csv("headlines.csv")
print data.shape
data.head()

(5000, 2)


Unnamed: 0,Page Title,Topic
0,What So Many People Don’t Get About the U.S. W...,Demographics
1,Why Do So Many Incompetent Men Become Leaders?,Leadership
2,How to Write a Cover Letter,Hiring
3,"The Most Important Leadership Competencies, Ac...",Leadership
4,What Is Disruptive Innovation?,Disruptive innovation


In [32]:
#Some text functions

#Takes a list of words, remove common ones
def remove_stop_words(tokens):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in tokens if w.lower() not in stopwords]
    return content

#Takes a list of words, lowercases them
def lowercase_tokens(tokens):
    list_of_words = []
    for a in tokens:
        a = a.lower()
        list_of_words.append(a)
    return list_of_words

In [33]:
#Count the most common words in headlines

#Concatenate headlines into one string
text = data['Page Title'].str.cat(sep=" ")

#Encode as utf-8 
text = text.decode("utf-8")

#Create tokenizer to remove punctuation and numbers
tokenizer = RegexpTokenizer(r'\w+')

#Tokenize into words, lowercase, remove stop words
tokens = tokenizer.tokenize(text)
tokens = lowercase_tokens(tokens)
tokens = remove_stop_words(tokens)

#Print common words
fdist = FreqDist(tokens)
print "The most common words in our headlines are:"
print fdist.most_common(50)


The most common words in our headlines are:
[(u'hbr', 480), (u'work', 332), (u'people', 164), (u'make', 158), (u'new', 155), (u'get', 149), (u'company', 143), (u'companies', 141), (u'employees', 141), (u'leaders', 126), (u'business', 126), (u'strategy', 124), (u'team', 123), (u'need', 119), (u'ways', 118), (u'data', 116), (u'better', 109), (u'best', 101), (u'job', 99), (u'right', 98), (u'change', 93), (u'time', 92), (u'digital', 91), (u'way', 89), (u'good', 89), (u'leadership', 89), (u'innovation', 88), (u'great', 88), (u'know', 88), (u'care', 87), (u'health', 87), (u'women', 85), (u'boss', 82), (u'like', 77), (u'big', 76), (u'3', 74), (u'stop', 74), (u'research', 72), (u'really', 71), (u'things', 70), (u'management', 70), (u'culture', 69), (u'employee', 69), (u'managers', 68), (u'one', 68), (u'making', 67), (u'4', 66), (u'think', 64), (u'5', 62), (u'career', 62)]


## The exploration showed that lots of headlines have "HBR" in them. We need to fix that, look for similar issues, and then get rid of duplicates

In [34]:
#Subset headlines that have "HBR" and other issues in them to see how they look

hbr = data[data['Page Title'].str.contains("HBR")]
print hbr.shape

hbr2 = data[data['Page Title'].str.contains("Harvard Business Review")]
print hbr2.shape

for a in hbr['Page Title'][0:20]:
    print a
    

(477, 2)
(0, 2)
Reinventing Performance Management - HBR
Where the Digital Economy Is Moving the Fastest - HBR
How to Use Your LinkedIn Profile to Power a Career Transition - HBR
Why We Love to Hate HR…and What HR Can Do About It - HBR
What Parents Should Tell Their Kids About Finding a Career - HBR
Setting the Record Straight on Switching Jobs - HBR
How to Know If You Talk Too Much - HBR
How to Get Employees Excited to Do Their Work - HBR
5 Rules for a Vacation that’s Truly Worth It - HBR
75% of Cross-Functional Teams Are Dysfunctional - HBR
Mindfulness Can Literally Change Your Brain - HBR
The One Thing About Your Spouse’s Personality That Really Affects Your Career - HBR
The Top Complaints from Employees About Their Leaders - HBR
{title} - HBR
How the Navy SEALs Train for Leadership Excellence - HBR
Your Late-Night Emails Are Hurting Your Team - HBR
5 Signs It’s Time for a New Job - HBR
What You Miss When You Take Notes on Your Laptop - HBR
The 15 Diseases of Leadership, According t

In [35]:
#Fix headline formatting and remove duplicates
data['Clean Title'] = data['Page Title'].str.replace(' - HBR', '')

#Check for remaining 'HBR' mentions
hbr = data[data['Clean Title'].str.contains("HBR")]
print hbr.shape
hbr.head()

#We've cut 400+ headlines down to 15. Now let's just remove "HBR" from those remaining heds
data['Clean Title'] = data['Clean Title'].str.replace('HBR', '')

(15, 3)


In [36]:
#Drop duplicate headlines
data = data.drop_duplicates(subset="Clean Title")
data.shape


(4585, 3)

## We've got ~4500 cleaned up headlines left. Time to save them to a new csv

In [37]:
#data['Clean Title'].to_csv("clean_headlines.csv",header=True,index=False)

## To add to the dataset, add social media posts from Twitter & Facebook

In [38]:
social = pd.read_csv("all_tweets_fb_2013-nov18.csv")
print social.shape
print social.columns
social['Created By'].describe()

(12573, 47)
Index([u'Account Name', u'Account Type', u'Created By', u'Publish Date',
       u'Time (UTC)', u'Message', u'Publish Status', u'Labels', u'Clicks',
       u'Retweets/Repins/Shares', u'Current Favorites/Likes/+1s', u'Comments',
       u'Reach', u'Lifetime Likes', u'Lifetime Love', u'Lifetime Wow',
       u'Lifetime Haha', u'Lifetime Sad', u'Lifetime Angry', u'Link', u'Title',
       u'Link to Post', u'Deleted', u'Post Type', u'Breaking News',
       u'Targeting Info', u'Deleted By', u'Fans/Followers',
       u'Post Video Avg Time Watched', u'Post Video Views Organic Unique',
       u'Post Video Views Autoplayed', u'Post Video Views Clicked To Play',
       u'Post Video Views Organic', u'Post Video Complete Views Organic',
       u'Post Video Complete Views Organic Unique',
       u'Post Video Complete Views 30S',
       u'Post Video Complete Views 30S Unique',
       u'Post Video Complete Views 30S Autoplayed',
       u'Post Video Complete Views 30S Clicked To Play',
       

count                                 12571
unique                                   20
top       nicole.torres@harvardbusiness.org
freq                                   4082
Name: Created By, dtype: object

In [39]:
#Limit the dataset to tweets and Facebook posts by HBR editors, as opposed to marketing, etc.

editors = ['nicole.torres@harvardbusiness.org','alexandra.kephart@hbr.org', 'Ramsey.Khabbaz@harvardbusiness.org',
     'paige.cohen@hbr.org','nicole.blank@hbr.org','ggavett@hbr.org','etruxler@hbr.org','awieckowski@hbr.org',
     'duygu.mullin@hbr.org','walter.frick@harvardbusiness.org']
social = social[social['Created By'].isin(editors)]
social.shape

(9895, 47)

In [40]:
social['Message'].head

<bound method Series.head of 3        American firms like Google, Amazon, eBay, and ...
4        Diversity goes beyond gender, race, and age. I...
5        How do you keep a large, dispersed organizatio...
6        Give yourself time to be creative — even when ...
7        Seek some advice for better teamwork. On the l...
8              Boosting morale on an underperforming team.
10       A wealth of research shows that female leaders...
11       If you’re thinking about starting a side gig, ...
12       On the latest episode of HBR’s advice podcast ...
13       Before your company jumps on the data science ...
14       Before your company jumps on the data science ...
16       We should subject societally impactful algorit...
17       Research shows that personalities play a bigge...
18       Which one would you choose? Help us name an up...
19       We can’t help that our minds crave distraction...
22       Many of us feel overwhelmed, but are we busy w...
23       Another recession 

In [41]:
#Combine text fields for HBR headlines and social media

tk = data['Clean Title']
tk2 = social['Message']

frames = [tk,tk2]

result = pd.concat(frames)
df = result.to_frame(name='Clean Title')
print df.shape
df.head()

(14480, 1)


Unnamed: 0,Clean Title
0,What So Many People Don’t Get About the U.S. W...
1,Why Do So Many Incompetent Men Become Leaders?
2,How to Write a Cover Letter
3,"The Most Important Leadership Competencies, Ac..."
4,What Is Disruptive Innovation?


In [42]:
#Remove mentions of HBR
df['Clean Title'] = df['Clean Title'].str.replace(' - HBR', '')
df['Clean Title'] = df['Clean Title'].str.replace('HBR', '')

#Remove mentions of research because the algorithm is being used to evaluate research
df['Clean Title'] = df['Clean Title'].str.replace('Research', '')
df['Clean Title'] = df['Clean Title'].str.replace('research', '')
df.head()


Unnamed: 0,Clean Title
0,What So Many People Don’t Get About the U.S. W...
1,Why Do So Many Incompetent Men Become Leaders?
2,How to Write a Cover Letter
3,"The Most Important Leadership Competencies, Ac..."
4,What Is Disruptive Innovation?


## Save the combined dataframe to csv

In [43]:
df['Clean Title'].to_csv("clean_headlines.csv",header=True,index=False)