In [1]:
# Setup: import dependencies, configure display, and load the raw data.
import pandas as pd

# None removes the row limit, so long outputs are printed in full.
pd.set_option("display.max_rows", None)

# Relative location of the raw input CSV.
path = "files/nyt_2015_2017.csv"
df = pd.read_csv(path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,abstract,byline,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,source,type_of_material,word_count
0,A writing table anchors a home built on a gran...,,multimedia,They Knew What They Wanted,[],A writing table anchors a home built on a gran...,Home & Garden,2015-01-01T00:03:03+0000,Home & Garden,A writing table anchors a home built on a gran...,The New York Times,Slideshow,0
1,"Building a modern loft in Portland, Ore., suff...",,multimedia,Industrial Sleek,"[{'name': 'subject', 'value': 'Interior Design...","Building a modern loft in Portland, Ore., suff...",Home & Garden,2015-01-01T00:03:12+0000,Home & Garden,"Building a modern loft in Portland, Ore., suff...",The New York Times,Slideshow,0
2,"Building a loft in Portland, Ore., suffused wi...",By Amara Holstein,article,The Imperfectionists,"[{'name': 'glocations', 'value': 'Portland (Or...","For Joan Childs, southeast Portland, Ore., is ...",Home,2015-01-01T00:03:50+0000,Home & Garden,"Building a loft in Portland, Ore., suffused wi...",The New York Times,News,654
3,A writing table anchors a home built on a gran...,By Rachel Urquhart,article,A Simple Gift,"[{'name': 'persons', 'value': 'Urquhart, Rache...",This story begins with a wooden table. My gran...,Home,2015-01-01T00:03:53+0000,Home & Garden,A writing table anchors a home built on a gran...,The New York Times,News,1428
4,Farhad Manjoo picks four products from 2014 th...,By Farhad Manjoo,article,"Standouts in Tech: Drones, Virtual Reality, In...","[{'name': 'organizations', 'value': 'Oculus VR...",LOTS of cool new technology products come out ...,Business,2015-01-01T00:26:30+0000,Technology,Farhad Manjoo picks four products from 2014 th...,The New York Times,News,824


In [2]:
# check data types and counts for each column
# (info() prints each column's non-null count and dtype, plus memory usage)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210165 entries, 0 to 210164
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   abstract          210137 non-null  object
 1   byline            182310 non-null  object
 2   document_type     210165 non-null  object
 3   headline          210165 non-null  object
 4   keywords          210165 non-null  object
 5   lead_paragraph    208841 non-null  object
 6   news_desk         201709 non-null  object
 7   pub_date          210165 non-null  object
 8   section_name      210165 non-null  object
 9   snippet           209808 non-null  object
 10  source            210165 non-null  object
 11  type_of_material  209410 non-null  object
 12  word_count        210165 non-null  int64 
dtypes: int64(1), object(12)
memory usage: 20.8+ MB


In [3]:
# check size of dataframe as (row count, column count)
df.shape

(210165, 13)

In [4]:
# Collect the distinct news desk labels present in the raw data
# (missing values show up as nan in the result).
total_nd_options = df.loc[:, "news_desk"]

total_nd_options.unique()

array(['Home & Garden', 'Home', 'Business', 'Metro', 'Culture', 'Sports',
       'Letters', 'Opinion', 'OpEd', 'National', 'Foreign', 'Editorial',
       'Business Day', 'World', 'Summary', 'Arts', 'Weekend', 'Food',
       'Dining', 'Arts&Leisure', 'Styles', 'Upshot', 'TStyle', 'Travel',
       'RealEstate', nan, 'U.S.', 'Obituaries', 'Obits', 'New York',
       'TLiving', 'NewsDesk', 'NYTNow', 'Movies', 'Magazine',
       'Real Estate', 'BookReview', 'Politics', 'SundayBusiness',
       'Science', 'Society', 'Blogs', 'Fashion & Style', 'Sunday Review',
       'NYT Now', 'The Upshot', 'Circuits', 'T Magazine', 'TCulture',
       'Automobiles', 'Health', 'NODESK', 'Books', 'Theater',
       'Your Money', 'Style', 'Great Homes & Destinations', 'EdLife',
       'Education', 'TDesign', 'Technology', 'Admin', 'Insider',
       'Fashion', 'Universal', 'Dealbook', 'Multimedia/Photos',
       'Crosswords & Games', 'TTravel', 'Washington', 'Media', 'Homepage',
       'Times Insider', 'Metropol

In [5]:
# Collect the distinct section name labels present in the raw data.
total_section_options = df.loc[:, "section_name"]

total_section_options.unique()

array(['Home & Garden', 'Technology', 'New York', 'Arts', 'Sports',
       'Opinion', 'U.S.', 'World', 'Business Day', 'Health', 'Movies',
       'Crosswords & Games', 'The Upshot', 'Corrections', 'Automobiles',
       'Food', 'Theater', 'T Magazine', 'Travel', 'Real Estate', 'Blogs',
       'Books', 'Obituaries', 'Multimedia/Photos', 'NYT Now', 'Education',
       'Your Money', 'Magazine', 'Science', 'Fashion & Style', 'Style',
       'Sunday Review', 'Job Market', 'Public Editor', 'Times Topics',
       'Great Homes & Destinations', 'Admin', 'Universal',
       'Times Insider', 'International Home', 'Topics', 'Homepage',
       'Today’s Paper', 'Giving', 'The Learning Network', 'membercenter',
       'Archives', 'Well', 'Podcasts', 'Briefing', 'Watching', 'Climate',
       'Smarter Living', 'T Brand', 'Reader Center', 'Slideshows'],
      dtype=object)

### List of news desks and sections whose headlines we're using for sentiment analysis
News Desk
- Business
- Business Day
- Arts
- Arts&Leisure
- Climate
- Education
- Health
- Media
- Movies
- National
- Politics
- Science
- Society
- Technology
- Theater
- U.S.

Section Names
- Arts
- Business Day
- Education
- Health
- Movies
- Multimedia/Photos
- Science
- Technology
- Theater
- U.S.

In [6]:
# Keep only the rows whose news desk is one of the desks of interest.
# isin() is False for nan, so rows with a missing news desk are excluded here too.
desks_to_keep = [
    "Arts", "Arts&Leisure", "Business", "Business Day", "Climate",
    "Education", "Health", "Media", "Movies", "National", "Politics",
    "Science", "Society", "Technology", "Theater", "U.S.",
]
nd_filter = df["news_desk"].isin(desks_to_keep)
news_desk_df = df.loc[nd_filter]

news_desk_df.head()

Unnamed: 0,abstract,byline,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,source,type_of_material,word_count
4,Farhad Manjoo picks four products from 2014 th...,By Farhad Manjoo,article,"Standouts in Tech: Drones, Virtual Reality, In...","[{'name': 'organizations', 'value': 'Oculus VR...",LOTS of cool new technology products come out ...,Business,2015-01-01T00:26:30+0000,Technology,Farhad Manjoo picks four products from 2014 th...,The New York Times,News,824
18,Representative Steve Scalise’s effort to expla...,By Jeremy Alford,article,Much of David Duke’s ’91 Campaign Is Now in Lo...,"[{'name': 'persons', 'value': 'Alford, Jeremy'...","BATON ROUGE, La. — David Duke seems a figure f...",National,2015-01-01T01:36:18+0000,U.S.,Representative Steve Scalise’s effort to expla...,The New York Times,News,1293
24,"By Jan. 1, 29 states and the District of Colum...",,multimedia,A Guide to Minimum Wage Increases at the State...,"[{'name': 'subject', 'value': 'Minimum Wage', ...","By Jan. 1, 29 states and the District of Colum...",Business Day,2015-01-01T01:45:03+0000,Business Day,"By Jan. 1, 29 states and the District of Colum...",The New York Times,Interactive Feature,0
25,Minimum wage increases go into effect in 20 st...,By Rachel Abrams,article,"States’ Minimum Wages Rise, Helping Millions o...","[{'name': 'subject', 'value': 'Minimum Wage', ...","For some low-wage workers, everyday tasks like...",Business,2015-01-01T01:45:10+0000,Business Day,Minimum wage increases go into effect in 20 st...,The New York Times,News,1017
30,A new job title — chief of laboratory safety —...,By Donald G. McNeil Jr,article,New C.D.C. Job Overseeing Laboratory Safety,"[{'name': 'persons', 'value': 'McNeil, Donald ...",A new job title — chief of laboratory safety —...,National,2015-01-01T01:57:38+0000,Health,A new job title — chief of laboratory safety —...,The New York Times,Brief,129


In [7]:
# check unique values for news_desk column
# (to confirm the filter kept only the listed desks)
news_desk_df["news_desk"].unique()

array(['Business', 'National', 'Business Day', 'Arts', 'Arts&Leisure',
       'U.S.', 'Movies', 'Politics', 'Science', 'Society', 'Health',
       'Theater', 'Education', 'Technology', 'Media', 'Climate'],
      dtype=object)

In [8]:
# check unique values for section_name column
# (the filter was on news_desk, so section names outside the target list can remain)
news_desk_df["section_name"].unique()

array(['Technology', 'U.S.', 'Business Day', 'Health',
       'Crosswords & Games', 'The Upshot', 'Automobiles', 'Arts',
       'Movies', 'Theater', 'Your Money', 'Science', 'Fashion & Style',
       'Education', 'Real Estate', 'Food', 'World', 'Style', 'New York',
       'Books', 'Universal', 'Job Market', 'Obituaries', 'Magazine',
       'Sports', 'NYT Now', 'Opinion', 'Times Insider', 'Giving',
       'Sunday Review', 'Well', 'Watching', 'Climate', 'Travel',
       'Smarter Living', 'Reader Center'], dtype=object)

In [9]:
# Find the index labels of rows filed under the "Crosswords & Games" section,
# then drop those rows into a new frame.
is_crossword = news_desk_df["section_name"] == "Crosswords & Games"
cw_games = news_desk_df.loc[is_crossword].index

updated_df = news_desk_df.drop(index=cw_games)

In [10]:
# Drop the byline column - it only contains the name of the person who wrote
# the article and contains nulls.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run after the column is already gone.
updated_df = updated_df.drop(columns=["byline"], errors="ignore")

In [11]:
# Drop the source column - all sources are from the NYT.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run after the column is already gone.
updated_df = updated_df.drop(columns=["source"], errors="ignore")

In [12]:
# check non-null count after filtering by news desk, dropping cw & games rows,
# and removing the byline and source columns

updated_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55931 entries, 4 to 210164
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   abstract          55927 non-null  object
 1   document_type     55931 non-null  object
 2   headline          55931 non-null  object
 3   keywords          55931 non-null  object
 4   lead_paragraph    55713 non-null  object
 5   news_desk         55931 non-null  object
 6   pub_date          55931 non-null  object
 7   section_name      55931 non-null  object
 8   snippet           55926 non-null  object
 9   type_of_material  55931 non-null  object
 10  word_count        55931 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 5.1+ MB


In [13]:
# check row count (shape is (rows, columns))
updated_df.shape

(55931, 11)

In [14]:
# Show every row whose abstract is missing.
null_abst = updated_df["abstract"].isna()

updated_df.loc[null_abst]

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
41215,,multimedia,Why Puerto Rico’s Debt Is So Big,[],,U.S.,2015-07-04T01:13:04+0000,U.S.,,Interactive Feature,0
77052,,multimedia,Carly Fiorina on Climate Change,[],,U.S.,2015-12-15T05:15:21+0000,U.S.,,Interactive Feature,0
109063,,multimedia,Seeking Pluto’s Frigid Heart,"[{'name': 'subject', 'value': 'Pluto (Dwarf Pl...",,Science,2016-05-19T14:45:41+0000,Science,,Interactive Feature,0
151526,,multimedia,It ‘Falls Short in Every Respect’: Ethics Expe...,"[{'name': 'persons', 'value': 'Trump, Donald J...",,U.S.,2017-01-12T19:54:44+0000,U.S.,,Interactive Feature,0


### Note: the null abstracts all have a document type of multimedia - not articles

In [15]:
# Show every row whose snippet is missing.
null_snippet = updated_df["snippet"].isna()

updated_df.loc[null_snippet]

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
5257,Remember when Jack White used to hate hip-hop?...,article,Run the Jewels to Join Jack White,"[{'name': 'subject', 'value': 'Rap and Hip-Hop...",Remember when Jack White used to hate hip-hop?...,Arts&Leisure,2015-01-24T20:15:07+0000,Arts,,News,140
41215,,multimedia,Why Puerto Rico’s Debt Is So Big,[],,U.S.,2015-07-04T01:13:04+0000,U.S.,,Interactive Feature,0
77052,,multimedia,Carly Fiorina on Climate Change,[],,U.S.,2015-12-15T05:15:21+0000,U.S.,,Interactive Feature,0
109063,,multimedia,Seeking Pluto’s Frigid Heart,"[{'name': 'subject', 'value': 'Pluto (Dwarf Pl...",,Science,2016-05-19T14:45:41+0000,Science,,Interactive Feature,0
151526,,multimedia,It ‘Falls Short in Every Respect’: Ethics Expe...,"[{'name': 'persons', 'value': 'Trump, Donald J...",,U.S.,2017-01-12T19:54:44+0000,U.S.,,Interactive Feature,0


### Note: 4/5 null rows in snippet column have document type as multimedia, no lead paragraph - 0 word count 

In [16]:
# Show every row whose lead paragraph is missing.
null_lead = updated_df["lead_paragraph"].isna()

updated_df.loc[null_lead]

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
282,You can adjust your settings so that only cert...,article,Sharing Flickr Photos Privately,"[{'name': 'subject', 'value': 'Computers and t...",,Business,2015-01-02T17:09:23+0000,Technology,You can adjust your settings so that only cert...,Question,520
548,"On Monday, automakers reported strong December...",article,"December Auto Sales, and Job Data for U.S. and...","[{'name': 'subject', 'value': 'United States E...",,Business,2015-01-04T22:23:05+0000,Business Day,"On Monday, automakers reported strong December...",News,463
2809,"Free, reputable programs are available to sear...",article,Freeing a PC From Malware,"[{'name': 'persons', 'value': 'Biersdorfer, J ...",,Business,2015-01-14T18:19:34+0000,Technology,"Free, reputable programs are available to sear...",Question,618
4927,How to keep a passing interest from following ...,article,Sharing iTunes With the Whole Family,"[{'name': 'subject', 'value': 'Mobile Applicat...",,Business,2015-01-23T14:02:38+0000,Technology,How to keep a passing interest from following ...,Question,611
8279,Valentine’s Day is recast as a group event.,article,Seizing the Day With Others,"[{'name': 'subject', 'value': ""Valentine's Day...",,Society,2015-02-06T20:18:49+0000,Fashion & Style,Valentine’s Day is recast as a group event.,News,1116
9159,Tips on how to transfer photographs into iPhot...,article,Importing Images to iPhoto,"[{'name': 'subject', 'value': 'Smartphones', '...",,Business,2015-02-11T14:35:37+0000,Technology,Tips on how to transfer photographs into iPhot...,Question,611
12444,"SFX disclosed that its founder, Robert F.X. Si...",article,Sillerman of SFX Proposes Taking the Concert C...,"[{'name': 'organizations', 'value': 'SFX Enter...",,Business,2015-02-25T18:02:58+0000,Business Day,"SFX disclosed that its founder, Robert F.X. Si...",News,303
12543,"At a two-year community college, textbooks can...",article,Putting a Dent in College Costs With Open-Sour...,"[{'name': 'subject', 'value': 'Textbooks', 'ra...",,Business,2015-02-25T23:04:48+0000,Your Money,"At a two-year community college, textbooks can...",News,756
12989,Learn about the differences between Amazon Fir...,article,Choosing Between Amazon’s TV Streaming Hardware,"[{'name': 'subject', 'value': 'Television Sets...",,Business,2015-02-27T15:06:31+0000,Technology,Learn about the differences between Amazon Fir...,Question,540
13492,The fatal shooting occurred Sunday in a homele...,article,Los Angeles Police Kill Man in Confrontation C...,"[{'name': 'glocations', 'value': 'Los Angeles ...",,National,2015-03-02T05:16:48+0000,U.S.,The fatal shooting occurred Sunday in a homele...,News,562


In [17]:
# check document types (distinct values in the document_type column)

updated_df["document_type"].unique()

array(['article', 'multimedia'], dtype=object)

### Note: document type values are article or multimedia

In [18]:
# Count the rows for each document type.
doc_type_groups = updated_df.groupby("document_type")

doc_type_groups["document_type"].count()

document_type
article       50740
multimedia     5191
Name: document_type, dtype: int64

In [19]:
# check all available types of material (distinct values in type_of_material)

updated_df["type_of_material"].unique()

array(['News', 'Interactive Feature', 'Brief', 'Slideshow',
       'Obituary (Obit)', 'Question', 'Video', 'Schedule', 'Review',
       'List', 'Letter', 'Interview', 'News Analysis', 'Text', 'Web Log',
       'Economic Analysis', 'Op-Ed', 'Editorial', 'Special Report',
       'recipe', 'briefing', 'Newsletter', 'Series', 'An Analysis'],
      dtype=object)

In [20]:
# Isolate the multimedia rows and list which material types they cover.
is_multimedia = updated_df["document_type"] == "multimedia"
multimedia = updated_df.loc[is_multimedia]

multimedia["type_of_material"].unique()

array(['Interactive Feature', 'Slideshow', 'Video'], dtype=object)

In [21]:
# preview the first rows of the multimedia subset
multimedia.head()

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
24,"By Jan. 1, 29 states and the District of Colum...",multimedia,A Guide to Minimum Wage Increases at the State...,"[{'name': 'subject', 'value': 'Minimum Wage', ...","By Jan. 1, 29 states and the District of Colum...",Business Day,2015-01-01T01:45:03+0000,Business Day,"By Jan. 1, 29 states and the District of Colum...",Interactive Feature,0
63,The music producer Philippe Zdar and his Paris...,multimedia,Inside the Motorbass studio,"[{'name': 'subject', 'value': 'Music', 'rank':...",The music producer Philippe Zdar and his Paris...,Arts,2015-01-01T12:29:24+0000,Arts,The music producer Philippe Zdar and his Paris...,Slideshow,0
93,A look at the new show at the American Folk Ar...,multimedia,‘A Shared Legacy: Folk Art in America’,"[{'name': 'subject', 'value': 'Art', 'rank': 1...",A look at the new show at the American Folk Ar...,Arts,2015-01-01T18:49:19+0000,Arts,A look at the new show at the American Folk Ar...,Slideshow,0
99,A look at the Guggenheim Museum’s new retrospe...,multimedia,"‘Painting as Process, Painting as Life’","[{'name': 'persons', 'value': 'Gaitonde, V S',...",A look at the Guggenheim Museum’s new retrospe...,Arts,2015-01-01T19:54:25+0000,Arts,A look at the Guggenheim Museum’s new retrospe...,Slideshow,0
162,Though homeless encampments elsewhere in the c...,multimedia,Hope for the Homeless,"[{'name': 'subject', 'value': 'Homeless Person...",Though homeless encampments elsewhere in the c...,U.S.,2015-01-02T01:14:03+0000,U.S.,Though homeless encampments elsewhere in the c...,Slideshow,0


In [22]:
# check max word count for multimedia document types
# (a max of 0 means no multimedia row has any counted words)

multimedia["word_count"].max()

0

In [23]:
### Note: Headlines with multimedia are not articles, dropping all rows (can revert if necessary)

# remove rows whose document type is "multimedia"
is_multimedia = updated_df["document_type"] == "multimedia"
mm_data = updated_df[is_multimedia].index

updated_df = updated_df.drop(index=mm_data)

In [24]:
# check non-null count after dropping multimedia rows
# (info() lists the remaining entries and per-column non-null counts)

updated_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50740 entries, 4 to 210164
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   abstract          50740 non-null  object
 1   document_type     50740 non-null  object
 2   headline          50740 non-null  object
 3   keywords          50740 non-null  object
 4   lead_paragraph    50526 non-null  object
 5   news_desk         50740 non-null  object
 6   pub_date          50740 non-null  object
 7   section_name      50740 non-null  object
 8   snippet           50739 non-null  object
 9   type_of_material  50740 non-null  object
 10  word_count        50740 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 4.6+ MB


In [25]:
# check (rows, columns) after dropping multimedia rows
updated_df.shape

(50740, 11)

In [26]:
# Look for very short "articles" (30 words or fewer) that may not be
# news-related articles.
max_words = 30
is_short = updated_df["word_count"] <= max_words
low_word_count = updated_df.loc[is_short]

low_word_count

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
17629,"Marriage and exercise, sexual desire and mosqu...",article,"The Weekly Health Quiz: Marriage, Memory and Sex","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-03-19T19:25:25+0000,Health,"Marriage and exercise, sexual desire and mosqu...",News,3
19378,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Upbeat Emotions, Male ...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-03-27T09:30:51+0000,Health,Test your knowledge of this week’s health news.,News,14
21022,How well do you know this week’s health news.,article,"The Weekly Health Quiz: Joni Mitchell, Fish Oi...",[],Take the Quiz,Science,2015-04-03T15:18:45+0000,Health,How well do you know this week’s health news.,News,3
22486,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Struck by Lightning, a...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-04-10T18:02:58+0000,Health,Test your knowledge of this week’s health news.,News,14
23991,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Exercise, Women and th...","[{'name': 'subject', 'value': 'Dogs', 'rank': ...",Take the Quiz,Science,2015-04-17T15:11:15+0000,Health,Test your knowledge of this week’s health news.,News,14
25452,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Runners, Rabies and Da...","[{'name': 'subject', 'value': 'Emotions', 'ran...",Take the Quiz,Science,2015-04-24T14:10:57+0000,Health,Test your knowledge of this week’s health news.,News,14
27014,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Calories, Gluten and a...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-05-01T15:02:11+0000,Health,Test your knowledge of this week’s health news.,News,3
28592,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Golf, Memory and Food ...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-05-08T14:44:42+0000,Health,Test your knowledge of this week’s health news.,News,14
30234,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Coffee, Work Breaks an...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-05-15T15:41:42+0000,Health,Test your knowledge of this week’s health news.,News,3
31702,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Probiotics, Belly Fat ...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-05-22T13:15:42+0000,Health,Test your knowledge of this week’s health news.,News,3


In [27]:
# Select the rows whose lead paragraph is exactly "Take the Quiz".
is_quiz = updated_df["lead_paragraph"] == "Take the Quiz"
th_quiz = updated_df.loc[is_quiz]

th_quiz

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
17629,"Marriage and exercise, sexual desire and mosqu...",article,"The Weekly Health Quiz: Marriage, Memory and Sex","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-03-19T19:25:25+0000,Health,"Marriage and exercise, sexual desire and mosqu...",News,3
19378,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Upbeat Emotions, Male ...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-03-27T09:30:51+0000,Health,Test your knowledge of this week’s health news.,News,14
21022,How well do you know this week’s health news.,article,"The Weekly Health Quiz: Joni Mitchell, Fish Oi...",[],Take the Quiz,Science,2015-04-03T15:18:45+0000,Health,How well do you know this week’s health news.,News,3
22486,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Struck by Lightning, a...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-04-10T18:02:58+0000,Health,Test your knowledge of this week’s health news.,News,14
23991,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Exercise, Women and th...","[{'name': 'subject', 'value': 'Dogs', 'rank': ...",Take the Quiz,Science,2015-04-17T15:11:15+0000,Health,Test your knowledge of this week’s health news.,News,14
25452,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Runners, Rabies and Da...","[{'name': 'subject', 'value': 'Emotions', 'ran...",Take the Quiz,Science,2015-04-24T14:10:57+0000,Health,Test your knowledge of this week’s health news.,News,14
27014,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Calories, Gluten and a...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-05-01T15:02:11+0000,Health,Test your knowledge of this week’s health news.,News,3
28592,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Golf, Memory and Food ...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-05-08T14:44:42+0000,Health,Test your knowledge of this week’s health news.,News,14
30234,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Coffee, Work Breaks an...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-05-15T15:41:42+0000,Health,Test your knowledge of this week’s health news.,News,3
31702,Test your knowledge of this week’s health news.,article,"The Weekly Health Quiz: Probiotics, Belly Fat ...","[{'name': 'subject', 'value': 'Medicine and He...",Take the Quiz,Science,2015-05-22T13:15:42+0000,Health,Test your knowledge of this week’s health news.,News,3


In [28]:
# number of "Take the Quiz" rows, as (rows, columns)
th_quiz.shape

(76, 11)

### Note: dropping rows with "take the quiz" - while document type shows as article, these are all interactive quizzes to test people on their knowledge of weekly health news

In [29]:
# Remove the health quiz rows: collect the index labels of rows whose lead
# paragraph is exactly "Take the Quiz", then drop them.
is_quiz = updated_df["lead_paragraph"] == "Take the Quiz"
quiz_rows = updated_df.loc[is_quiz].index

updated_df = updated_df.drop(index=quiz_rows)

In [30]:
# Preview the rows whose headline contains "First Draft Focus".
# regex=False treats the text as a literal substring; na=False counts a missing
# headline as "no match" so the mask stays purely boolean.
is_first_draft = updated_df["headline"].str.contains(
    "First Draft Focus", regex=False, na=False
)
first_draft = updated_df[is_first_draft]

first_draft

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
33202,Political pictures from the last week selected...,article,First Draft Focus: The Week in Political Pictures,[],,Politics,2015-05-29T20:18:06+0000,U.S.,Political pictures from the last week selected...,News,1
33909,Ina Bass and Elsie Shemin-Roth accepted the Me...,article,First Draft Focus: ‘Never Too Late to Say Than...,"[{'name': 'subject', 'value': 'Medal of Honor ...",,Politics,2015-06-02T20:51:11+0000,U.S.,Ina Bass and Elsie Shemin-Roth accepted the Me...,News,1
34199,"Martin O’Malley, the former governor of Maryla...",article,First Draft Focus: Martin O’Malley on the Stump,"[{'name': 'subject', 'value': 'Presidential El...",,Politics,2015-06-03T23:07:22+0000,U.S.,"Martin O’Malley, the former governor of Maryla...",News,1
34309,Mourners lined up to sign a condolence book in...,article,First Draft Focus: Saying Goodbye to Beau Biden,"[{'name': 'subject', 'value': 'Capitol Buildin...",,Politics,2015-06-04T12:19:21+0000,U.S.,Mourners lined up to sign a condolence book in...,News,1
34756,"Hillary Rodham Clinton is honored in Texas, Se...",article,First Draft Focus: The Week in Political Pictures,"[{'name': 'persons', 'value': 'Biden, Joseph R...",,Politics,2015-06-05T20:59:11+0000,U.S.,"Hillary Rodham Clinton is honored in Texas, Se...",News,1
35210,"Family members got together in Kennebunkport, ...",article,First Draft Focus: Bush Family Reunion in Maine,"[{'name': 'persons', 'value': 'Bush, Barbara',...",,Politics,2015-06-08T16:31:29+0000,U.S.,"Family members got together in Kennebunkport, ...",News,1
35481,A Secret Service police officer oversaw an eva...,article,First Draft Focus: Secret Service Evacuates W...,"[{'name': 'subject', 'value': 'White House Bui...",,Politics,2015-06-09T19:03:44+0000,U.S.,A Secret Service police officer oversaw an eva...,News,1
36364,"The Group of 7 meets in the Bavarian Alps, see...",article,First Draft Focus: The Week in Political Pictures,"[{'name': 'persons', 'value': 'Boehner, John A...",,Politics,2015-06-12T20:16:53+0000,U.S.,"The Group of 7 meets in the Bavarian Alps, see...",News,1
36889,Hillary Rodham Clinton read “The Very Hungry C...,article,First Draft Focus: Story Time With Hillary,"[{'name': 'subject', 'value': 'Presidential El...",,Politics,2015-06-15T19:30:12+0000,U.S.,Hillary Rodham Clinton read “The Very Hungry C...,News,1
36915,Mr. Bush formally announced his presidential c...,article,First Draft Focus: Jeb Bush Makes It Official,"[{'name': 'subject', 'value': 'Presidential El...","Jeb Bush Announces White House Bid, Saying ‘Am...",Politics,2015-06-15T20:49:41+0000,U.S.,Mr. Bush formally announced his presidential c...,News,10


In [31]:
# number of "First Draft Focus" rows, as (rows, columns)
first_draft.shape

(92, 11)

### Note: dropping rows with "first draft focus" - while document type shows as article, these are all individual photos with a short description beneath

In [32]:
# Drop rows whose headline contains "First Draft Focus".
# regex=False treats the text as a literal substring; na=False counts a missing
# headline as "no match" so the mask stays purely boolean.
is_first_draft = updated_df["headline"].str.contains(
    "First Draft Focus", regex=False, na=False
)
fdf_rows = updated_df[is_first_draft].index

updated_df = updated_df.drop(index=fdf_rows)

In [33]:
# check non-null counts after dropping the quiz and "First Draft Focus" rows
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50572 entries, 4 to 210164
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   abstract          50572 non-null  object
 1   document_type     50572 non-null  object
 2   headline          50572 non-null  object
 3   keywords          50572 non-null  object
 4   lead_paragraph    50445 non-null  object
 5   news_desk         50572 non-null  object
 6   pub_date          50572 non-null  object
 7   section_name      50572 non-null  object
 8   snippet           50571 non-null  object
 9   type_of_material  50572 non-null  object
 10  word_count        50572 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 4.6+ MB


In [34]:
# Article count for every (news desk, section name) pair.
desk_section_groups = updated_df.groupby(["news_desk", "section_name"])

desk_section_groups["section_name"].count()

news_desk     section_name   
Arts&Leisure  Arts                1960
              Books                  4
              Movies               794
              Theater              555
              Watching              37
Business      Arts                  23
              Automobiles          273
              Books                  8
              Business Day       13814
              Climate                8
              Education             39
              Fashion & Style       24
              Food                   8
              Giving                13
              Health                31
              Job Market             4
              Movies                 7
              New York               5
              Obituaries             6
              Opinion                1
              Reader Center          3
              Real Estate          353
              Science               41
              Smarter Living        19
              Sports              

In [35]:
# reset index so the labels run consecutively from 0 again after the row drops;
# drop=True discards the old labels instead of keeping them as a column

updated_df = updated_df.reset_index(drop=True)

updated_df.head()

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
0,Farhad Manjoo picks four products from 2014 th...,article,"Standouts in Tech: Drones, Virtual Reality, In...","[{'name': 'organizations', 'value': 'Oculus VR...",LOTS of cool new technology products come out ...,Business,2015-01-01T00:26:30+0000,Technology,Farhad Manjoo picks four products from 2014 th...,News,824
1,Representative Steve Scalise’s effort to expla...,article,Much of David Duke’s ’91 Campaign Is Now in Lo...,"[{'name': 'persons', 'value': 'Alford, Jeremy'...","BATON ROUGE, La. — David Duke seems a figure f...",National,2015-01-01T01:36:18+0000,U.S.,Representative Steve Scalise’s effort to expla...,News,1293
2,Minimum wage increases go into effect in 20 st...,article,"States’ Minimum Wages Rise, Helping Millions o...","[{'name': 'subject', 'value': 'Minimum Wage', ...","For some low-wage workers, everyday tasks like...",Business,2015-01-01T01:45:10+0000,Business Day,Minimum wage increases go into effect in 20 st...,News,1017
3,A new job title — chief of laboratory safety —...,article,New C.D.C. Job Overseeing Laboratory Safety,"[{'name': 'persons', 'value': 'McNeil, Donald ...",A new job title — chief of laboratory safety —...,National,2015-01-01T01:57:38+0000,Health,A new job title — chief of laboratory safety —...,Brief,129
4,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",article,Massachusetts: New Effort to Move Bombings Trial,"[{'name': 'subject', 'value': 'Boston Marathon...","Lawyers for Dzhokhar Tsarnaev, the defendant i...",National,2015-01-01T01:59:29+0000,U.S.,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",Brief,145


In [36]:
# Remove the time from the pub_date column: keep only the text before the "T"
# separator. Splitting once and taking element 0 of each result avoids building
# an intermediate DataFrame just to read its first column.
updated_df["pub_date"] = updated_df["pub_date"].str.split("T", n=1).str[0]

updated_df.head()

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
0,Farhad Manjoo picks four products from 2014 th...,article,"Standouts in Tech: Drones, Virtual Reality, In...","[{'name': 'organizations', 'value': 'Oculus VR...",LOTS of cool new technology products come out ...,Business,2015-01-01,Technology,Farhad Manjoo picks four products from 2014 th...,News,824
1,Representative Steve Scalise’s effort to expla...,article,Much of David Duke’s ’91 Campaign Is Now in Lo...,"[{'name': 'persons', 'value': 'Alford, Jeremy'...","BATON ROUGE, La. — David Duke seems a figure f...",National,2015-01-01,U.S.,Representative Steve Scalise’s effort to expla...,News,1293
2,Minimum wage increases go into effect in 20 st...,article,"States’ Minimum Wages Rise, Helping Millions o...","[{'name': 'subject', 'value': 'Minimum Wage', ...","For some low-wage workers, everyday tasks like...",Business,2015-01-01,Business Day,Minimum wage increases go into effect in 20 st...,News,1017
3,A new job title — chief of laboratory safety —...,article,New C.D.C. Job Overseeing Laboratory Safety,"[{'name': 'persons', 'value': 'McNeil, Donald ...",A new job title — chief of laboratory safety —...,National,2015-01-01,Health,A new job title — chief of laboratory safety —...,Brief,129
4,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",article,Massachusetts: New Effort to Move Bombings Trial,"[{'name': 'subject', 'value': 'Boston Marathon...","Lawyers for Dzhokhar Tsarnaev, the defendant i...",National,2015-01-01,U.S.,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",Brief,145


In [37]:
# convert to CSV: write the cleaned frame to files/headlines.csv;
# index=False leaves the row index out of the file

updated_df.to_csv("files/headlines.csv", index=False)