## Imports

In [70]:
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests

## Query The Data Through The NYT API. 🗞

In [71]:
def execute(year, month, api_key=None, timeout=30):
  """Fetch one month of articles from the NYT Archive API.

  Args:
    year: Year to query, e.g. 2022 (interpolated into the URL as-is).
    month: Month to query, e.g. 6 (interpolated into the URL as-is).
    api_key: NYT API key. When omitted, the NYT_API_KEY environment
      variable is used, so the key never has to be saved in the notebook.
    timeout: Seconds passed to requests.get as its timeout.

  Returns:
    The requests.Response object, unmodified; the caller checks the status
    and decodes the JSON body.

  Raises:
    RuntimeError: If no key was passed and NYT_API_KEY is not set.
  """
  api_key = api_key or os.environ.get("NYT_API_KEY")
  if not api_key:
    raise RuntimeError(
      "No NYT API key found: pass api_key=... or set the NYT_API_KEY environment variable."
    )

  request_url = f"https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"
  request_headers = {"Accept": "application/json"}

  # The key travels as a query parameter rather than being baked into the URL string.
  return requests.get(
    request_url,
    headers=request_headers,
    params={"api-key": api_key},
    timeout=timeout,
  )

# NOTE(review): inside a notebook kernel __name__ is "__main__", so this smoke
# call runs and its response is discarded.
if __name__ == "__main__":
  execute(2022, 6)


## Checking Our Response

In [72]:
# Fetch June 2022 and print the Response object to eyeball the HTTP status.
response = execute(year=2022, month=6)
print(response)

<Response [200]>


## Bringing the data into a DataFrame

**I have no systematic way of bringing API data into a pandas DataFrame. I typically explore the raw data as much as I can to understand its structure, and then load it into a DataFrame.**

In [73]:
# Decode the body once, then peek at the first five article records.
json_response = response.json()
for doc in json_response["response"]["docs"][:5]:
  print(doc)

{'abstract': 'A senior administration official said the rocket system was provided only after direct assurances by Ukraine’s leaders that they would not use it against targets in Russian territory.', 'web_url': 'https://www.nytimes.com/2022/05/31/us/politics/biden-ukraine-rockets.html', 'snippet': 'A senior administration official said the rocket system was provided only after direct assurances by Ukraine’s leaders that they would not use it against targets in Russian territory.', 'lead_paragraph': 'WASHINGTON — The United States will send Ukraine advanced rocket systems and munitions as part of a new $700 million package of military equipment intended to help the Ukrainians fight back against the Russian invasion of their country, President Biden and White House officials said on Tuesday.', 'print_section': 'A', 'print_page': '9', 'source': 'The New York Times', 'multimedia': [{'rank': 0, 'subtype': 'xlarge', 'caption': None, 'credit': None, 'type': 'image', 'url': 'images/2022/05/31/

In [74]:
# One row per article record; nested values (dicts/lists) stay as Python objects in their cells.
docs = json_response["response"]["docs"]
df = pd.DataFrame.from_dict(docs)
df.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,subsection_name,byline,type_of_material,_id,word_count,uri
0,A senior administration official said the rock...,https://www.nytimes.com/2022/05/31/us/politics...,A senior administration official said the rock...,WASHINGTON — The United States will send Ukrai...,A,9.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'U.S. to Send Ukraine $700 Million in...,"[{'name': 'persons', 'value': 'Biden, Joseph R...",2022-06-01T00:05:37+0000,article,Washington,U.S.,Politics,"{'original': 'By Michael D. Shear', 'person': ...",News,nyt://article/09f1e20a-5d5b-5908-875b-830f45c7...,964,nyt://article/09f1e20a-5d5b-5908-875b-830f45c7...
1,The 47-year-old man frequently swam into a lak...,https://www.nytimes.com/2022/05/31/us/alligato...,The 47-year-old man frequently swam into a lak...,The body of a man who had been retrieving Fris...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Alligator Kills Florida Man Retrievi...,"[{'name': 'subject', 'value': 'Alligators', 'r...",2022-06-01T00:42:23+0000,article,Express,U.S.,,"{'original': 'By Alex Traub', 'person': [{'fir...",News,nyt://article/7e2ae77b-4d75-5b1b-9773-51f37d75...,683,nyt://article/7e2ae77b-4d75-5b1b-9773-51f37d75...
2,Corrections that appeared in print on Wednesda...,https://www.nytimes.com/2022/05/31/pageoneplus...,Corrections that appeared in print on Wednesda...,An article on Tuesday about a draft of an agre...,A,21.0,The New York Times,[],"{'main': 'Corrections: June 1, 2022', 'kicker'...",[],2022-06-01T00:54:49+0000,article,Corrections,Corrections,,"{'original': '', 'person': [], 'organization':...",Correction,nyt://article/0b55b248-0bfe-503d-8bcc-18f5d33a...,440,nyt://article/0b55b248-0bfe-503d-8bcc-18f5d33a...
3,"Quotation of the Day for Wednesday, June 1, 2022.",https://www.nytimes.com/2022/05/31/todayspaper...,"Quotation of the Day for Wednesday, June 1, 2022.","“It’s like saying to the world, ‘look, here we...",A,3.0,The New York Times,[],"{'main': 'Quotation of the Day: Allinllachu, W...",[],2022-06-01T00:55:52+0000,article,Summary,Today’s Paper,,"{'original': '', 'person': [], 'organization':...",Quote,nyt://article/bccb0fd1-5bbe-5889-a2f6-36385173...,30,nyt://article/bccb0fd1-5bbe-5889-a2f6-36385173...
4,A judge ruled that Ms. Palin failed to introdu...,https://www.nytimes.com/2022/05/31/business/sa...,A judge ruled that Ms. Palin failed to introdu...,Sarah Palin lost her bid for a new trial in he...,B,4.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Sarah Palin’s Bid for New Libel Tria...,"[{'name': 'subject', 'value': 'Libel and Sland...",2022-06-01T01:14:09+0000,article,Business,Business Day,,"{'original': 'By Sheera Frenkel', 'person': [{...",News,nyt://article/a82cb17f-3a34-544d-8f6d-66351480...,534,nyt://article/a82cb17f-3a34-544d-8f6d-66351480...


### Deleting what I do not need. 

In [75]:
# multimedia, byline won't be needed for my purposes (feel free to keep whatever data you want)
# uri and _id are the same so I will just keep the _id and put it in our first column
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second run finds the columns already gone and does nothing instead of raising KeyError.
df = df.drop(columns=['multimedia', 'byline', 'uri'], errors='ignore')


### Reordering to make it more readable.

In [76]:
# Print the current column names as a plain list, handy as a starting point for a new order.
cols = [*df.columns]
print(cols)

['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'subsection_name', 'type_of_material', '_id', 'word_count']


In [77]:
# https://www.datasciencemadesimple.com/re-arrange-or-re-order-the-column-of-dataframe-in-pandas-python-2/ --> helpful article to reorder columns
# _id goes first and the nested headline/keywords columns go last.
columns_reordered = [
  '_id',
  'abstract',
  'web_url',
  'snippet',
  'lead_paragraph',
  'print_section',
  'print_page',
  'source',
  'pub_date',
  'document_type',
  'type_of_material',
  'word_count',
  'news_desk',
  'section_name',
  'subsection_name',
  'headline',
  'keywords',
]
df2 = df.reindex(columns=columns_reordered)
df2.head()

Unnamed: 0,_id,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,pub_date,document_type,type_of_material,word_count,news_desk,section_name,subsection_name,headline,keywords
0,nyt://article/09f1e20a-5d5b-5908-875b-830f45c7...,A senior administration official said the rock...,https://www.nytimes.com/2022/05/31/us/politics...,A senior administration official said the rock...,WASHINGTON — The United States will send Ukrai...,A,9.0,The New York Times,2022-06-01T00:05:37+0000,article,News,964,Washington,U.S.,Politics,{'main': 'U.S. to Send Ukraine $700 Million in...,"[{'name': 'persons', 'value': 'Biden, Joseph R..."
1,nyt://article/7e2ae77b-4d75-5b1b-9773-51f37d75...,The 47-year-old man frequently swam into a lak...,https://www.nytimes.com/2022/05/31/us/alligato...,The 47-year-old man frequently swam into a lak...,The body of a man who had been retrieving Fris...,,,The New York Times,2022-06-01T00:42:23+0000,article,News,683,Express,U.S.,,{'main': 'Alligator Kills Florida Man Retrievi...,"[{'name': 'subject', 'value': 'Alligators', 'r..."
2,nyt://article/0b55b248-0bfe-503d-8bcc-18f5d33a...,Corrections that appeared in print on Wednesda...,https://www.nytimes.com/2022/05/31/pageoneplus...,Corrections that appeared in print on Wednesda...,An article on Tuesday about a draft of an agre...,A,21.0,The New York Times,2022-06-01T00:54:49+0000,article,Correction,440,Corrections,Corrections,,"{'main': 'Corrections: June 1, 2022', 'kicker'...",[]
3,nyt://article/bccb0fd1-5bbe-5889-a2f6-36385173...,"Quotation of the Day for Wednesday, June 1, 2022.",https://www.nytimes.com/2022/05/31/todayspaper...,"Quotation of the Day for Wednesday, June 1, 2022.","“It’s like saying to the world, ‘look, here we...",A,3.0,The New York Times,2022-06-01T00:55:52+0000,article,Quote,30,Summary,Today’s Paper,,"{'main': 'Quotation of the Day: Allinllachu, W...",[]
4,nyt://article/a82cb17f-3a34-544d-8f6d-66351480...,A judge ruled that Ms. Palin failed to introdu...,https://www.nytimes.com/2022/05/31/business/sa...,A judge ruled that Ms. Palin failed to introdu...,Sarah Palin lost her bid for a new trial in he...,B,4.0,The New York Times,2022-06-01T01:14:09+0000,article,News,534,Business,Business Day,,{'main': 'Sarah Palin’s Bid for New Libel Tria...,"[{'name': 'subject', 'value': 'Libel and Sland..."


### Changing my datetime column to a date column.

In [78]:
# First turning the column into a datetime column.
# The values are read from df (same row index as df2), so re-running this cell restores
# full timestamps even after df2['pub_date'] has been reduced to plain dates.
df2['pub_date'] = pd.to_datetime(df['pub_date'])
# info() prints its report itself and returns None; wrapping it in print() appended a stray "None".
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4374 entries, 0 to 4373
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   _id               4374 non-null   object             
 1   abstract          4374 non-null   object             
 2   web_url           4374 non-null   object             
 3   snippet           4374 non-null   object             
 4   lead_paragraph    4374 non-null   object             
 5   print_section     2359 non-null   object             
 6   print_page        2359 non-null   object             
 7   source            4374 non-null   object             
 8   pub_date          4374 non-null   datetime64[ns, UTC]
 9   document_type     4374 non-null   object             
 10  type_of_material  4374 non-null   object             
 11  word_count        4374 non-null   int64              
 12  news_desk         4374 non-null   object             
 13  sec

In [79]:
# Changing my datetime column to a date column
# (.dt.date keeps only the calendar date of each timestamp).
pub_dates = pd.to_datetime(df2["pub_date"])
df2["pub_date"] = pub_dates.dt.date
df2.head()

Unnamed: 0,_id,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,pub_date,document_type,type_of_material,word_count,news_desk,section_name,subsection_name,headline,keywords
0,nyt://article/09f1e20a-5d5b-5908-875b-830f45c7...,A senior administration official said the rock...,https://www.nytimes.com/2022/05/31/us/politics...,A senior administration official said the rock...,WASHINGTON — The United States will send Ukrai...,A,9.0,The New York Times,2022-06-01,article,News,964,Washington,U.S.,Politics,{'main': 'U.S. to Send Ukraine $700 Million in...,"[{'name': 'persons', 'value': 'Biden, Joseph R..."
1,nyt://article/7e2ae77b-4d75-5b1b-9773-51f37d75...,The 47-year-old man frequently swam into a lak...,https://www.nytimes.com/2022/05/31/us/alligato...,The 47-year-old man frequently swam into a lak...,The body of a man who had been retrieving Fris...,,,The New York Times,2022-06-01,article,News,683,Express,U.S.,,{'main': 'Alligator Kills Florida Man Retrievi...,"[{'name': 'subject', 'value': 'Alligators', 'r..."
2,nyt://article/0b55b248-0bfe-503d-8bcc-18f5d33a...,Corrections that appeared in print on Wednesda...,https://www.nytimes.com/2022/05/31/pageoneplus...,Corrections that appeared in print on Wednesda...,An article on Tuesday about a draft of an agre...,A,21.0,The New York Times,2022-06-01,article,Correction,440,Corrections,Corrections,,"{'main': 'Corrections: June 1, 2022', 'kicker'...",[]
3,nyt://article/bccb0fd1-5bbe-5889-a2f6-36385173...,"Quotation of the Day for Wednesday, June 1, 2022.",https://www.nytimes.com/2022/05/31/todayspaper...,"Quotation of the Day for Wednesday, June 1, 2022.","“It’s like saying to the world, ‘look, here we...",A,3.0,The New York Times,2022-06-01,article,Quote,30,Summary,Today’s Paper,,"{'main': 'Quotation of the Day: Allinllachu, W...",[]
4,nyt://article/a82cb17f-3a34-544d-8f6d-66351480...,A judge ruled that Ms. Palin failed to introdu...,https://www.nytimes.com/2022/05/31/business/sa...,A judge ruled that Ms. Palin failed to introdu...,Sarah Palin lost her bid for a new trial in he...,B,4.0,The New York Times,2022-06-01,article,News,534,Business,Business Day,,{'main': 'Sarah Palin’s Bid for New Libel Tria...,"[{'name': 'subject', 'value': 'Libel and Sland..."


### Checking for missing values

I check for missing values to further understand my data. The next step will be to visualize my data to see what I am working with. 

In [80]:
# One "<column> : <missing count>" line per column of df2.
for column in df2.columns:
  missing = df2[column].isna().sum()
  print(f"{column} : {missing}")

_id : 0
abstract : 0
web_url : 0
snippet : 0
lead_paragraph : 0
print_section : 2015
print_page : 2015
source : 0
pub_date : 0
document_type : 0
type_of_material : 0
word_count : 0
news_desk : 0
section_name : 0
subsection_name : 2366
headline : 0
keywords : 0


## Time for some visualizations. 📊

### Histograms to visualize the count of values within our columns

Let's create some functions since we will be reusing them.

In [116]:
def plot_histogram_descending(df, x, color=None, color_discrete_sequence=None, labels=None, title=""):
  """Show a histogram of `x` with the x-axis categories ordered by descending total.

  Args:
    df: Data handed to px.histogram as its first argument.
    x: Column plotted along the x axis.
    color: Optional column passed to px.histogram as `color`.
    color_discrete_sequence: Optional list of colors passed through to px.histogram.
    labels: Optional mapping of column name -> display label; None means no overrides.
    title: Figure title.

  Returns:
    None. The figure is rendered through fig.show().
  """
  # A fresh dict per call: the previous `labels={}` default was a single dict shared by every call.
  labels = {} if labels is None else labels
  fig = px.histogram(df, x=x, text_auto=True, color=color, title=title,
                     color_discrete_sequence=color_discrete_sequence, labels=labels)
  fig.update_layout(bargap=0.2)
  fig.update_xaxes(ticks="inside", categoryorder="total descending")
  fig.update_yaxes(ticks="inside", col=1)
  fig.show()

In [117]:
def plot_histogram(df, x, color=None, color_discrete_sequence=None, labels=None, title=""):
  """Show a histogram of `x`, leaving the x-axis order to plotly.

  Args:
    df: Data handed to px.histogram as its first argument.
    x: Column plotted along the x axis.
    color: Optional column passed to px.histogram as `color`.
    color_discrete_sequence: Optional list of colors passed through to px.histogram.
    labels: Optional mapping of column name -> display label; None means no overrides.
    title: Figure title.

  Returns:
    None. The figure is rendered through fig.show().
  """
  # A fresh dict per call: the previous `labels={}` default was a single dict shared by every call.
  labels = {} if labels is None else labels
  fig = px.histogram(df, x=x, text_auto=True, color=color, title=title,
                     color_discrete_sequence=color_discrete_sequence, labels=labels)
  fig.update_layout(bargap=0.2)
  fig.update_xaxes(ticks="inside")
  fig.update_yaxes(ticks="inside", col=1)
  fig.show()

In [118]:
# Articles per publication date, split by document type.
plot_histogram(
  df2,
  x="pub_date",
  color="document_type",
  labels={"pub_date": "Date Published", "document_type": "Type of Document"},
  title="<b>Timeline of Articles Published in Month</b>",
)

In [119]:
# Article counts per material type, one color per material type.
plot_histogram_descending(
  df2,
  x="type_of_material",
  color="type_of_material",
  labels={"type_of_material": "Material Type"},
  title="<b>Articles by Material Type</b>",
)

In [112]:
# Article counts per material type, colored by document type.
# document_type gets a display label here too, so the legend title matches the
# section/subsection plots instead of showing the raw column name.
plot_histogram_descending(
  df2,
  x="type_of_material",
  color="document_type",
  color_discrete_sequence=["lightblue", "green"],
  labels={"type_of_material": "Material Type", "document_type": "Document Type"},
  title="<b>Articles by Material Type</b>",
)

In [115]:
# Article counts per news desk, colored by document type.
# document_type gets a display label here too, so the legend title matches the
# section/subsection plots instead of showing the raw column name.
plot_histogram_descending(
  df2,
  x="news_desk",
  color="document_type",
  color_discrete_sequence=["coral", "darkblue"],
  labels={"news_desk": "News Desk", "document_type": "Document Type"},
  title="<b>Articles Categorized by their News Desk</b>",
)

In [127]:
# Article counts per section, colored by document type.
plot_histogram_descending(
  df2,
  x="section_name",
  color="document_type",
  color_discrete_sequence=["gold", "gray"],
  labels={"section_name": "Section", "document_type": "Document Type"},
  title="<b>Articles Categorized by Section</b>",
)

In [132]:
# Article counts per subsection, colored by document type.
plot_histogram_descending(
  df2,
  x="subsection_name",
  color="document_type",
  color_discrete_sequence=["teal", "lemonchiffon"],
  labels={"subsection_name": "Subsection", "document_type": "Document Type"},
  title="<b>Articles Categorized by Subsection</b>",
)