# Sourcing Data

In [1]:
import pandas as pd

## Creative Commons (CC) Licenses

In [2]:
# The following are the most common Creative Commons licenses that 
# will allow you to freely use datasets for your projects, provided 
# you follow the rules set out by the license:

# Creative Commons Public Domain Dedication (CC0)
# Creative Commons Attribution (CC BY)
# Creative Commons Attribution-ShareAlike (CC BY-SA)
# Creative Commons Attribution Non-Commercial (CC BY-NC)

## Open Data (OD) Licenses

In [3]:
# The following are the most common Open Data licenses that will allow 
# you to freely use datasets for your projects, provided you follow the 
# rules set out by the license:

# Open Data Commons Open Database License (ODbL)
# Open Data Commons Attribution License (ODC-BY)
# Open Data Commons Public Domain Dedication and License (PDDL)

## Dataset Sources Checklist

In [4]:
# 1. Check the dataset license.
# 2. Check for terms of use.
# 3. Scrutinize the source. Is it reliable? Is it trustworthy?
# 4. Ensure that you are allowed to use the data for your purpose 
#    (e.g., personal or commercial project).

## Sourcing Data from HTML Tables

In [5]:
# Wikipedia URL that provides a list of US presidents
url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'

In [6]:
# Get data from the URL
us_presidents = pd.read_html(url)
us_presidents

[    No.[a]  Portrait                       Name (Birth–Death)  \
 0        1       NaN       George Washington (1732–1799) [17]   
 1        2       NaN              John Adams (1735–1826) [19]   
 2        3       NaN        Thomas Jefferson (1743–1826) [21]   
 3        4       NaN           James Madison (1751–1836) [22]   
 4        5       NaN            James Monroe (1758–1831) [24]   
 5        6       NaN       John Quincy Adams (1767–1848) [25]   
 6        7       NaN          Andrew Jackson (1767–1845) [28]   
 7        8       NaN        Martin Van Buren (1782–1862) [29]   
 8        9       NaN  William Henry Harrison (1773–1841) [30]   
 9       10       NaN              John Tyler (1790–1862) [31]   
 10      11       NaN           James K. Polk (1795–1849) [34]   
 11      12       NaN          Zachary Taylor (1784–1850) [35]   
 12      13       NaN        Millard Fillmore (1800–1874) [36]   
 13      14       NaN         Franklin Pierce (1804–1869) [38]   
 14      1

In [7]:
# Get the relevant table as there can be several tables in the data obtained
us_presidents[0]

Unnamed: 0,No.[a],Portrait,Name (Birth–Death),Term[14],Party[b][15],Party[b][15].1,Election,Vice President[16]
0,1,,George Washington (1732–1799) [17],"April 30, 1789 – March 4, 1797",,Unaffiliated,1788–1789 1792,John Adams[c]
1,2,,John Adams (1735–1826) [19],"March 4, 1797 – March 4, 1801",,Federalist,1796,Thomas Jefferson[d]
2,3,,Thomas Jefferson (1743–1826) [21],"March 4, 1801 – March 4, 1809",,Democratic- Republican,1800 1804,Aaron Burr George Clinton
3,4,,James Madison (1751–1836) [22],"March 4, 1809 – March 4, 1817",,Democratic- Republican,1808 1812,"George Clinton[e] Vacant after April 20, 1812 ..."
4,5,,James Monroe (1758–1831) [24],"March 4, 1817 – March 4, 1825",,Democratic- Republican,1816 1820,Daniel D. Tompkins
5,6,,John Quincy Adams (1767–1848) [25],"March 4, 1825 – March 4, 1829",,Democratic- Republican[f] National Republican,1824,John C. Calhoun[g]
6,7,,Andrew Jackson (1767–1845) [28],"March 4, 1829 – March 4, 1837",,Democratic,1828 1832,"John C. Calhoun[h] Vacant after December 28, 1..."
7,8,,Martin Van Buren (1782–1862) [29],"March 4, 1837 – March 4, 1841",,Democratic,1836,Richard Mentor Johnson
8,9,,William Henry Harrison (1773–1841) [30],"March 4, 1841 – April 4, 1841[e]",,Whig,1840,John Tyler
9,10,,John Tyler (1790–1862) [31],"April 4, 1841[i] – March 4, 1845",,Whig[j] Unaffiliated,–,Vacant throughout presidency


In [8]:
# View the dataframe in reverse order based on index
us_presidents[0].sort_index(ascending=False)

Unnamed: 0,No.[a],Portrait,Name (Birth–Death),Term[14],Party[b][15],Party[b][15].1,Election,Vice President[16]
45,46,,Joe Biden (b. 1942) [13],"January 20, 2021 – Incumbent",,Democratic,2020,Kamala Harris
44,45,,Donald Trump (b. 1946) [74],"January 20, 2017 – January 20, 2021",,Republican,2016,Mike Pence
43,44,,Barack Obama (b. 1961) [73],"January 20, 2009 – January 20, 2017",,Democratic,2008 2012,Joe Biden
42,43,,George W. Bush (b. 1946) [72],"January 20, 2001 – January 20, 2009",,Republican,2000 2004,Dick Cheney
41,42,,Bill Clinton (b. 1946) [71],"January 20, 1993 – January 20, 2001",,Democratic,1992 1996,Al Gore
40,41,,George H. W. Bush (1924–2018) [70],"January 20, 1989 – January 20, 1993",,Republican,1988,Dan Quayle
39,40,,Ronald Reagan (1911–2004) [69],"January 20, 1981 – January 20, 1989",,Republican,1980 1984,George H. W. Bush
38,39,,Jimmy Carter (b. 1924) [68],"January 20, 1977 – January 20, 1981",,Democratic,1976,Walter Mondale
37,38,,Gerald Ford (1913–2006) [67],"August 9, 1974[u] – January 20, 1977",,Republican,–,"Vacant through December 19, 1974 Nelson Rockef..."
36,37,,Richard Nixon (1913–1994) [66],"January 20, 1969 – August 9, 1974[h]",,Republican,1968 1972,Spiro Agnew[h] Vacant: October 10 – December 6...


In [9]:
# Sort/Group the data for further analysis
us_presidents[0].groupby('Party[b][15].1').count()

Unnamed: 0_level_0,No.[a],Portrait,Name (Birth–Death),Term[14],Party[b][15],Election,Vice President[16]
Party[b][15].1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Democratic,16,0,16,16,0,16,16
Democratic- Republican,3,0,3,3,0,3,3
Democratic- Republican[f] National Republican,1,0,1,1,0,1,1
Federalist,1,0,1,1,0,1,1
National Union[n] Democratic,1,0,1,1,0,1,1
Republican,18,0,18,18,0,18,18
Republican National Union[l],1,0,1,1,0,1,1
Unaffiliated,1,0,1,1,0,1,1
Whig,3,0,3,3,0,3,3
Whig[j] Unaffiliated,1,0,1,1,0,1,1


## Application Programming Interfaces (APIs)

In [10]:
# APIs are a set of functions packaged together that provide developers 
# with a means of communicating with a server and integrating third-party 
# software and technology into new applications.

In [11]:
from dotenv import load_dotenv
import os

In [12]:
# Load environment variables and New York Times API key
load_dotenv()
api_key = os.getenv("NYT_API_KEY")

In [13]:
# JavaScript Object Notation (JSON)
import json
# A request is a communication to the API to retrieve data.
import requests

In [14]:
# New York Times Article API URL
# API Documentation: https://developer.nytimes.com/apis
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

In [15]:
# Refine your search - note that these are specific to NY Times

# Filters - Technology and Special Report
filter_query = 'section_name:"New York"'

# Sort - newest
sort = "newest"

# Field list - required fields
field_list = "snippet,headline,web_url,source,keywords,pub_date"

# Date range
begin_date = "20000101"
end_date = "20240419"

# Page number
page_number = 0

In [16]:
# Build URL - The f-string has been split in multiple lines to make is easier to understand
query_url = (
    f'{url}' +
    f'api-key={api_key}' +
    f'&begin_date={begin_date}' +
    f'&end_date={end_date}' +
    f'&fq={filter_query}' + 
    f'&sort={sort}' +
    f'&fl={field_list}' +
    f'&page={str(page_number)}'
)
query_url

'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=6wkFzyOZvAk2YenigCsLx3gTG1CVG5Sy&begin_date=20000101&end_date=20240419&fq=section_name:"New York"&sort=newest&fl=snippet,headline,web_url,source,keywords,pub_date&page=0'

In [17]:
# Get the response
response = requests.get(query_url)
response

<Response [200]>

In [18]:
# Retrieve reports
reports = response.json()

# Print results in JSON format
print(json.dumps(reports, indent=4))

{
    "status": "OK",
    "copyright": "Copyright (c) 2024 The New York Times Company. All Rights Reserved.",
    "response": {
        "docs": [
            {
                "web_url": "https://www.nytimes.com/2024/04/19/nyregion/nemat-shafik-columbia-university-president.html",
                "snippet": "Dr. Shafik, who also goes by Minouche, is facing criticism from multiple sides over how she is handling protests over the war in Gaza on her campus.",
                "source": "The New York Times",
                "headline": {
                    "main": "What We Know About Columbia University\u2019s President, Nemat Shafik",
                    "kicker": null,
                    "content_kicker": null,
                    "print_headline": null,
                    "name": null,
                    "seo": null,
                    "sub": null
                },
                "keywords": [
                    {
                        "name": "subject",
                       

In [19]:
# Convert the response to a Dataframe
reports_df = pd.DataFrame(reports["response"]["docs"])
reports_df.head()

Unnamed: 0,web_url,snippet,source,headline,keywords,pub_date
0,https://www.nytimes.com/2024/04/19/nyregion/ne...,"Dr. Shafik, who also goes by Minouche, is faci...",The New York Times,{'main': 'What We Know About Columbia Universi...,"[{'name': 'subject', 'value': 'Colleges and Un...",2024-04-19T17:54:13+0000
1,https://www.nytimes.com/2024/04/19/nyregion/ma...,"Onlookers screamed as fire engulfed the man, w...",The New York Times,{'main': 'Man Sets Himself on Fire Near Courth...,"[{'name': 'subject', 'value': 'New York State ...",2024-04-19T17:54:12+0000
2,https://www.nytimes.com/2024/04/19/nyregion/co...,"The encampment is gone, but some activists spe...",The New York Times,{'main': 'Student Protesters at Columbia Remai...,"[{'name': 'subject', 'value': 'Colleges and Un...",2024-04-19T17:46:08+0000
3,https://www.nytimes.com/2024/04/19/nyregion/me...,"The senator’s trial will start May 13, a week ...",The New York Times,{'main': 'Menendez Corruption Trial Delayed fo...,"[{'name': 'persons', 'value': 'Menendez, Rober...",2024-04-19T17:34:42+0000
4,https://www.nytimes.com/2024/04/19/nyregion/tr...,Fame creates its own gravity and Donald J. Tru...,The New York Times,"{'main': 'Outside Court, Trump is the Center o...","[{'name': 'subject', 'value': 'New York State ...",2024-04-19T15:09:08+0000


In [20]:
# In the case a nested objects it is better to use json_normalize()
reports_df['headline'][0]

{'main': 'What We Know About Columbia University’s President, Nemat Shafik',
 'kicker': None,
 'content_kicker': None,
 'print_headline': None,
 'name': None,
 'seo': None,
 'sub': None}

In [21]:
# Use json_normalize() function to normalize data into a flat table
reports_df = pd.json_normalize(reports["response"]["docs"])
reports_df

Unnamed: 0,web_url,snippet,source,keywords,pub_date,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub
0,https://www.nytimes.com/2024/04/19/nyregion/ne...,"Dr. Shafik, who also goes by Minouche, is faci...",The New York Times,"[{'name': 'subject', 'value': 'Colleges and Un...",2024-04-19T17:54:13+0000,What We Know About Columbia University’s Presi...,,,,,,
1,https://www.nytimes.com/2024/04/19/nyregion/ma...,"Onlookers screamed as fire engulfed the man, w...",The New York Times,"[{'name': 'subject', 'value': 'New York State ...",2024-04-19T17:54:12+0000,Man Sets Himself on Fire Near Courthouse Where...,,,,,,
2,https://www.nytimes.com/2024/04/19/nyregion/co...,"The encampment is gone, but some activists spe...",The New York Times,"[{'name': 'subject', 'value': 'Colleges and Un...",2024-04-19T17:46:08+0000,Student Protesters at Columbia Remain Defiant,,,,,,
3,https://www.nytimes.com/2024/04/19/nyregion/me...,"The senator’s trial will start May 13, a week ...",The New York Times,"[{'name': 'persons', 'value': 'Menendez, Rober...",2024-04-19T17:34:42+0000,Menendez Corruption Trial Delayed for One Week,,,,,,
4,https://www.nytimes.com/2024/04/19/nyregion/tr...,Fame creates its own gravity and Donald J. Tru...,The New York Times,"[{'name': 'subject', 'value': 'New York State ...",2024-04-19T15:09:08+0000,"Outside Court, Trump is the Center of Attentio...",,,,,,
5,https://www.nytimes.com/2024/04/19/nyregion/wh...,The former president faces dozens of felony ch...,The New York Times,"[{'name': 'persons', 'value': 'Trump, Donald J...",2024-04-19T14:08:28+0000,Will Trump Go to Prison if He Is Convicted?,,,,,,
6,https://www.nytimes.com/2024/04/19/nyregion/ba...,"Youth America Grand Prix, a student ballet sch...",The New York Times,"[{'name': 'subject', 'value': 'Dancing', 'rank...",2024-04-19T09:03:14+0000,353 Ballet Dancers Set a Record,New York Today,,,,,
7,https://www.nytimes.com/2024/04/19/nyregion/do...,"At a farm tied to Blue Hill at Stone Barns, wh...",The New York Times,"[{'name': 'subject', 'value': 'Agriculture and...",2024-04-19T07:01:10+0000,A Vicious Dog Attack Upends an Elite Westchest...,,,,,,
8,https://www.nytimes.com/2024/04/19/nyregion/ne...,The fledgling Professional Women’s Hockey Leag...,The New York Times,"[{'name': 'subject', 'value': 'Hockey, Ice', '...",2024-04-19T07:00:54+0000,New York’s Newest Hockey Team Has Everything b...,,,"On and Off the Ice, Challenges Aplenty",,,
9,https://www.nytimes.com/2024/04/19/nyregion/aa...,The N.F.L. great was supposed to be the Jets’ ...,The New York Times,"[{'name': 'subject', 'value': 'Football', 'ran...",2024-04-19T07:00:28+0000,Will Conspiracy Theories Be Aaron Rodgers’s Ac...,,,,,,


In [22]:
# Use try-except loop to check user input
while True:
    try:
        user_input = int(input(f'How many snippets would you like to see?\nEnter a number between 1-{len(reports_df)}: '))
        
        if 1 <= user_input <= len(reports_df):
            break
        else:
             print('Invalid input! Please try again')
            
    except:
        print('Invalid input! Please try again')

How many snippets would you like to see?
Enter a number between 1-10: Two
Invalid input! Please try again
How many snippets would you like to see?
Enter a number between 1-10: 20
Invalid input! Please try again
How many snippets would you like to see?
Enter a number between 1-10: 4


In [23]:
# Using the table to print out snippets of the newest results for New York
for i, report in enumerate(reports_df['snippet'][0:user_input], start=1):
    print(f'{i}. {report}\n')

1. Dr. Shafik, who also goes by Minouche, is facing criticism from multiple sides over how she is handling protests over the war in Gaza on her campus.

2. Onlookers screamed as fire engulfed the man, who had thrown pamphlets in the air before he set himself aflame. A police officer tried to extinguish the flames before the man was taken away in an ambulance.

3. The encampment is gone, but some activists spent the night on the university lawn, with what seemed like light enforcement, at least for the moment.

4. The senator’s trial will start May 13, a week later than planned, to give defense lawyers more time to prepare.



In [24]:
# Remove time from the date column
reports_df['pub_date'] = reports_df['pub_date'].apply(lambda dt: dt[:dt.find('T')])

In [25]:
# Only keep rank 1 keywords
reports_df['keywords'] = reports_df['keywords'].apply(lambda key: key[0]['value'])

In [26]:
# Remove unwanted columns
reports_df.drop(columns={'headline.kicker', 'headline.content_kicker', 'headline.print_headline', 'headline.name', 'headline.seo', 'headline.sub'}, inplace=True)
# Rename columns
reports_df.rename(columns={'headline.main': 'headline'}, inplace=True)
#View the Dataframe after cleanup
reports_df.head()

Unnamed: 0,web_url,snippet,source,keywords,pub_date,headline
0,https://www.nytimes.com/2024/04/19/nyregion/ne...,"Dr. Shafik, who also goes by Minouche, is faci...",The New York Times,Colleges and Universities,2024-04-19,What We Know About Columbia University’s Presi...
1,https://www.nytimes.com/2024/04/19/nyregion/ma...,"Onlookers screamed as fire engulfed the man, w...",The New York Times,New York State Criminal Case Against Trump (71...,2024-04-19,Man Sets Himself on Fire Near Courthouse Where...
2,https://www.nytimes.com/2024/04/19/nyregion/co...,"The encampment is gone, but some activists spe...",The New York Times,Colleges and Universities,2024-04-19,Student Protesters at Columbia Remain Defiant
3,https://www.nytimes.com/2024/04/19/nyregion/me...,"The senator’s trial will start May 13, a week ...",The New York Times,"Menendez, Robert",2024-04-19,Menendez Corruption Trial Delayed for One Week
4,https://www.nytimes.com/2024/04/19/nyregion/tr...,Fame creates its own gravity and Donald J. Tru...,The New York Times,New York State Criminal Case Against Trump (71...,2024-04-19,"Outside Court, Trump is the Center of Attentio..."


## Software development kits (SDKs)

In [27]:
from census import Census
api_key = os.getenv("CENSUS_API_KEY")

In [28]:
# Create an instance of the Census library
c = Census(
    api_key,
    year = 2020)

In [29]:
# Run Census Search to retrieve data on all states (2020 ACS5 Census)
census_data = c.acs5.get(
    (
        "NAME",
        "B19013_001E",
        "B01003_001E",
        "B01002_001E",
        "B23025_002E",
        "B25077_001E"
    ), 
    {'for': 'state:*'}
)

In [30]:
# Convert to DataFrame, check first 5 rows
census_pd = pd.DataFrame(census_data)
census_pd.head()

Unnamed: 0,NAME,B19013_001E,B01003_001E,B01002_001E,B23025_002E,B25077_001E,state
0,Pennsylvania,63627.0,12794885.0,40.9,6566126.0,187500.0,42
1,California,78672.0,39346023.0,36.7,20016955.0,538500.0,6
2,West Virginia,48037.0,1807426.0,42.7,798208.0,123200.0,54
3,Utah,74197.0,3151239.0,31.1,1600462.0,305400.0,49
4,New York,71117.0,19514849.0,39.0,10032721.0,325000.0,36


In [31]:
# Column renaming
census_pd = census_pd.rename(
    columns = {
        "B01003_001E": "Population",
        "B01002_001E": "Median Age",
        "B19013_001E": "Household Income",
        "B23025_002E": "Employable People in the labor force",
        "B25077_001E": "Median Home Value",
        "NAME": "Name"
    }
)

# Display DataFrame length and sample data
print(f"Number of rows in the DataFrame: {len(census_pd)}")
census_pd.head()

Number of rows in the DataFrame: 52


Unnamed: 0,Name,Household Income,Population,Median Age,Employable People in the labor force,Median Home Value,state
0,Pennsylvania,63627.0,12794885.0,40.9,6566126.0,187500.0,42
1,California,78672.0,39346023.0,36.7,20016955.0,538500.0,6
2,West Virginia,48037.0,1807426.0,42.7,798208.0,123200.0,54
3,Utah,74197.0,3151239.0,31.1,1600462.0,305400.0,49
4,New York,71117.0,19514849.0,39.0,10032721.0,325000.0,36
