In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

## Working with Strings in a data frame

In [2]:
got_deaths = pd.read_csv("./data/GoT_Character_Deaths.csv")
got_deaths.sample(10)

Unnamed: 0,Name,Allegiances,Book Intro Chapter,Gender,Nobility,GoT,CoK,SoS,FfC,DwD,dead
317,Hallis Mollen,Stark,14,1,1,1,1,0,0,0,0
278,Gladden Wylde,Stark,43,1,1,1,0,0,0,0,1
302,Guncer Sunglass,Baratheon,0,1,1,0,1,1,0,0,1
258,Gariss,Stark,50,1,0,0,1,0,0,0,0
406,Jonos Bracken,Tully,28,1,1,1,0,1,0,1,0
45,Arron,Night's Watch,75,1,0,0,0,1,0,1,0
74,Bellonara Otherys,,34,0,1,0,0,0,1,0,0
540,Mikken,Stark,8,1,0,1,1,0,0,0,1
410,Joseth,Stark,37,1,0,1,1,0,0,0,0
740,Serra Frey,,49,0,1,0,0,1,0,0,0


In [3]:
name = "Jonathan Arp" 
name.upper()

'JONATHAN ARP'

In [6]:
got_deaths['ALL_CAPS'] = got_deaths['Allegiances'].str.upper()
got_deaths

Unnamed: 0,Name,Allegiances,Book Intro Chapter,Gender,Nobility,GoT,CoK,SoS,FfC,DwD,dead,ALL_CAPS
0,Addam Marbrand,Lannister,56,1,1,1,1,1,1,0,0,LANNISTER
1,Aegon Frey (Jinglebell),,49,1,1,0,0,1,0,0,1,NONE
2,Aegon Targaryen,Targaryen,5,1,1,0,0,0,0,1,0,TARGARYEN
3,Adrack Humble,Greyjoy,20,1,1,0,0,0,0,1,1,GREYJOY
4,Aemon Targaryen (son of Maekar I),Night's Watch,21,1,1,1,0,1,1,0,1,NIGHT'S WATCH
...,...,...,...,...,...,...,...,...,...,...,...,...
900,Zollo,,21,1,0,0,0,1,0,0,0,NONE
901,Yurkhaz zo Yunzak,,47,1,0,0,0,0,0,1,1,NONE
902,Yezzan Zo Qaggaz,,25,1,1,0,0,0,0,1,1,NONE
903,Torwynd the Tame,Wildling,73,1,0,0,0,1,0,0,1,WILDLING


In [7]:
got_deaths['also allegiances, but shouted'] = got_deaths['Allegiances'].str.upper() + "!!!"
got_deaths.head()

Unnamed: 0,Name,Allegiances,Book Intro Chapter,Gender,Nobility,GoT,CoK,SoS,FfC,DwD,dead,ALL_CAPS,"also allegiances, but shouted"
0,Addam Marbrand,Lannister,56,1,1,1,1,1,1,0,0,LANNISTER,LANNISTER!!!
1,Aegon Frey (Jinglebell),,49,1,1,0,0,1,0,0,1,NONE,NONE!!!
2,Aegon Targaryen,Targaryen,5,1,1,0,0,0,0,1,0,TARGARYEN,TARGARYEN!!!
3,Adrack Humble,Greyjoy,20,1,1,0,0,0,0,1,1,GREYJOY,GREYJOY!!!
4,Aemon Targaryen (son of Maekar I),Night's Watch,21,1,1,1,0,1,1,0,1,NIGHT'S WATCH,NIGHT'S WATCH!!!


# Web Scrapping

Web scrapping is very large concept and involves a deep understanding of how websites are created and managed. You will also need to know some fundamentals of HTML. In this section we will do a very basic foundations of extracting the data from the websites. 

### `pd.read_html()`

Using the pandas package, you can read the tables that are created on the websites. It reads all the tables that are available on the webpage. 

The following example extracts the NBA 2017 draft data set from the [Sports Reference](https://www.basketball-reference.com/draft/NBA_2017.html) website

In [9]:
nba_data_list = pd.read_html("https://www.basketball-reference.com/draft/NBA_2017.html") 
print(type(nba_data_list))
print(len(nba_data_list))
type(nba_data_list[0])

<class 'list'>
1


pandas.core.frame.DataFrame

You will notice that after `read_html()` returns a list. There can be multiple tables in a given webpage. The `read_html()` method returns list of tables. In this webpage there is only one table. So you can access the table with the 0th indexed element. 

In [10]:
nba_df = nba_data_list[0]
nba_df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Round 1,Round 1,Unnamed: 5_level_0,Totals,Totals,Totals,Totals,...,Shooting,Shooting,Per Game,Per Game,Per Game,Per Game,Advanced,Advanced,Advanced,Advanced
Unnamed: 0_level_1,Rk,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P%,FT%,MP,PTS,TRB,AST,WS,WS/48,BPM,VORP
0,1,1,PHI,Markelle Fultz,Washington,4,113,2891,1228,376,...,.265,.695,25.6,10.9,3.3,4.6,3.1,.052,-2.2,-0.2
1,2,2,LAL,Lonzo Ball,UCLA,4,210,6734,2419,1205,...,.353,.547,32.1,11.5,5.7,6.4,9.0,.064,0.8,4.7
2,3,3,BOS,Jayson Tatum,Duke,4,281,9162,5348,1756,...,.396,.839,32.6,19.0,6.2,2.6,24.5,.128,2.0,9.3
3,4,4,PHO,Josh Jackson,Kansas,4,234,5730,2885,990,...,.300,.675,24.5,12.3,4.2,2.0,-1.5,-.012,-3.7,-2.5
4,5,5,SAC,De'Aaron Fox,Kentucky,4,263,8242,4781,907,...,.326,.718,31.3,18.2,3.4,6.4,13.0,.076,0.1,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,56,56,BOS,Jabari Bird,California,1,13,115,39,19,...,.429,.462,8.8,3.0,1.5,0.6,0.2,.098,-3.1,0.0
58,57,57,BRK,Sasha Vezenkov,,,,,,,...,,,,,,,,,,
59,58,58,NYK,Ognjen Jaramaz,,,,,,,...,,,,,,,,,,
60,59,59,SAS,Jaron Blossomgame,Clemson,1,27,439,114,98,...,.256,.769,16.3,4.2,3.6,0.5,0.3,.030,-3.8,-0.2


In [11]:
nba_df.keys()

MultiIndex([('Unnamed: 0_level_0',      'Rk'),
            ('Unnamed: 1_level_0',      'Pk'),
            ('Unnamed: 2_level_0',      'Tm'),
            (           'Round 1',  'Player'),
            (           'Round 1', 'College'),
            ('Unnamed: 5_level_0',     'Yrs'),
            (            'Totals',       'G'),
            (            'Totals',      'MP'),
            (            'Totals',     'PTS'),
            (            'Totals',     'TRB'),
            (            'Totals',     'AST'),
            (          'Shooting',     'FG%'),
            (          'Shooting',     '3P%'),
            (          'Shooting',     'FT%'),
            (          'Per Game',      'MP'),
            (          'Per Game',     'PTS'),
            (          'Per Game',     'TRB'),
            (          'Per Game',     'AST'),
            (          'Advanced',      'WS'),
            (          'Advanced',   'WS/48'),
            (          'Advanced',     'BPM'),
            (

Information on the web pages is not always clean. In this case you might have observed the column names are all multilevel indexes. You can change the column names as indicated on the website by renaming the column names. 

In [12]:
nba_df.columns = ['Rk', 'Pk', 'Tm','Player','College', 'Yrs','G', 'MP', 'PTS','TRB','AST','FG%', 
                    '3P%', 'FT%', 'MP', 'PTS', 'TRB', 'AST', 'WS', 'WS/48', 'BPM', 'VORP']

nba_df.head()

Unnamed: 0,Rk,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST,WS,WS/48,BPM,VORP
0,1,1,PHI,Markelle Fultz,Washington,4,113,2891,1228,376,...,0.265,0.695,25.6,10.9,3.3,4.6,3.1,0.052,-2.2,-0.2
1,2,2,LAL,Lonzo Ball,UCLA,4,210,6734,2419,1205,...,0.353,0.547,32.1,11.5,5.7,6.4,9.0,0.064,0.8,4.7
2,3,3,BOS,Jayson Tatum,Duke,4,281,9162,5348,1756,...,0.396,0.839,32.6,19.0,6.2,2.6,24.5,0.128,2.0,9.3
3,4,4,PHO,Josh Jackson,Kansas,4,234,5730,2885,990,...,0.3,0.675,24.5,12.3,4.2,2.0,-1.5,-0.012,-3.7,-2.5
4,5,5,SAC,De'Aaron Fox,Kentucky,4,263,8242,4781,907,...,0.326,0.718,31.3,18.2,3.4,6.4,13.0,0.076,0.1,4.5


#### Clean the data

Data downloaded from the webpages, most certainly requires to be cleaned. The following is a simple example of deleting unnnecessary data. 

You will notice that the internet data is **messy**. For example, if you actually see the rows from 28:34, you will see that index 30, 31 had data that is not required. Look at the [website](https://www.basketball-reference.com/draft/NBA_2017.html) the table has a break, so the the DataFrame has unnecessary information. 

In [13]:
nba_df.loc[28:34]

Unnamed: 0,Rk,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST,WS,WS/48,BPM,VORP
28,29,29,SAS,Derrick White,Colorado,4,188,4608,2035,603,...,.357,.825,24.5,10.8,3.2,3.4,10.3,.107,1.1,3.6
29,30,30,UTA,Josh Hart,Villanova,4,242,6280,2111,1313,...,.348,.728,26.0,8.7,5.4,1.6,11.3,.086,-0.8,1.9
30,,,,Round 2,Round 2,,Totals,Totals,Totals,Totals,...,Shooting,Shooting,Per Game,Per Game,Per Game,Per Game,Advanced,Advanced,Advanced,Advanced
31,Rk,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P%,FT%,MP,PTS,TRB,AST,WS,WS/48,BPM,VORP
32,31,31,CHO,Frank Jackson,Duke,3,153,2520,1154,283,...,.345,.756,16.5,7.5,1.8,1.0,1.4,.027,-4.4,-1.6
33,32,32,PHO,Davon Reed,Miami (FL),2,31,289,75,45,...,.310,.667,9.3,2.4,1.5,0.5,-0.2,-.039,-6.3,-0.3
34,33,33,ORL,Wesley Iwundu,Kansas State,4,218,3683,959,530,...,.289,.797,16.9,4.4,2.4,0.9,4.2,.055,-3.8,-1.6


In [14]:
# Drop those two rows with those indices and you are saying inplace=True, to make sure you are not creating a copy. 
nba_df.drop([30,31], axis=0, inplace=True)
nba_df.loc[28:34]

Unnamed: 0,Rk,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST,WS,WS/48,BPM,VORP
28,29,29,SAS,Derrick White,Colorado,4,188,4608,2035,603,...,0.357,0.825,24.5,10.8,3.2,3.4,10.3,0.107,1.1,3.6
29,30,30,UTA,Josh Hart,Villanova,4,242,6280,2111,1313,...,0.348,0.728,26.0,8.7,5.4,1.6,11.3,0.086,-0.8,1.9
32,31,31,CHO,Frank Jackson,Duke,3,153,2520,1154,283,...,0.345,0.756,16.5,7.5,1.8,1.0,1.4,0.027,-4.4,-1.6
33,32,32,PHO,Davon Reed,Miami (FL),2,31,289,75,45,...,0.31,0.667,9.3,2.4,1.5,0.5,-0.2,-0.039,-6.3,-0.3
34,33,33,ORL,Wesley Iwundu,Kansas State,4,218,3683,959,530,...,0.289,0.797,16.9,4.4,2.4,0.9,4.2,0.055,-3.8,-1.6


## More on cleaning up data 

We are goin gto download the top 250 movies from [IMDB](http://www.imdb.com/chart/top?ref_=nv_wl_img_3) list 

We need to clean the data and remove unnecessary rows and columns like before. But there's more we want to do. 

Notice that the Title acutally has the date of the movie in it. That's not helpful. Wouldn't it be great to have a column that had the date. That would be very useful for our data analysis goals

Like, asking which movie released in 2014 has highest IMDb rating. Or which year had the highest average rankings.

In [15]:
movie_df = pd.read_html("https://www.imdb.com/chart/top?ref_=nv_wl_img_3")[0]
movie_df.head()

Unnamed: 0.1,Unnamed: 0,Rank & Title,IMDb Rating,Your Rating,Unnamed: 4
0,,1. The Shawshank Redemption (1994),9.2,12345678910 NOT YET RELEASED Seen,
1,,2. The Godfather (1972),9.1,12345678910 NOT YET RELEASED Seen,
2,,3. The Godfather: Part II (1974),9.0,12345678910 NOT YET RELEASED Seen,
3,,4. The Dark Knight (2008),9.0,12345678910 NOT YET RELEASED Seen,
4,,5. 12 Angry Men (1957),8.9,12345678910 NOT YET RELEASED Seen,


### Dropping unnecessary columns
let's drop the columns that have no useful data. First he pass in a `list` of columns. Remember that the default is to delete rows, so we add `axis=1` to tell pandas we are dropping collumns. Lastly we want the changes to remain so we add `inplace=True`

In [16]:
movie_df.drop(["Unnamed: 0", "Unnamed: 4", "Your Rating"], axis=1, inplace=True)
movie_df.head()

Unnamed: 0,Rank & Title,IMDb Rating
0,1. The Shawshank Redemption (1994),9.2
1,2. The Godfather (1972),9.1
2,3. The Godfather: Part II (1974),9.0
3,4. The Dark Knight (2008),9.0
4,5. 12 Angry Men (1957),8.9


In [17]:
# using the str function, let's grab the year and put it in a column called 'year'
movie_df['year'] = movie_df['Rank & Title'].str[-5:-1]
movie_df['Rank & Title'] = movie_df['Rank & Title'].str[0:-6]
movie_df.head()

Unnamed: 0,Rank & Title,IMDb Rating,year
0,1. The Shawshank Redemption,9.2,1994
1,2. The Godfather,9.1,1972
2,3. The Godfather: Part II,9.0,1974
3,4. The Dark Knight,9.0,2008
4,5. 12 Angry Men,8.9,1957


In [18]:
movie_df['ranking'] = movie_df['Rank & Title'].str.extract('(\d{1,4}).\ ')
movie_df['title'] = movie_df['Rank & Title'].str.extract('\d{1,4}.\ (.*)')
movie_df.drop(['Rank & Title'], axis=1, inplace=True)
movie_df.head()

Unnamed: 0,IMDb Rating,year,ranking,title
0,9.2,1994,1,The Shawshank Redemption
1,9.1,1972,2,The Godfather
2,9.0,1974,3,The Godfather: Part II
3,9.0,2008,4,The Dark Knight
4,8.9,1957,5,12 Angry Men


In [19]:
movie_14_df = movie_df[movie_df['year'] == '2014']
df_sorted = movie_14_df.sort_values('IMDb Rating', ascending=False)
df_sorted

Unnamed: 0,IMDb Rating,year,ranking,title
28,8.5,2014,29,Interstellar
43,8.5,2014,44,Whiplash
177,8.1,2014,178,Wild Tales
186,8.1,2014,187,The Grand Budapest Hotel
189,8.1,2014,190,Gone Girl


In [20]:
movie_14_df[ movie_14_df['IMDb Rating'] == movie_14_df['IMDb Rating'].max() ]

Unnamed: 0,IMDb Rating,year,ranking,title
28,8.5,2014,29,Interstellar
43,8.5,2014,44,Whiplash


In [21]:
# aggregate the highest average rankings for each year
movies_by_year = movie_df.groupby('year').mean()
# Highest ranked year
movies_by_year.loc[ movies_by_year['IMDb Rating'].idxmax()]

IMDb Rating    9.1
Name: 1972, dtype: float64

In [None]:
movies_by_year
# movies_by_year['IMDb Rating'].idxmax()

# Packages for webscrapping 

* urllib
* requests
* **BeautifulSoup**
* mechanize

This will require some fundamentals on HTML, the language used to display the webpages on the browser. 

In [22]:
import urllib
import requests
from bs4 import BeautifulSoup

In [23]:
req = requests.get("https://simple.wikipedia.org/wiki/List_of_U.S._state_capitals")
page = req.text
page

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"60985881-aa1e-4010-89e5-3e832793fb73","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_U.S._state_capitals","wgTitle":"List of U.S. state capitals","wgCurRevisionId":7470080,"wgRevisionId":7470080,"wgArticleId":18635,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["State capitals in the United States","Lists of cities in the United States"],"wgPageConte

In [24]:
page_soup = BeautifulSoup(page, 'html.parser')
page_soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"60985881-aa1e-4010-89e5-3e832793fb73","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_U.S._state_capitals","wgTitle":"List of U.S. state capitals","wgCurRevisionId":7470080,"wgRevisionId":7470080,"wgArticleId":18635,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["State capitals in the United States","Lists of cities in the United States"],"wgPageContentLan

You can print the actual webpage and its contents. 

**Warning**: The contents of a webpage are messy and may not be obvious for the first time. However, if you want to scrape any website, you will have to be patient and look through the contents to extract the information. 

In [25]:
print(page_soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"60985881-aa1e-4010-89e5-3e832793fb73","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_U.S._state_capitals","wgTitle":"List of U.S. state capitals","wgCurRevisionId":7470080,"wgRevisionId":7470080,"wgArticleId":18635,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["State capitals in the United States","Lists of cities in the United States"],

In [26]:
page_soup.title

<title>List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia</title>

In [27]:
page_soup.title.string

'List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia'

### Searching in the webpage

You can programmatically search through a webpage to find the tables that are available on the webpage. You can do that by using **`find_all()`** method. 

In [28]:
states_table = page_soup.find_all("table")
print(len(states_table))
states_table

1


[<table class="wikitable sortable">
 <caption>State capitals of the United States
 </caption>
 <tbody><tr>
 <th rowspan="2">State</th>
 <th rowspan="2">Abr.</th>
 <th rowspan="2">State-hood</th>
 <th rowspan="2">Capital</th>
 <th rowspan="2">Capital since</th>
 <th rowspan="2">Area (mi²)</th>
 <th colspan="4">Population (2018)</th>
 <th rowspan="2">Notes
 </th></tr>
 <tr>
 <th><a href="/wiki/List_of_United_States_cities_by_population" title="List of United States cities by population">City</a>
 </th>
 <th>Metropolitan
 </th>
 <th>Rank in state
 </th>
 <th>Rank in US
 </th></tr>
 <tr>
 <td><a href="/wiki/Alabama" title="Alabama">Alabama</a></td>
 <td>AL</td>
 <td align="center">1819</td>
 <td><a href="/wiki/Montgomery,_Alabama" title="Montgomery, Alabama">Montgomery</a></td>
 <td align="center">1846</td>
 <td align="right">159.8</td>
 <td align="right">198,218</td>
 <td align="right">373,903</td>
 <td align="center">2</td>
 <td align="center">119</td>
 <td>
 </td></tr>
 <tr>
 <td><a hre

# WebScrapping through Application Programming Interface (API)

There are a lot of APIs available for each of the website. You can use these APIs to scrape websites like Twitter, Google Trends, etc. 

In this section, we will use a simple API provided by NASA, [here](http://open-notify.org/), to retrieve data about the International Space Station (ISS). 

Some of the content presented here is based on [dataquest](https://www.dataquest.io/blog/python-api-tutorial/). 

#### Current ISS position

In [29]:
import requests
response = requests.get("http://api.open-notify.org/iss-now.json")

print(response.status_code)
print(response.content)

200
b'{"timestamp": 1619705862, "iss_position": {"longitude": "-50.2006", "latitude": "24.4257"}, "message": "success"}'


There are various status codes that you will get when you request a website. [This](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes) describes more detailed description. 

In [30]:
response = requests.get("http://api.open-notify.org/iss-now.json")
pd.read_json(response.content)

Unnamed: 0,timestamp,iss_position,message
latitude,2021-04-29 14:20:32,16.1839,success
longitude,2021-04-29 14:20:32,-43.1302,success


#### Current Number of People In Space

In [31]:
response = requests.get("http://api.open-notify.org/astros.json")
pd.read_json(response.content)

Unnamed: 0,message,number,people
0,success,11,"{'name': 'Mike Hopkins', 'craft': 'ISS'}"
1,success,11,"{'name': 'Victor Glover', 'craft': 'ISS'}"
2,success,11,"{'name': 'Shannon Walker', 'craft': 'ISS'}"
3,success,11,"{'name': 'Soichi Noguchi', 'craft': 'ISS'}"
4,success,11,"{'name': 'Mark Vande Hei', 'craft': 'ISS'}"
5,success,11,"{'name': 'Oleg Novitskiy', 'craft': 'ISS'}"
6,success,11,"{'name': 'Pyotr Dubrov', 'craft': 'ISS'}"
7,success,11,"{'name': 'Thomas Pesquet', 'craft': 'ISS'}"
8,success,11,"{'name': 'Megan McArthur', 'craft': 'ISS'}"
9,success,11,"{'name': 'Shane Kimbrough', 'craft': 'ISS'}"


In [32]:
response.content

b'{"message": "success", "number": 11, "people": [{"name": "Mike Hopkins", "craft": "ISS"}, {"name": "Victor Glover", "craft": "ISS"}, {"name": "Shannon Walker", "craft": "ISS"}, {"name": "Soichi Noguchi", "craft": "ISS"}, {"name": "Mark Vande Hei", "craft": "ISS"}, {"name": "Oleg Novitskiy", "craft": "ISS"}, {"name": "Pyotr Dubrov", "craft": "ISS"}, {"name": "Thomas Pesquet", "craft": "ISS"}, {"name": "Megan McArthur", "craft": "ISS"}, {"name": "Shane Kimbrough", "craft": "ISS"}, {"name": "Akihiko Hoshide", "craft": "ISS"}]}'

# Google Maps API

You need to install `googlemaps` package in order to use this. 

Select `Anaconda Prompt` on your computer and then type `pip install --user googlemaps`. This should install googlemaps package that we can use here. 

In [33]:
import googlemaps

from datetime import datetime

### Google API key

Ideally you would need to create this key from your google API dashboard by logging in with your google accounts. I have provided this key to for a dummy account. It comes with it's own restrictions. You may want to create this for your own accounts. 

**NOTE**: The below key `AIzaSyC7sJdwW-skSS0UOR-OFOHeGRNa8TwoM18` might be disabled after the class. You can create your won key using the link [here](https://support.google.com/googleapi/answer/6158862?hl=en). 

In [34]:
gmaps = googlemaps.Client(key='AIzaSyDCdQCVKWQNNhERNEmuufTwmhDeDszV1ws')

In [35]:
cities_zip = pd.read_csv("./data/uscities_zip.csv", index_col = ['city', 'state_id'])

In [36]:
cities_zip.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,city_ascii,state_name,county_fips,county_name,lat,lng,population,population_proper,density,source,incorporated,timezone,zips,id
city,state_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Prairie Ridge,WA,Prairie Ridge,Washington,53053,Pierce,47.1443,-122.1408,,,1349.8,polygon,False,America/Los_Angeles,98360 98391,1840037882
Edison,WA,Edison,Washington,53057,Skagit,48.5602,-122.4311,,,127.4,polygon,False,America/Los_Angeles,98232,1840017314
Packwood,WA,Packwood,Washington,53041,Lewis,46.6085,-121.6702,,,213.9,polygon,False,America/Los_Angeles,98361,1840025265
Wautauga Beach,WA,Wautauga Beach,Washington,53035,Kitsap,47.5862,-122.5482,,,261.7,point,False,America/Los_Angeles,98366,1840037725
Harper,WA,Harper,Washington,53035,Kitsap,47.5207,-122.5196,,,342.1,point,False,America/Los_Angeles,98366,1840037659


In [38]:
datetime_object = datetime.strptime('Apr 30 2021  3:00PM', '%b %d %Y %I:%M%p')
datetime_object

datetime.datetime(2021, 4, 30, 15, 0)

In [39]:
json_text = gmaps.distance_matrix((cities_zip.loc['Detroit', 'MI']['lat'], 
                                   cities_zip.loc['Detroit', 'MI']['lng']),
                                  
                                  (cities_zip.loc['Chicago', 'IL']['lat'], 
                                   cities_zip.loc['Chicago', 'IL']['lng']), 
                                    
                                  departure_time= datetime_object)

In [40]:
json_text

{'destination_addresses': ['3201 S Western Ave, Chicago, IL 60608, USA'],
 'origin_addresses': ['1673 Calvert Ave, Detroit, MI 48206, USA'],
 'rows': [{'elements': [{'distance': {'text': '447 km', 'value': 447454},
     'duration': {'text': '4 hours 18 mins', 'value': 15489},
     'duration_in_traffic': {'text': '4 hours 15 mins', 'value': 15305},
     'status': 'OK'}]}],
 'status': 'OK'}