In [56]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

In [None]:
# link to landslide data
# https://data.nasa.gov/Earth-Science/Global-Landslide-Catalog-Export/dd9e-wu2v

In [None]:
# request a nasa api key here: https://api.nasa.gov/index.html#signUp

In [2]:
# save your personal api key in a text file and read it in, I saved mine as nasa_key
with open('nasa_apikey.txt', 'r') as file:
    nasa_key = file.read()

In [3]:
# initialize the payload
payload = {}

# save the apikey as a header to use in request.get()
headers= {
  "apikey": nasa_key
}

{}

In [4]:
# url for accessing the data - this is usually found in the API documentation
nasa_url = 'https://data.nasa.gov/resource/dd9e-wu2v.json'

In [8]:
# request to pull the data with your API
r = requests.get(nasa_url, headers=headers, data = payload)

In [52]:
# this will return True if your request is successful
r.ok

True

In [137]:
landslide = r.json()
landslide_df = pd.DataFrame(landslide)

In [138]:
landslide_df.shape

(1000, 30)

In [139]:
landslide_df = landslide_df[['event_date', 'landslide_category', 'landslide_trigger', 
'landslide_size', 'fatality_count', 'injury_count', 'country_name', 'longitude', 'latitude']]

In [140]:
landslide_df.isnull().sum()

event_date              0
landslide_category      0
landslide_trigger       1
landslide_size          3
fatality_count        110
injury_count          672
country_name          148
longitude               0
latitude                0
dtype: int64

In [141]:
landslide_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   event_date          1000 non-null   object
 1   landslide_category  1000 non-null   object
 2   landslide_trigger   999 non-null    object
 3   landslide_size      997 non-null    object
 4   fatality_count      890 non-null    object
 5   injury_count        328 non-null    object
 6   country_name        852 non-null    object
 7   longitude           1000 non-null   object
 8   latitude            1000 non-null   object
dtypes: object(9)
memory usage: 70.4+ KB


In [142]:
# changing numeric variables to type float instead of type object
landslide_df['fatality_count'] = landslide_df['fatality_count'].astype(float)
landslide_df['injury_count'] = landslide_df['injury_count'].astype(float)
landslide_df['longitude'] = landslide_df['longitude'].astype(float)
landslide_df['latitude'] = landslide_df['latitude'].astype(float)

# simplifying event_date and changing to type datetime
landslide_df['event_date'] = landslide_df['event_date'].str.extract(r'(\d+\-\d+\-\d+)')
landslide_df['event_date'] = pd.to_datetime(landslide_df['event_date'])

In [143]:
landslide_df['landslide_size'] = landslide_df['landslide_size'].fillna('unknown')
landslide_df['landslide_trigger'] = landslide_df['landslide_trigger'].fillna('unknown')
landslide_df['country_name'] = landslide_df['country_name'].fillna('other/unknown')
landslide_df['fatality_count'] = landslide_df['fatality_count'].fillna(0)
landslide_df['injury_count'] = landslide_df['injury_count'].fillna(0)

In [144]:
landslide_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   event_date          1000 non-null   datetime64[ns]
 1   landslide_category  1000 non-null   object        
 2   landslide_trigger   1000 non-null   object        
 3   landslide_size      1000 non-null   object        
 4   fatality_count      1000 non-null   float64       
 5   injury_count        1000 non-null   float64       
 6   country_name        1000 non-null   object        
 7   longitude           1000 non-null   float64       
 8   latitude            1000 non-null   float64       
dtypes: datetime64[ns](1), float64(4), object(4)
memory usage: 70.4+ KB


In [145]:
landslide_df.head()

Unnamed: 0,event_date,landslide_category,landslide_trigger,landslide_size,fatality_count,injury_count,country_name,longitude,latitude
0,2014-05-20,landslide,downpour,small,0.0,0.0,New Zealand,167.6337,-46.7748
1,2010-05-29,landslide,downpour,medium,0.0,0.0,New Zealand,170.0982,-45.9034
2,2015-06-04,landslide,downpour,medium,0.0,0.0,New Zealand,170.6273,-45.8767
3,2013-06-17,landslide,downpour,medium,0.0,0.0,New Zealand,170.5863,-45.8727
4,2014-05-01,landslide,rain,small,0.0,0.0,New Zealand,170.509,-45.8618


In [147]:
landslide_df.shape

(1000, 9)

In [151]:
landslide_df.to_csv("landslide", index = False)

In [152]:
from pandas import read_csv


csv = read_csv('landslide')

In [154]:
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   event_date          1000 non-null   object 
 1   landslide_category  1000 non-null   object 
 2   landslide_trigger   1000 non-null   object 
 3   landslide_size      1000 non-null   object 
 4   fatality_count      1000 non-null   float64
 5   injury_count        1000 non-null   float64
 6   country_name        1000 non-null   object 
 7   longitude           1000 non-null   float64
 8   latitude            1000 non-null   float64
dtypes: float64(4), object(5)
memory usage: 70.4+ KB
