# Record Data Gathering

I will be using OECD API in order to gather some record data. The record data I will gather is 'Threatened Species (Wildlife)' dataset.

## Python API

In [20]:
# install packages
import os
import pandasdmx as sdmx
import pandas as pd
import requests
import json
import datetime
import numpy as np
import bs4 as beautifulsoup

In [29]:
endpoint = 'https://stats.oecd.org/SDMX-JSON/data/WILD_LIFE/TOT_KNOWN+TOT_KNOWN_IND+CRITICAL+CRITICAL_IND+ENDANGERED+ENDANGERED_IND+VULNERABLE+VULNERABLE_IND+THREATENED+THREATENED_IND+THREAT_PERCENT+IND_PERCENT.MAMMAL+BIRD+REPTILE+AMPHIBIAN+FISH_TOT+MARINE_F+FRESHW_F+VASCULAR_PLANT+MOSS+LICHEN+INVERTEB.AUS+AUT+BEL+CAN+CHL+COL+CRI+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+JPN+KOR+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+NMEC+BRA+RUS/all?&dimensionAtObservation=allDimensions&pid=b28d2ce7-811e-4a88-95ab-4cc005c77bd1'

try:
    response = requests.get(endpoint)
    response.raise_for_status()
    
    data = response.json()

    print(data)

except requests.HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')
except Exception as err:
    print(f'Error occurred: {err}')

{'header': {'id': 'ad78b66e-8c63-4ff9-9205-bf5417445421', 'test': False, 'prepared': '2023-12-06T00:33:35.903553Z', 'sender': {'id': 'OECD', 'name': 'Organisation for Economic Co-operation and Development'}, 'links': [{'href': 'https://stats.oecd.org:443/SDMX-JSON/data/WILD_LIFE/TOT_KNOWN+TOT_KNOWN_IND+CRITICAL+CRITICAL_IND+ENDANGERED+ENDANGERED_IND+VULNERABLE+VULNERABLE_IND+THREATENED+THREATENED_IND+THREAT_PERCENT+IND_PERCENT.MAMMAL+BIRD+REPTILE+AMPHIBIAN+FISH_TOT+MARINE_F+FRESHW_F+VASCULAR_PLANT+MOSS+LICHEN+INVERTEB.AUS+AUT+BEL+CAN+CHL+COL+CRI+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+JPN+KOR+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+NMEC+BRA+RUS/all?&dimensionAtObservation=allDimensions&pid=b28d2ce7-811e-4a88-95ab-4cc005c77bd1', 'rel': 'request'}]}, 'dataSets': [{'action': 'Information', 'observations': {'0:0:0': [377.0, None, 0, 0, None], '1:0:0': [352.0, None, 0, 0, None], '2:0:0': [41.0, None, 0, 0, None], '3:0:0': [9.0, None, 0, 0, None], '4:0:0': [57

In [30]:
# data.keys()
# data['structure'].keys()
data['structure']['dimensions']

{'observation': [{'keyPosition': 0,
   'id': 'IUCN',
   'name': 'IUCN Category',
   'values': [{'id': 'TOT_KNOWN', 'name': 'Total number of known species'},
    {'id': 'TOT_KNOWN_IND',
     'name': 'Total number of indigenous known species'},
    {'id': 'ENDANGERED', 'name': 'Number of endangered species'},
    {'id': 'CRITICAL', 'name': 'Number of critically endangered species'},
    {'id': 'VULNERABLE', 'name': 'Number of vulnerable species'},
    {'id': 'THREATENED', 'name': 'Total number of threatened species'},
    {'id': 'ENDANGERED_IND',
     'name': 'Number of endangered indigenous species'},
    {'id': 'CRITICAL_IND',
     'name': 'Number of critically endangered indigenous species'},
    {'id': 'VULNERABLE_IND',
     'name': 'Number of vulnerable indigenous species'},
    {'id': 'THREAT_PERCENT',
     'name': 'Threatened species as % of known species'},
    {'id': 'THREATENED_IND',
     'name': 'Total number of indigenous threatened species'},
    {'id': 'IND_PERCENT',
     '

In [18]:
df = pd.DataFrame.from_dict(data['dataSets'][0]['observations'], orient = 'index')
df

Unnamed: 0,0,1,2,3,4
0:0:0,377.000,,0,0,
1:0:0,352.000,,0,0,
2:0:0,41.000,,0,0,
3:0:0,9.000,,0,0,
4:0:0,57.000,,0,0,
...,...,...,...,...,...
0:9:18,2026.000,,0,0,
1:8:34,52.000,,0,0,
9:8:18,1.468,,1,0,
9:9:18,0.790,,1,0,


In [19]:
df.to_csv('./data/wildlife_py.csv', index=False)

The data we gathered via API looks messy, so we will move onto data cleaning and make this data look more neat and cleaned.  

## R API

In [12]:
library(httr)

res <- VERB("GET", url = "https://stats.oecd.org/SDMX-JSON/data/WILD_LIFE/TOT_KNOWN+TOT_KNOWN_IND+CRITICAL+CRITICAL_IND+ENDANGERED+ENDANGERED_IND+VULNERABLE+VULNERABLE_IND+THREATENED+THREATENED_IND+THREAT_PERCENT+IND_PERCENT.MAMMAL+BIRD+REPTILE+AMPHIBIAN+FISH_TOT+MARINE_F+FRESHW_F+VASCULAR_PLANT+MOSS+LICHEN+INVERTEB.AUS+AUT+BEL+CAN+CHL+COL+CRI+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+JPN+KOR+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+NMEC+BRA+RUS/all?&dimensionAtObservation=allDimensions&pid=b28d2ce7-811e-4a88-95ab-4cc005c77bd1")

cat(content(res, 'text'))

raw_Data <- content(res,'text')

{"header":{"id":"f5b61dbd-e44b-4be9-b761-24aff4be5acf","test":false,"prepared":"2023-12-06T07:42:21.9362756Z","sender":{"id":"OECD","name":"Organisation for Economic Co-operation and Development"},"links":[{"href":"https://stats.oecd.org:443/SDMX-JSON/data/WILD_LIFE/TOT_KNOWN+TOT_KNOWN_IND+CRITICAL+CRITICAL_IND+ENDANGERED+ENDANGERED_IND+VULNERABLE+VULNERABLE_IND+THREATENED+THREATENED_IND+THREAT_PERCENT+IND_PERCENT.MAMMAL+BIRD+REPTILE+AMPHIBIAN+FISH_TOT+MARINE_F+FRESHW_F+VASCULAR_PLANT+MOSS+LICHEN+INVERTEB.AUS+AUT+BEL+CAN+CHL+COL+CRI+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+JPN+KOR+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+NMEC+BRA+RUS/all?&dimensionAtObservation=allDimensions&pid=b28d2ce7-811e-4a88-95ab-4cc005c77bd1","rel":"request"}]},"dataSets":[{"action":"Information","observations":{"0:0:0":[377.0,null,0,0,null],"1:0:0":[352.0,null,0,0,null],"2:0:0":[41.0,null,0,0,null],"3:0:0":[9.0,null,0,0,null],"4:0:0":[57.0,null,0,0,null],"5:0:0":[107.0,null,0,0,nul

In [15]:
library(jsonlite)

data <- fromJSON(raw_data)

dimensions <- data$structure$dimensions


for (dim in dimensions) {
  dim_id <- dim$id
  dim_name <- dim$name
  dim_values <- dim$values
  
  cat("Dimension ID:", dim_id, "\n")
  cat("Dimension Name:", dim_name, "\n")
  
  cat("Dimension Values:\n")
  for (value in dim_values) {
    cat("ID:", value$id, " Name:", value$name, "\n")
  }
  cat("\n")
}


Dimension ID: IUCN SPEC COU 
Dimension Name: IUCN Category Species Country 
Dimension Values:
ID: TOT_KNOWN TOT_KNOWN_IND ENDANGERED CRITICAL VULNERABLE THREATENED ENDANGERED_IND CRITICAL_IND VULNERABLE_IND THREAT_PERCENT THREATENED_IND IND_PERCENT  Name: Total number of known species Total number of indigenous known species Number of endangered species Number of critically endangered species Number of vulnerable species Total number of threatened species Number of endangered indigenous species Number of critically endangered indigenous species Number of vulnerable indigenous species Threatened species as % of known species Total number of indigenous threatened species Threatened indigenous species as % of indigenous spec. 
ID: MAMMAL BIRD REPTILE AMPHIBIAN VASCULAR_PLANT FISH_TOT MARINE_F FRESHW_F MOSS LICHEN INVERTEB  Name: Mammals Birds Reptiles Amphibians Vascular plants Fish Marine Fish Freshwater Fish Mosses Lichens Invertebrates 
ID: AUS AUT BEL CAN CZE DNK FIN FRA DEU GRC HUN I

In [27]:
library(httr)
# The URL for the .csv file you want
csv_url = "https://jpj.georgetown.domains/dsan5000-scratch/WILD_LIFE_10102023223859340.csv"
# The filename you'd like to save it to, on your local drive
local_filename = "./data/wildlife_r.csv"
GET(csv_url, write_disk(local_filename), progress(), quiet = TRUE)

data <- read.csv('./data/wildlife_r.csv')
head(data)



Response [https://jpj.georgetown.domains/dsan5000-scratch/WILD_LIFE_10102023223859340.csv]
  Date: 2023-12-06 08:06
  Status: 200
  Content-Type: text/csv
  Size: 476 kB
<ON DISK>  ./data/wildlife_r.csvNULL

Unnamed: 0_level_0,X...IUCN,IUCN.Category,SPEC,Species,COU,Country,Unit.Code,Unit,PowerCode.Code,PowerCode,Reference.Period.Code,Reference.Period,Value,Flag.Codes,Flags
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<lgl>,<lgl>,<dbl>,<chr>,<chr>
1,TOT_KNOWN,Total number of known species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,377,,
2,TOT_KNOWN_IND,Total number of indigenous known species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,352,,
3,ENDANGERED,Number of endangered species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,41,,
4,CRITICAL,Number of critically endangered species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,9,,
5,VULNERABLE,Number of vulnerable species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,57,,
6,THREATENED,Total number of threatened species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,107,,


Before using certain APIs, I would have to get used to JSON format and parsing it. 

Overall, gathering the data in Python was easier but dealing with JSON in R was challenging. 

# Text Data Gathering

I will be using both News API and Wikipedia API for Text Data. 

## News API

In [40]:
# import packages
import requests
import json
import re
import pandas as pd
import numpy as np

In [41]:
# function to clean string
def string_cleaner(input_string):
    try:
        out=re.sub(r"""
                   [,.;@#?!&$-]+ # Accept one or more copies of punctuation
                   \ *
                   """,
                   " ",
                   input_string, flags=re.VERBOSE) 
        
        out=re.sub('[’.]+', '', input_string)
        out=re.sub(r'\s+', ' ', out)
        out=out.lower()
        
    except: 
        print("ERROR")
        out=''
    return out

In [42]:
baseURL = "https://newsapi.org/v2/everything?"
total_requests = 2
verbose = True

API_KEY='a078005c866644ab8dce1c6637ec465f'

COU_1 = 'Wildlife'
COU_2 = 'Threatened Species'
COU_3 = 'Climate Change'

In [43]:
text_list = []

# Wildlife News
URLpost_1  = {'apiKey': API_KEY,
            'q': '+'+ COU_1,
            'sortBy': 'relevancy',
            'totalRequests': 1}

response = requests.get(baseURL, URLpost_1)
response = response.json() # extract the text data from request into json

article_list=response['articles']
article_keys=article_list[0].keys()
index=0

cleaned_data_1=[];  
for article in article_list:
    tmp=[]

    for key in article_keys:
        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        if(key=='description'):
            tmp.append(string_cleaner(article[key]))

    cleaned_data_1.append(tmp)
    index+=1
    
COU_1_df = pd.DataFrame(cleaned_data_1)

# extract the title and description text and save it to 'cou_1' var.
title = str(np.array(COU_1_df[0]))
description = str(np.array(COU_1_df[1]))

COU_1_text = title + description
COU_1_text

'[\'bolivia wildfires: locals care for animals affected by blazes\'\n \'video shows car interior mauled by a trapped bear\'\n "spacex prepares for starship\'s second test flight after securing faa clearance"\n "a woman transformed a 130-acre property in new york into an organic farm complete with a cozy yurt now it\'s on the market for just over $2 million — take a look"\n "the bureau of land management will no longer allow the use of \'cyanide bombs\' to kill coyotes"\n "how wildlife officials saved a humpback whale found \'hogtied\' to a 300-pound crab pot"\n "if you\'re having a rough day, these baby wallabies will cheer you up!"\n \'californias giant sequoias are in big trouble\'\n \'us moves to protect wolverines as climate change melts their mountain refuges\'\n \'2 manatees named romeo and juliet that have lived in a tank at a florida theme park since 1956 will finally be freed after pressure from activists\'\n \'dozens of drunken grizzly bears have been killed by trains in mont

In [34]:
# Threatened Species News
URLpost_2  = {'apiKey': API_KEY,
            'q': '+'+ COU_2,
            'sortBy': 'relevancy',
            'totalRequests': 1}

response = requests.get(baseURL, URLpost_2)
response = response.json() # extract the text data from request into json

article_list=response['articles']
article_keys=article_list[0].keys()
index=0

cleaned_data_2=[];  
for article in article_list:
    tmp=[]

    for key in article_keys:
        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        if(key=='description'):
            tmp.append(string_cleaner(article[key]))

    cleaned_data_2.append(tmp)
    index+=1
    
COU_2_df = pd.DataFrame(cleaned_data_2)

# extract the title and description text and save it to 'cou_1' var.
title = str(np.array(COU_2_df[0]))
description = str(np.array(COU_2_df[1]))

COU_2_text = title + description
COU_2_text

ERROR




In [37]:
# Endangered Species News
URLpost_3  = {'apiKey': API_KEY,
            'q': '+'+ COU_3,
            'sortBy': 'relevancy',
            'totalRequests': 1}

response = requests.get(baseURL, URLpost_3)
response = response.json() # extract the text data from request into json

article_list=response['articles']
article_keys=article_list[0].keys()
index=0

cleaned_data_3=[];  
for article in article_list:
    tmp=[]

    for key in article_keys:
        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        if(key=='description'):
            tmp.append(string_cleaner(article[key]))

    cleaned_data_3.append(tmp)
    index+=1
    
COU_3_df = pd.DataFrame(cleaned_data_3)

# extract the title and description text and save it to 'cou_1' var.
title = str(np.array(COU_3_df[0]))
description = str(np.array(COU_3_df[1]))

COU_3_text = title + description
COU_3_text

ERROR
ERROR




In [45]:
text_list.append(COU_1_text)
text_list.append(COU_2_text)
text_list.append(COU_3_text)
print(text_list)



## Wikipedia API

In [46]:
# conda install -c conda-forge wikipedia
# conda install -c conda-forge wordcloud
# pip install wikipedia_sections

import warnings
warnings.filterwarnings('ignore')

import wikipedia
import nltk
import string 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon', quiet = True)
nltk.download('punkt', quiet = True)
nltk.download('wordnet', quiet = True)
nltk.download('omw-1.4', quiet = True)
nltk.download('stopwords', quiet = True)

True

In [47]:
# PARAMETERS 
label_list=['Wildlife','Threatened Species','Climate Change']
max_num_pages=25
sentence_per_chunk=5
min_sentence_length=20

# GET STOPWORDS
# from nltk.corpus import stopwords
stop_words=nltk.corpus.stopwords.words('english')

# INITALIZE STEMMER+LEMITZIZER+SIA
sia = SentimentIntensityAnalyzer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [48]:
def clean_string(text):
	# #FILTER OUT UNWANTED CHAR
	new_text=""
	# keep=string.printable
	keep=" abcdefghijklmnopqrstuvwxyz0123456789"
	for character in text:
		if character.lower() in keep:
			new_text+=character.lower()
		else: 
			new_text+=" "
	text=new_text
	# print(text)

	# #FILTER OUT UNWANTED WORDS
	new_text=""
	for word in nltk.tokenize.word_tokenize(text):
		if word not in nltk.corpus.stopwords.words('english'):
			#lemmatize 
			tmp=lemmatizer.lemmatize(word)
			# tmp=stemmer.stem(tmp)

			# update word if there is a change
			# if(tmp!=word): print(tmp,word)
			
			word=tmp
			if len(word)>1:
				if word in [".",",","!","?",":",";"]:
					#remove the last space
					new_text=new_text[0:-1]+word+" "
				else: #add a space
					new_text+=word.lower()+" "
	text=new_text.strip()
	return text

In [49]:
#INITIALIZE 
corpus=[]  # list of strings (input variables X)
targets=[] # list of targets (labels or response variables Y)

#--------------------------
# LOOP OVER TOPICS 
#--------------------------
for label in label_list:

	#SEARCH FOR RELEVANT PAGES 
	titles=wikipedia.search(label,results=max_num_pages)
	print("Pages for label =",label,":",titles)

	#LOOP OVER WIKI-PAGES
	for title in titles:
		try:
			# print("	",title)
			wiki_page = wikipedia.page(title, auto_suggest=True)

			# LOOP OVER SECTIONS IN ARTICLE AND GET PAGE TEXT
			for section in wiki_page.sections:
				text=wiki_page.section(section); #print(text)

				#BREAK IN TO SENTANCES 
				sentences=nltk.tokenize.sent_tokenize(text)
				counter=0
				text_chunk=''

				#LOOP OVER SENTENCES 
				for sentence in sentences:
					if len(sentence)>min_sentence_length:
						if(counter%sentence_per_chunk==0 and counter!=0):
							# PROCESS COMPLETED CHUNK 
							
							# CLEAN STRING
							text_chunk=clean_string(text_chunk)

							# REMOVE LABEL IF IN STRING (MAKES IT TOO EASY)
							text_chunk=text_chunk.replace(label,"")
							
							# REMOVE ANY DOUBLE SPACES
							text_chunk=' '.join(text_chunk.split()).strip()

							#UPDATE CORPUS 
							corpus.append(text_chunk)

							#UPDATE TARGETS
							score=sia.polarity_scores(text_chunk)
							target=[label,score['compound']]
							targets.append(target)

							#print("TEXT\n",text_chunk,target)

							# RESET CHUNK FOR NEXT ITERATION 
							text_chunk=sentence
						else:
							text_chunk+=sentence
						#print("--------\n", sentence)
						counter+=1

		except:
			continue; 

Pages for label = Wildlife : ['Wildlife', 'United States Fish and Wildlife Service', 'Wildlife conservation', 'Wildlife Park', 'List of wildlife sanctuaries of India', 'Wildlife photography', 'BBC Wildlife', 'World Wide Fund for Nature', 'List of protected areas of Assam', 'Wildlife trust', 'Kuno National Park', 'Wildlife crossing', 'Rahmat International Wildlife Museum & Gallery', 'Nature reserve', 'National Wildlife Refuge', 'Wildlife observation', 'Wildlife!', 'The Wildlife Trusts', 'Mukundara Hills National Park', 'Utah', 'List of protected areas of Gujarat', 'Conservation status', 'List of states and territories of the United States', 'Gir National Park', 'Mandai Wildlife Group']
Pages for label = Threatened Species : ['Threatened species', 'Near-threatened species', 'IUCN Red List', 'List of threatened species of the Philippines', 'Endangered Species Act of 1973', 'List of endangered species in Pakistan', 'Endangered species', 'Conservation status', 'Lists of IUCN Red List critic

In [50]:
#SANITY CHECKS AND PRINT TO FILE 
print("number of text chunks = ",len(corpus))
print("number of targets = ",len(targets))

tmp=[]
for i in range(0,len(corpus)):
    tmp.append([corpus[i],targets[i][0],targets[i][1]])
df=pd.DataFrame(tmp)
df=df.rename(columns={0: "text", 1: "label", 2: "sentiment"})
print(df)
df.to_csv('./data/wiki_textdata.csv',index=False)

number of text chunks =  849
number of targets =  849
                                                  text           label  \
0    pursuant eagle feather law title 50 part 22 co...        Wildlife   
1    founding 1896 work division biological survey ...        Wildlife   
2    edward goldman survey made perfectly clear pos...        Wildlife   
3    1940 1970 fws 1956 usfws operated fleet seagoi...        Wildlife   
4    upon creation 1940 fws inherited bof fleet bro...        Wildlife   
..                                                 ...             ...   
844  drip irrigation especially identified water ef...  Climate Change   
845  irrigation use waste treated water focusing in...  Climate Change   
846  several initiative local site specific local n...  Climate Change   
847  rwanda developed national adaptation programme...  Climate Change   
848  future climate africa programme african climat...  Climate Change   

     sentiment  
0       0.9670  
1       0.9468  
2     