## Wild Life Dataset Cleaning 

1. Load the data

In [65]:
import numpy as np
import pandas as pd

# load dataset
wildlife = pd.read_csv('./data/wild_life_py.csv')
wildlife

Unnamed: 0,IUCN,IUCN Category,SPEC,Species,COU,Country,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,TOT_KNOWN,Total number of known species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,377.000,,
1,TOT_KNOWN_IND,Total number of indigenous known species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,352.000,,
2,ENDANGERED,Number of endangered species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,41.000,,
3,CRITICAL,Number of critically endangered species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,9.000,,
4,VULNERABLE,Number of vulnerable species,MAMMAL,Mammals,AUS,Australia,NBR,Number,0,Units,,,57.000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3626,TOT_KNOWN,Total number of known species,LICHEN,Lichens,NZL,New Zealand,NBR,Number,0,Units,,,2026.000,,
3627,TOT_KNOWN_IND,Total number of indigenous known species,MOSS,Mosses,COL,Colombia,NBR,Number,0,Units,,,52.000,,
3628,THREAT_PERCENT,Threatened species as % of known species,MOSS,Mosses,NZL,New Zealand,PC,Percentage,0,Units,,,1.468,,
3629,THREAT_PERCENT,Threatened species as % of known species,LICHEN,Lichens,NZL,New Zealand,PC,Percentage,0,Units,,,0.790,,


2. Drop the column

In [66]:
wildlife = wildlife.drop(columns=['Unit Code', 'Unit', 'PowerCode Code', 'PowerCode', 'Reference Period Code', 'Reference Period','Flag Codes', 'Flags'])
wildlife

Unnamed: 0,IUCN,IUCN Category,SPEC,Species,COU,Country,Value
0,TOT_KNOWN,Total number of known species,MAMMAL,Mammals,AUS,Australia,377.000
1,TOT_KNOWN_IND,Total number of indigenous known species,MAMMAL,Mammals,AUS,Australia,352.000
2,ENDANGERED,Number of endangered species,MAMMAL,Mammals,AUS,Australia,41.000
3,CRITICAL,Number of critically endangered species,MAMMAL,Mammals,AUS,Australia,9.000
4,VULNERABLE,Number of vulnerable species,MAMMAL,Mammals,AUS,Australia,57.000
...,...,...,...,...,...,...,...
3626,TOT_KNOWN,Total number of known species,LICHEN,Lichens,NZL,New Zealand,2026.000
3627,TOT_KNOWN_IND,Total number of indigenous known species,MOSS,Mosses,COL,Colombia,52.000
3628,THREAT_PERCENT,Threatened species as % of known species,MOSS,Mosses,NZL,New Zealand,1.468
3629,THREAT_PERCENT,Threatened species as % of known species,LICHEN,Lichens,NZL,New Zealand,0.790


3. Remove the rows that contain 'IND'

In [67]:
remove_str = 'IND'
rows = wildlife['IUCN'].str.contains(remove_str)
wildlife = wildlife[~rows]
wildlife

Unnamed: 0,IUCN,IUCN Category,SPEC,Species,COU,Country,Value
0,TOT_KNOWN,Total number of known species,MAMMAL,Mammals,AUS,Australia,377.000
2,ENDANGERED,Number of endangered species,MAMMAL,Mammals,AUS,Australia,41.000
3,CRITICAL,Number of critically endangered species,MAMMAL,Mammals,AUS,Australia,9.000
4,VULNERABLE,Number of vulnerable species,MAMMAL,Mammals,AUS,Australia,57.000
5,THREATENED,Total number of threatened species,MAMMAL,Mammals,AUS,Australia,107.000
...,...,...,...,...,...,...,...
3623,THREAT_PERCENT,Threatened species as % of known species,LICHEN,Lichens,CHL,Chile,0.145
3625,TOT_KNOWN,Total number of known species,MOSS,Mosses,NZL,New Zealand,1362.000
3626,TOT_KNOWN,Total number of known species,LICHEN,Lichens,NZL,New Zealand,2026.000
3628,THREAT_PERCENT,Threatened species as % of known species,MOSS,Mosses,NZL,New Zealand,1.468


Check the 'country' and 'species' column:

In [68]:
print("Countrys:\n", df['Country'].drop_duplicates().head(5))
print("Species:\n", df['Species'].drop_duplicates().head(5))

Countrys:
 0          Australia
6            Austria
15           Belgium
24            Canada
31    Czech Republic
Name: Country, dtype: object
Species:
 0              Mammals
230              Birds
477           Reptiles
679         Amphibians
883    Vascular plants
Name: Species, dtype: object


In [69]:
wildlife.to_csv('./data/wild_life_cleaned.csv', index = False, encoding='utf-8')

## Cleaning Climate Data

In [44]:
climate = pd.read_csv('./data/AIR_GHG.csv')
climate

Unnamed: 0,COU,Country,POL,Pollutant,VAR,Variable,YEA,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1990,1990,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,438056.760,,
1,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1991,1991,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,438049.290,,
2,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1992,1992,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,441752.180,,
3,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1993,1993,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,442282.480,,
4,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1994,1994,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,442610.560,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80471,ZAF,South Africa,PFC,Perfluorocarbons,INDEX_2000,"Total GHG excl. LULUCF, Index 2000=100",2016,2016,IDX,Index,0,Units,,,15.283,,
80472,ZAF,South Africa,PFC,Perfluorocarbons,INDEX_2000,"Total GHG excl. LULUCF, Index 2000=100",2017,2017,IDX,Index,0,Units,,,11.504,,
80473,ZAF,South Africa,PFC,Perfluorocarbons,INDEX_2000,"Total GHG excl. LULUCF, Index 2000=100",2018,2018,IDX,Index,0,Units,,,12.368,,
80474,ZAF,South Africa,PFC,Perfluorocarbons,INDEX_2000,"Total GHG excl. LULUCF, Index 2000=100",2019,2019,IDX,Index,0,Units,,,13.942,,


In [45]:
# drop the column
climate = climate.drop(columns=['YEA', 'PowerCode Code', 'PowerCode', 'Reference Period Code', 'Variable' ,'Unit Code','Unit', 'Reference Period', 'Flag Codes', 'Flags'])

In [46]:
climate

Unnamed: 0,COU,Country,POL,Pollutant,VAR,Year,Value
0,AUS,Australia,GHG,Greenhouse gases,TOTAL,1990,438056.760
1,AUS,Australia,GHG,Greenhouse gases,TOTAL,1991,438049.290
2,AUS,Australia,GHG,Greenhouse gases,TOTAL,1992,441752.180
3,AUS,Australia,GHG,Greenhouse gases,TOTAL,1993,442282.480
4,AUS,Australia,GHG,Greenhouse gases,TOTAL,1994,442610.560
...,...,...,...,...,...,...,...
80471,ZAF,South Africa,PFC,Perfluorocarbons,INDEX_2000,2016,15.283
80472,ZAF,South Africa,PFC,Perfluorocarbons,INDEX_2000,2017,11.504
80473,ZAF,South Africa,PFC,Perfluorocarbons,INDEX_2000,2018,12.368
80474,ZAF,South Africa,PFC,Perfluorocarbons,INDEX_2000,2019,13.942


In [47]:
df2 = climate.loc[climate['VAR'] == 'TOTAL']
df2 = df2.reset_index(drop = True)
df2

Unnamed: 0,COU,Country,POL,Pollutant,VAR,Year,Value
0,AUS,Australia,GHG,Greenhouse gases,TOTAL,1990,438056.760
1,AUS,Australia,GHG,Greenhouse gases,TOTAL,1991,438049.290
2,AUS,Australia,GHG,Greenhouse gases,TOTAL,1992,441752.180
3,AUS,Australia,GHG,Greenhouse gases,TOTAL,1993,442282.480
4,AUS,Australia,GHG,Greenhouse gases,TOTAL,1994,442610.560
...,...,...,...,...,...,...,...
11891,ZAF,South Africa,PFC,Perfluorocarbons,TOTAL,2016,150.262
11892,ZAF,South Africa,PFC,Perfluorocarbons,TOTAL,2017,113.104
11893,ZAF,South Africa,PFC,Perfluorocarbons,TOTAL,2018,121.603
11894,ZAF,South Africa,PFC,Perfluorocarbons,TOTAL,2019,137.073


In [64]:
df2.to_csv('./data/climate_cleaned.csv', index = False, encoding='utf-8')

:::{.callout-note}
Data Cleaning for the future: 
* CLEAN UP THE 'SPECIES' COLUMN; 'Plants' data should be simplified or maybe removed.

* REMOVE THE ROWS WITH TOO LITTLE AMOUNT OF DATA

* Check the climate change data and decide whether to use only MAMMAL data or MAMMAL and BIRD data or MAMMAL and BIRD and PLANTS data. (Based on the data size)
  
* AGGREGATE BY THE COUNTRY AND PLOT THE DATA BY 'VALUE' COLUMN
:::

## Text Data: NEWS API

:::{.callout-note}
Please click the 'Count Vectorization content using the nav-bar on the right
:::

In [49]:
# import packages
import requests
import json
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np

In [50]:
baseURL = "https://newsapi.org/v2/everything?"
total_requests = 2
verbose = True

API_KEY='a078005c866644ab8dce1c6637ec465f'

COU_1 = 'Australia Wildlife'
COU_2 = 'Austria Wildlife'
COU_3 = 'Belgium Wildlife'
COU_4 = "Canada Wildlife"
COU_5 = "Czech Republich Wildlife"
COU_6 = "Denmark Wildlife"
COU_7 = "Finland Wildlife"
COU_8 = "Germany Wildlife"
COU_9 = "Greece Wildlife"
COU_10 = "Hungary Wildlife"
COU_11 = "Iceland Wildlife"
COU_12 = "Italy Wildlife"
COU_13 = "Japan Wildlife"
COU_14 = "Korea Wildlife"
COU_15 = "Lativa Wildlife"
COU_16 = "Lithuania Wildlife"
COU_17 = "Luxembourg Wildlife"
COU_18 = "Mexico Wildlife"
COU_19 = "Netherlands Wildlife"
COU_20 = "New Zealand Wildlife"
COU_21 = "Norway Wildlife"
COU_22 = "Poland Wildlife"
COU_23 = "Protugal Wildlife"
COU_24 = "Slovak Republic Wildlife"
COU_25 = "Spain Wildlife"
COU_26 = "Sweden Wildlife"
COU_27 = "Switzerland Wildlife"
COU_28 = "Turkiye Wildlife"
COU_29 = "United Kindom Wildlife"

Using the 'Australia', 'Korea', 'Spain' text for Count Vectorization

In [51]:
text_list = []

In [52]:
# Australia Wildlife
URLpost_1  = {'apiKey': API_KEY,
            'q': '+'+ COU_1,
            'sortBy': 'relevancy',
            'totalRequests': 1}

response = requests.get(baseURL, URLpost_1)
response = response.json() # extract the text data from request into json

In [53]:
# function to clear string
def string_cleaner(input_string):
    try:
        out=re.sub(r"""
                   [,.;@#?!&$-]+ # Accept one or more copies of punctuation
                   \ *
                   """,
                   " ",
                   input_string, flags=re.VERBOSE) 
        
        out=re.sub('[’.]+', '', input_string)
        out=re.sub(r'\s+', ' ', out)
        out=out.lower()
        
    except: 
        print("ERROR")
        out=''
    return out

In [54]:
article_list=response['articles']
article_keys=article_list[0].keys()
index=0

cleaned_data_1=[];  
for article in article_list:
    tmp=[]

    for key in article_keys:
        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        if(key=='description'):
            tmp.append(string_cleaner(article[key]))

    cleaned_data_1.append(tmp)
    index+=1

In [55]:
COU_1_df = pd.DataFrame(cleaned_data_1)

# extract the title and description text and save it to 'cou_1' var.
title = str(np.array(COU_1_df[0]))
description = str(np.array(COU_1_df[1]))

COU_1_text = title + description
COU_1_text



In [56]:
# Korea Wildlife
URLpost_1  = {'apiKey': API_KEY,
            'q': '+'+ COU_14,
            'sortBy': 'relevancy',
            'totalRequests': 1}

response = requests.get(baseURL, URLpost_1)
response = response.json() # extract the text data from request into json

In [57]:
article_list=response['articles']
article_keys=article_list[0].keys()
index=0

cleaned_data_14=[];  
for article in article_list:
    tmp=[]

    for key in article_keys:
        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        if(key=='description'):
            tmp.append(string_cleaner(article[key]))

    cleaned_data_14.append(tmp)
    index+=1

In [58]:
COU_14_df = pd.DataFrame(cleaned_data_14)

# extract the title and description text and save it to 'cou_1' var.
title = str(np.array(COU_14_df[0]))
description = str(np.array(COU_14_df[1]))

COU_14_text = title + description
COU_14_text

'[\'in the studio: kieran stanley - designing a zoo\'\n \'ecuador\\\'s drug lords are building "narco-zoos" as status symbols\'\n \'dragons and other decorative tattoos by ehyang\'\n "like escobar, ecuador\'s drug lords build \'narco-zoos\'"\n \'reasons our civilization will soon collapse\'\n \'captivating and astonishing landmark natural history series, "planet earth iii," set to premiere on saturday, november 4 on bbc america and amc+\'\n \'world of forbes: stories of entrepreneurial capitalism across our 45 international editions\'\n \'record-low trapping numbers may point to trouble for minnesota muskrats\'\n \'10 obscure cryptids and why youve never heard of them\' \'[removed]\'\n \'links 10/24/2023\'\n \'apple tv+ shows and movies: everything to watch on apple tv plus\'\n \'apple tv+ shows and movies: everything to watch on apple tv plus\'\n \'apple tv+ shows and movies: everything to watch on apple tv plus\'\n \'apple tv+ shows and movies: everything to watch on apple tv plus\'\

In [59]:
# Spain Wildlife
URLpost_1  = {'apiKey': API_KEY,
            'q': '+'+ COU_25,
            'sortBy': 'relevancy',
            'totalRequests': 1}

response = requests.get(baseURL, URLpost_1)
response = response.json() # extract the text data from request into json

In [60]:
article_list=response['articles']
article_keys=article_list[0].keys()
index=0

cleaned_data_25=[];  
for article in article_list:
    tmp=[]

    for key in article_keys:
        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        if(key=='description'):
            tmp.append(string_cleaner(article[key]))

    cleaned_data_25.append(tmp)
    index+=1

In [61]:
COU_25_df = pd.DataFrame(cleaned_data_25)

# extract the title and description text and save it to 'cou_1' var.
title = str(np.array(COU_25_df[0]))
description = str(np.array(COU_25_df[1]))

COU_25_text = title + description
COU_25_text

'[\'photo of stinkbugs maternal behavior wins 2023 european wildlife photographer of the year\'\n \'lynxes and vultures offer insights for european wildlife conservation\'\n \'nature calls: the 2023 wildlife photographer of the year winners\'\n \'this algae-based 3d-printed surfboard design is sturdier than conventional foam boards\'\n \'stunning winning photos of the natures best photography international awards 2023\'\n \'slideshow: winners of the 2023 nikon small world photomicroscopy competition\'\n \'amazing wild animals in photos: 19 winners of wildlife photographer of the year 2023\'\n \'water-hungry golf courses plague arizonas native javelina-filled landscape\'\n \'the mystic art of gardening\' \'[removed]\'\n \'marine biologists photo of a golden horseshoe crab wins wildlife photography awards\'\n \'[removed]\' \'eumies awards 2024 announces its list of 362 nominees\'\n \'[removed]\' \'beqo hoti masters the scandinavian-grecian market\'\n \'reasons our civilization will soon 

In [62]:
text_list.append(COU_1_text)
text_list.append(COU_14_text)
text_list.append(COU_25_text)
print(text_list)



### Cleaning the Data using Count Vectorizer

Using Count Vectorization for 3 text data: Australia Wildlife, Korea Wildlife, Spain Wildlife

In [63]:
vectorizer = CountVectorizer()
vectorizer.fit(text_list)

print("Vocabulary: ", vectorizer.vocabulary_)

vector = vectorizer.transform(text_list)

print("Encoded Document is:")
print(vector.toarray())

Encoded Document is:
[[3 1 1 ... 1 1 1]
 [0 0 3 ... 0 0 0]
 [3 0 1 ... 0 0 0]]
