# Machine Learning

Machine learning is a subset of artificial intelligence that enables computers to learn from data without being explicitly programmed. Algorithms are trained on large datasets to recognize patterns, make predictions, and improve their performance over time. Models can be trained on many types of data, such as images, text, or audio, and applied to a wide range of tasks, including natural language processing, computer vision, and predictive analytics: for example, identifying objects in images, understanding human language, or predicting future events.

## Setup

In [1]:
from opensoundscape.spectrogram import Spectrogram
from opensoundscape import Audio
from pathlib import Path
import os
import glob

# Dataset locations, relative to the working directory.
# src_data_dir holds the raw .wav files, one sub-directory per class;
# train_data_dir receives the generated spectrogram images.
src_data_dir = "data/audio/training_sets/small"
train_data_dir = "data/audio/training_sets/train_data"
test_data_dir = "data/audio/training_sets/test_data"

# Band-pass limits, passed as low_f/high_f (audio) and min_f/max_f (spectrogram).
low_cut, high_cut = 300, 8000

# Clip settings: each recording is resampled to `freq` and split into clips of
# `clip_seconds`, with `overlap_seconds` of overlap between consecutive clips.
freq = 44100
clip_seconds = 1
overlap_seconds = 0.1

# Shape passed to Spectrogram.to_image() when rendering each clip.
image_shape = (200, 200)


  from tqdm.autonotebook import tqdm


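## Datasets
For this portion, you may either use the training set included in the supplementary audio files or generate a dataset from iNaturalist. Set `inaturalist_data` in the next cell to `True` to download from iNaturalist, or to `False` to use the included training set.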

In [2]:
# Set to False to keep the src_data_dir configured earlier instead of
# downloading a fresh dataset from iNaturalist.
inaturalist_data = True

# get data from iNaturalist
if inaturalist_data:
    from pyinaturalist import get_observation_species_counts, TaxonCount, pprint, get_observations
    import json
    import urllib.request
    import re
    from novus_pytils.files import (create_directory, directory_exists, delete_directory)

    def safe_serialize(obj):
        """Return obj as a JSON string, with a placeholder for non-serializable values."""
        default = lambda o: f"<<non-serializable: {type(o).__qualname__}>>"
        return json.dumps(obj, default=default)

    src_data_dir = 'data/audio/training_sets/inat/'

    # start from an empty download directory on every run
    delete_directory(src_data_dir)
    create_directory(src_data_dir)

    # define the place_id and taxon_id of interest from iNaturalist
    scp_place_id = 130630
    frog_taxon_id = 20979

    # only WAV sounds are downloaded, so the extension is fixed
    file_ext = "wav"

    # get the observation species counts for frogs at Steele Creek Park
    response = get_observation_species_counts(place_id=scp_place_id, taxon_id=frog_taxon_id)

    # response comes in as a list of dictionaries in JSON format, Taxon count will help us parse it
    taxa = TaxonCount.from_json_list(response['results'][:])

    # loop through json results
    for taxon in taxa:
        # NOTE(review): only the first page (up to 200 observations) is requested.
        obs = get_observations(taxon_id=taxon.id, sounds=True, per_page=200)
        print(f"Collecting wav audio observations for {taxon.full_name}")

        for ob in obs['results']:
            for o in ob['sounds']:
                if o['file_content_type'] != 'audio/x-wav':
                    continue

                try:
                    # strip everything except letters and digits so the common
                    # name can be used as a directory (class) name
                    taxon_name = re.sub(r"[\W_]+", "", ob['taxon']['preferred_common_name'])

                    species_dir = f"{src_data_dir}{taxon_name}/"

                    if not directory_exists(species_dir):
                        create_directory(species_dir)

                    # name the file after this sound's own id; using the first
                    # sound's id made every sound of an observation overwrite
                    # the same file
                    urllib.request.urlretrieve(f"{o['file_url']}", f"{species_dir}{o['id']}.{file_ext}")
                except Exception as err:
                    # best effort: report the failure and carry on with the next
                    # sound (Ctrl-C is no longer swallowed, unlike a bare except)
                    print(f"  skipped sound {o.get('id')}: {err!r}")

# Every sub-directory of the source data directory is one class label.
classes = [entry.name for entry in Path(src_data_dir).iterdir() if entry.is_dir()]
num_classes = len(classes)


Collecting wav audio observations for Lithobates clamitans (Green Frog)
Collecting wav audio observations for Lithobates palustris (Pickerel Frog)
Collecting wav audio observations for Lithobates sylvaticus (Wood Frog)
Collecting wav audio observations for Anaxyrus americanus (American Toad)
Collecting wav audio observations for Pseudacris feriarum (Upland Chorus Frog)
Collecting wav audio observations for Hyla chrysoscelis (Cope's Gray Treefrog)
Collecting wav audio observations for Pseudacris crucifer (Spring Peeper)
Collecting wav audio observations for Lithobates catesbeianus (American Bullfrog)


## Data Processing / Spectrogram Creation

In [3]:
# ETL process for creating spectrogram images from .wav files

from novus_pytils.files import (create_directory, directory_exists, delete_directory)

# Rebuild the spectrogram image directory of every species from its .wav files.
for species in classes:
    species_dir = os.path.join(train_data_dir, species)

    # delete any existing training directory so images from an earlier run
    # are not mixed in with the new ones
    if directory_exists(species_dir):
        delete_directory(species_dir)

    # make directories for each species
    create_directory(species_dir)

    # One running index per species. Restarting it for every recording made
    # each .wav overwrite the images (0.png, 1.png, ...) of the previous one.
    count = 0

    # sorted so the image numbering is the same from run to run
    for wav in sorted(glob.glob(os.path.join(src_data_dir, species, "*.wav"))):
        audio_object = Audio.from_file(wav)
        audio_object = audio_object.resample(freq)
        audio_object = audio_object.bandpass(low_f=low_cut, high_f=high_cut, order=12)

        # fixed-length, overlapping clips; final_clip=None is passed through as before
        clips, clip_df = audio_object.split(clip_duration=clip_seconds, clip_overlap=overlap_seconds, final_clip=None)

        for clip in clips:
            spectrogram_object = Spectrogram.from_audio(clip, window_samples=400).bandpass(min_f=low_cut, max_f=high_cut)
            spectrogram_image = spectrogram_object.to_image(shape=image_shape)
            spectrogram_image.save(os.path.join(species_dir, f"{count}.png"))
            count += 1

