In [2]:
import requests
import librosa
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os

GLOBALS

In [2]:
# Root folder under which make_dirs() and parse_files() create one sub-folder per genus.
data_dir = 'data'

In [3]:
# Exploratory call: show the raw record dicts the xeno-canto API returns for
# genus Acrocephalus recorded in Russia, to see which fields are available.
requests.get(f'https://xeno-canto.org/api/2/recordings?query=gen:Acrocephalus+cnt:russia').json()['recordings']

[{'id': '835032',
  'gen': 'Acrocephalus',
  'sp': 'arundinaceus',
  'ssp': '',
  'group': 'birds',
  'en': 'Great Reed Warbler',
  'rec': 'Albert Lastukhin',
  'cnt': 'Russian Federation',
  'loc': 'Tonya Avangardnaya, Kamyzyaksky District, Astrakhan Oblast',
  'lat': '45.685',
  'lng': '48.0622',
  'alt': '-30',
  'type': 'alarm call',
  'sex': '',
  'stage': '',
  'method': 'field recording',
  'url': '//xeno-canto.org/835032',
  'file': 'https://xeno-canto.org/835032/download',
  'file-name': 'XC835032-Acrocephalus-arundinaceus-0909_080143,р.-Кизань-разливы,A-alarm.wav',
  'sono': {'small': '//xeno-canto.org/sounds/uploaded/LELYWQKUZX/ffts/XC835032-small.png',
   'med': '//xeno-canto.org/sounds/uploaded/LELYWQKUZX/ffts/XC835032-med.png',
   'large': '//xeno-canto.org/sounds/uploaded/LELYWQKUZX/ffts/XC835032-large.png',
   'full': '//xeno-canto.org/sounds/uploaded/LELYWQKUZX/ffts/XC835032-full.png'},
  'osci': {'small': '//xeno-canto.org/sounds/uploaded/LELYWQKUZX/wave/XC835032-smal

In [7]:
def get_birds_names() -> set:
    """Return the set of bird genera found by the ``loc:Leningradskaya`` query.

    Sends one request to the xeno-canto API (v2) and collects the ``gen``
    field of every recording in the answer.

    Returns:
        set: Distinct, non-empty genus names. Empty when nothing matched.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(
        'https://xeno-canto.org/api/2/recordings?query=loc:Leningradskaya',
        timeout=60,
    )
    response.raise_for_status()
    # NOTE(review): only the records contained in this single response are
    # used; if the API splits results over several pages the rest is ignored
    # - confirm whether that is intended.
    recordings = response.json()['recordings']
    # Reading the field directly avoids building a DataFrame for one column
    # and, unlike a column lookup, does not fail when the list is empty.
    return {rec['gen'] for rec in recordings if rec.get('gen')}

In [8]:
def make_dirs(birds):
    """Create one sub-directory of ``data_dir`` per bird genus.

    Args:
        birds: Iterable of genus names; each becomes ``<data_dir>/<genus>``.

    Directories that already exist are left untouched, so the call can be
    repeated safely.
    """
    for bird in birds:
        # makedirs also creates ``data_dir`` itself when it is missing, and
        # exist_ok replaces the racy "check exists, then mkdir" sequence.
        os.makedirs(os.path.join(data_dir, bird), exist_ok=True)

In [None]:
# TODO: for each file, examine a time series (e.g. of the mean), extract features from it with fbkats, and then fit a model such as logistic regression

In [163]:
def _save_url(url, path, timeout=60):
    """Download ``url`` into ``path``; return True on success, False otherwise.

    A failed request (network error or HTTP error status) is reported and
    skipped, so an error page is never written to disk as if it were media.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'skipping {url}: {exc}')
        return False
    with open(path, 'wb') as f:
        f.write(response.content)
    return True


def _absolute_url(url):
    """Prefix a protocol-relative ``//host/path`` URL with ``http:``."""
    return 'http:' + url if url.startswith('//') else url


def parse_files(birds):
    """Download audio, sonogram and oscillogram files for every genus.

    For each genus the xeno-canto API is queried for recordings made in
    Russia. For every returned record up to three files are stored in
    ``<data_dir>/<genus>/``:

    * ``<id>.mp3``            - the audio behind the record's ``file`` URL
    * ``<id>_sono<ext>``      - the image behind ``sono.full``
    * ``<id>_osci<ext>``      - the image behind ``osci.large``

    Args:
        birds: Iterable of genus names.

    Returns:
        int: Number of records processed. A record is counted even when some
        or all of its files were missing or could not be downloaded.

    Raises:
        requests.HTTPError: If the record listing for a genus cannot be
            fetched. Failures of individual media downloads are only
            reported, not raised.
    """
    count_recs = 0
    for bird in birds:
        print(f'current bird is {bird}')
        response = requests.get(
            f'https://xeno-canto.org/api/2/recordings?query=gen:{bird}+cnt:russia',
            timeout=60,
        )
        response.raise_for_status()
        recordings = response.json()['recordings']
        if not recordings:
            continue

        # Create the target folder here so the function also works when
        # make_dirs() was not called for this genus beforehand.
        bird_dir = os.path.join(data_dir, bird)
        os.makedirs(bird_dir, exist_ok=True)

        for rec in recordings:
            rec_id = rec['id']

            # NOTE(review): audio is always saved with an '.mp3' suffix, yet
            # the sample API output shows a '.wav' 'file-name'. The suffix is
            # kept so existing file names stay valid - confirm it is intended.
            if rec.get('file'):
                _save_url(rec['file'], os.path.join(bird_dir, rec_id) + '.mp3')

            images = (
                ('_sono', (rec.get('sono') or {}).get('full', '')),
                ('_osci', (rec.get('osci') or {}).get('large', '')),
            )
            for suffix, url in images:
                if not url:
                    continue
                # Keep the extension the server uses for the image.
                ext = os.path.splitext(url)[1]
                _save_url(_absolute_url(url), os.path.join(bird_dir, rec_id + suffix) + ext)

            count_recs += 1
    return count_recs

In [4]:
def get_left_df(birds):
    """Build one metadata table from the Russian recordings of every genus.

    The record lists returned by the xeno-canto API for each genus are
    concatenated, flattened with ``pd.json_normalize`` and the last seven
    columns of the flattened frame are removed.

    NOTE(review): the seven dropped columns are presumably the flattened
    ``sono.*`` and ``osci.*`` image URLs, which is what the sample record at
    the top of the notebook would produce - confirm, because the cut is
    positional and depends on the API's field layout.

    Args:
        birds: Iterable of genus names.

    Returns:
        pandas.DataFrame: One row per recording.
    """
    records = []
    for genus in birds:
        payload = requests.get(f'https://xeno-canto.org/api/2/recordings?query=gen:{genus}+cnt:russia').json()
        # Extending with an empty list is a no-op, so genera without
        # recordings simply contribute nothing.
        records.extend(payload['recordings'])
    flat = pd.json_normalize(records)
    return flat.iloc[:, :-7]

In [None]:
# Full run: find the genera, create their folders, download every recording
# (network-bound) and finally build the metadata table.
birds = get_birds_names()
make_dirs(birds)
parse_files(birds)
left_df = get_left_df(birds)

In [9]:
# Metadata only: rebuild the genus set and the metadata table without
# creating folders or downloading any media files.
birds = get_birds_names()
left_df = get_left_df(birds)

### Getting our features from audio

In [14]:
def numify(file: str, step_div):
    """Summarise an audio file as descriptive statistics of consecutive windows.

    The samples returned by ``librosa.load(file)`` are cut into windows of
    ``int(len(samples) / step_div)`` samples each, and
    ``pandas.Series.describe()`` is evaluated on every window.

    NOTE(review): ``int(step_div) - 1`` windows are produced, so the last
    full window and any remainder samples are not summarised. This matches
    the previous behaviour - confirm whether skipping that window is wanted.

    Args:
        file: Path of the audio file.
        step_div: Number of parts the signal is divided into.

    Returns:
        pandas.DataFrame: One row per window, one column per statistic
        reported by ``describe()``; the index is named ``index`` and counts
        windows from 0.

    Raises:
        ValueError: If ``step_div`` is below 2, i.e. no window would be
            produced.
    """
    values = librosa.load(file)[0]
    step = len(values) / step_div
    print(len(values), step)

    window = int(step)  # computed once instead of once per window
    n_windows = int(step_div) - 1
    if n_windows < 1:
        raise ValueError(
            f'step_div must be at least 2 to produce a window, got {step_div!r}'
        )

    # Slicing the sample array replaces the per-sample Python loop.
    frames = [
        pd.Series(values[k * window:(k + 1) * window]).describe().to_frame().T
        for k in range(n_windows)
    ]
    return pd.concat(frames, ignore_index=True).rename_axis('index')

In [15]:
def get_list_files(root: str = 'data'):
    """Return the paths of all ``.mp3`` and ``.wav`` files below ``root``.

    Args:
        root: Directory to search recursively. Defaults to ``'data'``.

    Returns:
        list[str]: Paths built as ``os.path.join(dirname, filename)``, in
        the order ``os.walk`` reports them.
    """
    # Every match is kept: the extension test already rejects stray files,
    # so skipping the first walked entry would only lose a real recording.
    return [
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith(('.mp3', '.wav'))
    ]

In [16]:
def numify_all(step_div):
    """Write a window-statistics CSV next to every audio file.

    For each path returned by ``get_list_files()`` the last four characters
    of the file name are replaced by ``.csv`` and the frame returned by
    ``numify(path, step_div)`` is saved under that name.

    Args:
        step_div: Passed through unchanged to ``numify``.
    """
    for audio_path in get_list_files():
        folder, slash, filename = audio_path.rpartition('/')
        csv_path = folder + slash + filename[:-4] + '.csv'
        numify(audio_path, step_div).to_csv(csv_path)
# Summarise every downloaded audio file, dividing each signal into 10000 parts.
numify_all(10000)

### Joining mid-term and short-term features

In [180]:
def migrate_features2csv(root: str = 'data') -> None:
    """Merge each pair of ``*wav_mt*`` / ``*wav_st*`` CSV files into one CSV.

    For every file below ``root`` whose name contains ``wav_mt`` the sibling
    file with ``wav_st`` in the same place is read as well. Columns of the
    ``st`` file are renamed ``s0, s1, ...``, columns of the ``mt`` file
    ``m0, m1, ...``; the ``mt`` columns are joined onto the ``st`` rows by
    row position and the result is written to the same name with ``wav_mt``
    replaced by ``wav``.

    NOTE(review): ``pd.read_csv`` treats the first line of each input as a
    header whose names are then overwritten. If the feature files have no
    header row, their first data row is lost - confirm the file format.

    Args:
        root: Directory to search recursively. Defaults to ``'data'``.

    Raises:
        FileNotFoundError: If an ``mt`` file has no matching ``st`` file.
    """
    for dirname, _, files in os.walk(root):
        for name in files:
            # Match and rewrite the marker in the file name only; testing the
            # whole path would misfire on any directory containing "mt".
            if 'wav_mt' not in name:
                continue
            mt_path = os.path.join(dirname, name)
            st_path = os.path.join(dirname, name.replace('wav_mt', 'wav_st'))
            out_path = os.path.join(dirname, name.replace('wav_mt', 'wav'))

            df_mt = pd.read_csv(mt_path)
            df_mt = df_mt.set_axis([f'm{i}' for i in range(df_mt.shape[-1])], axis=1)
            df_st = pd.read_csv(st_path)
            df_st = df_st.set_axis([f's{i}' for i in range(df_st.shape[-1])], axis=1)

            # Left join on the row index: every st row is kept, and mt
            # columns are empty where the mt file has fewer rows.
            df_st.join(df_mt).to_csv(out_path)

In [181]:
# Merge every mid-term / short-term feature file pair found under 'data'.
migrate_features2csv()

In [173]:
def get_birds_enum(numQ: bool = True, root: str = 'data'):
    """Map the genus folders directly under ``root`` to integer labels.

    Args:
        numQ: When True return ``{label: genus}``; when False return
            ``{genus: label}``.
        root: Directory whose immediate sub-directories are the genera.
            Defaults to ``'data'``.

    Returns:
        dict: Labels ``0..n-1`` assigned in alphabetical order of the folder
        names. Empty when ``root`` does not exist or has no sub-directories.
    """
    # Only the first os.walk() entry is needed; the default covers a missing
    # root, for which os.walk() yields nothing at all.
    _, genera, _ = next(os.walk(root), (root, [], []))
    # Sorting makes the numbering independent of the order in which the
    # filesystem happens to list the folders.
    genera = sorted(genera)
    if numQ:
        return dict(enumerate(genera))
    return {genus: label for label, genus in enumerate(genera)}

def get_all_df() -> pd.DataFrame:
    """Collect every ``*wav.csv`` feature file under ``data`` into one table.

    Each file becomes one row. The row stores the record id (file name up to
    its first dot) in column ``index``, one NumPy array per feature column
    (the column's non-missing values) and the integer label of the genus
    folder in column ``target``.

    Returns:
        pandas.DataFrame: Columns ``index``, the feature columns of the
        first file found, then ``target``. When no file is found the frame
        is empty and has only ``index`` and ``target``.

    Raises:
        KeyError: If a file lacks one of the feature columns, or if its
            genus folder is not known to ``get_birds_enum``.
    """
    root = 'data'
    csv_files = [
        os.path.join(dirname, name)
        for dirname, _, files in os.walk(root)
        for name in files
        if 'wav.csv' in name
    ]
    birds_enum = get_birds_enum(False)

    # The column layout comes from the first file actually present instead
    # of a file at a machine-specific absolute path.
    feature_columns = None
    rows = []
    for i, file in enumerate(csv_files):
        df = pd.read_csv(file, index_col=0)
        if feature_columns is None:
            feature_columns = list(df.columns)

        rec_id = os.path.basename(file).split('.')[0]
        # First path component below the root, i.e. the genus folder.
        bird = os.path.relpath(file, root).split(os.sep)[0]

        row = {'index': rec_id}
        for column in feature_columns:
            row[column] = df[column].dropna().to_numpy()
        row['target'] = birds_enum[bird]
        rows.append(row)
        print(f'File {i} {file} ready', end='\r')

    # Build the frame once from the collected rows rather than growing it
    # cell by cell.
    columns = ['index'] + (feature_columns or []) + ['target']
    return pd.DataFrame(rows, columns=columns)

In [174]:
# Assemble the per-recording feature table; progress is printed on one line.
df_all = get_all_df()

File 2514 data/Troglodytes/148474.wav.csv readyy

In [182]:
# Save the assembled table.
# NOTE(review): the feature cells hold NumPy arrays (see get_all_df), which a
# CSV can only store as text - confirm the file can be read back as needed.
df_all.to_csv('data/features.csv')

KeyboardInterrupt: 