<a href="https://colab.research.google.com/github/jazoza/mad/blob/main/03_MAD_intro_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Making Arguments with Data

An introduction to working with datasets.

## Working with the SIGID wiki radio dataset

Source: Sigid Wiki, https://www.sigidwiki.com/wiki/Signal_Identification_Guide


## Locate .pkl files in your own GDrive

In [None]:
# Colab-only: make Google Drive available under /content/drive so that files
# stored there can be read by later cells. Running this prompts for authorization.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# import external packages

import pandas as pd
import requests

## Get Radio dataset as JSON

In [None]:
# Download the radio dataset (a JSON document).
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error (404, 500, ...) right here
# instead of letting it surface later as a JSON-decoding failure.
response = requests.get(
    'https://radioexplorations.ch/study-2/data/df_fma_entropy_radio.json',
    timeout=30,
)
response.raise_for_status()

# NOTE: the name `json` is kept because later cells read it; be aware that it
# would shadow the standard-library `json` module if that were ever imported.
json = response.json()

# remove unusable data
# pop(key, None) tolerates a field that is already absent, and looping (rather
# than ending the cell on a bare pop()) stops the notebook from echoing the
# discarded field as the cell's output.
for unusable_key in ('bmus-proj', 'bmus', 'chunks', 'noise',
                     'imgs_path_high', 'imgs_path_low'):
    json.pop(unusable_key, None)

In [None]:
#@title put the dataset into a dataframe
# Each top-level key of the downloaded JSON becomes one column
# (orient='columns' is the default; it is spelled out here for clarity).
df = pd.DataFrame.from_dict(json, orient='columns')

In [None]:
#@title show the dataframe
# A bare expression on the last line of a cell is rendered as the cell's output.
df

## Examine the dataset - unique values, co-occurrences, etc.

In [None]:
# Number of distinct signal names.
# dropna=False counts a missing value as its own entry, exactly as unique() does.
df["Signal_type"].nunique(dropna=False)

In [None]:
# How many rows repeat a signal name that already appeared earlier?
# duplicated() flags every occurrence after the first; summing the flags counts them.
df["Signal_type"].duplicated(keep="first").sum()

In [None]:
# All distinct countries (Location column) in which signals have been recorded
df["Location"].unique()

In [None]:
# Recordings per country, most frequent first
df["Location"].value_counts()

In [None]:
# Occurrences of each modulation, most common first
df["Modulation"].value_counts()

In [None]:
# Modulations that appear more than once, most common first.
# The counts are computed a single time and then filtered by their own values.
modulation_counts = df["Modulation"].value_counts()
modulation_counts[modulation_counts > 1]

In [None]:
# TODO(review): placeholder cell ("pivot table js") — presumably an interactive
# pivot table of the dataframe was planned here; implement it or remove the cell.

## Importing ready-made datasets as 'pickles'

In [None]:
# pickle is used further down to load the prepared .pkl datasets
import pickle 
# Print the current directory, switch to the mounted Drive folder, and list its
# contents so you can find where your dataset files are stored.
! pwd #check working directory, find where your files are
%cd /content/drive/MyDrive/
!ls

In [None]:
# Directory that holds the pickled datasets — change it to match where your files are.
folder_path = "/content/drive/MyDrive/datasets-SOM/"

In [None]:
radio_data_path = folder_path+'radio.pkl'
fma_data_path = folder_path+'fma.pkl'
xenocanto_data_path = folder_path+'xenocanto.pkl'


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (or fails) instead of being left open.

    SECURITY: unpickling can execute arbitrary code — only load .pkl files
    that come from a source you trust.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


radio_data = _load_pickle(radio_data_path)
# NOTE(review): an earlier scratch comment here held the regex /\.\/(\w*?)\//g —
# presumably for extracting the folder name from a sample path; it is unused.
data_length = len(radio_data['features'])

# Label every recording by whether its audio path contains './known/'.
# The labels are built as one complete list and assigned in a single step:
# this works whether radio_data is a dict or a DataFrame, and avoids
# element-by-element chained assignment (radio_data['known'][i] = ...), which
# pandas may apply to a temporary copy and silently discard.
# assumes 'Sample_audio' has one entry per 'features' entry (data_length) — TODO confirm
radio_data['known'] = [
    'known' if './known/' in path else 'unknown'
    for path in radio_data['Sample_audio']
]

fma_data = _load_pickle(fma_data_path)

xenocanto_data = _load_pickle(xenocanto_data_path)

## Working with the Free Music Archive (FMA)

Source: https://github.com/mdeff/fma 

In [None]:
# Display the object that was loaded from fma.pkl
fma_data

## Working with XenoCanto

Source: https://gist.github.com/rhine3/4829bf66381c7aa05c1f656cec4fa040   


In [None]:
# Display the object that was loaded from xenocanto.pkl
xenocanto_data

In [None]:
# Which countries do most recordings come from?
xenocanto_data["cnt"].value_counts()

In [None]:
# Which recordists record mostly in which countries?
# Counts each (cnt, rec) combination, most frequent pair first.
(
    xenocanto_data
    .filter(items=["cnt", "rec"])
    .value_counts()
)

In [None]:
!